Mirror of https://github.com/yacy/yacy_search_server.git (synced 2024-09-19 00:01:41 +02:00)
- added autoReCrawl folders to bookmarks (DATA/SETTINGS/autoReCrawl.conf)
- the serverBusyThread checks these folders every 60 minutes (see autoReCrawl_idlesleep in yacy.conf)
- added an option to create a bookmark from the crawl start URL

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5033 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
parent ebb40d324b
commit e1574fe02e
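The busy thread's check interval and memory prerequisite are read from yacy.conf. A minimal sketch of the two entries, assuming the key names and default values used by the bookmarksDB constructor in this commit (interval in milliseconds):

    # check the autoReCrawl bookmark folders once per hour
    autoReCrawl_idlesleep=3600000
    # no memory prerequisite for the autoReCrawl busy thread
    autoReCrawl_memprereq=-1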
defaults/autoReCrawl.conf (new file, 8 lines)
@@ -0,0 +1,8 @@
# YaCy autoReCrawl configuration for bookmark folders
#
# schedule|folder|filter|crawlingdepth|crawlingIfOlder|DomFilterDepth|DomMaxPages|crawlingQ|indexText|indexMedia|crawlOrder|xsstopw|storeHTCache
3600000 /autoReCrawl/hourly .* 1 59 -1 -1 1 1 1 1 0 0
86400000 /autoReCrawl/daily .* 3 1439 -1 -1 1 1 1 1 0 0
604800000 /autoReCrawl/weekly .* 3 10079 -1 -1 1 1 1 1 0 0
2678400000 /autoReCrawl/monthly .* 4 44639 -1 -1 1 1 1 1 0 0
# eof
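bookmarksDB.autoReCrawl() (shown later in this diff) splits every non-comment line of this file on tab characters and expects exactly 13 fields, so additional folders can be scheduled by appending a line in the same format. A hypothetical fortnightly entry, for illustration only (schedule in milliseconds, crawlingIfOlder in minutes; fields must be tab-separated in the real file):

    1209600000 /autoReCrawl/fortnightly .* 3 20159 -1 -1 1 1 1 1 0 0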
@@ -134,7 +134,7 @@ public class Bookmarks {
final String pathString = post.get("path");
tagsString=tagsString+","+pathString;
if(tagsString.equals("")){
-    tagsString="unsorted"; //default tag
+    tagsString="/unsorted"; //default tag
}
final Set<String> tags=listManager.string2set(bookmarksDB.cleanTagsString(tagsString));
final bookmarksDB.Bookmark bookmark = sb.bookmarksDB.createBookmark(url, username);
@@ -56,6 +56,25 @@
Existing start URLs are always re-crawled.
Other already visited URLs are sorted out as "double", if they are not allowed using the re-crawl option.
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td>Create Bookmark</td>
<td>
<label for="createBookmark">Use</label>:
<input type="checkbox" name="createBookmark" id="createBookmark" />
<label for="bookmarkFolder"> Bookmark Folder</label>:
<input name="bookmarkFolder" id="bookmarkFolder" type="text" size="20" maxlength="100" value="/crawlStart" /><br />
<br/><br/>This option works with "Starting Point: From URL" only!
</td>
<td>
This option lets you create a bookmark from your crawl start URL. For automatic re-crawling you can use the following default folders:<br/>
<ul>
<li>/autoReCrawl/hourly</li>
<li>/autoReCrawl/daily</li>
<li>/autoReCrawl/weekly</li>
<li>/autoReCrawl/monthly</li>
</ul>
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td><label for="crawlingDepth">Crawling Depth</label>:</td>
@@ -32,6 +32,7 @@ import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

@@ -40,6 +41,8 @@ import de.anomic.crawler.CrawlEntry;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.SitemapImporter;
import de.anomic.crawler.ZURL;
import de.anomic.data.bookmarksDB;
import de.anomic.data.listManager;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterWriter;
import de.anomic.http.httpHeader;
@@ -207,6 +210,19 @@ public class WatchCrawler_p {
final String reasonString = sb.crawlStacker.stackCrawl(url, null, sb.webIndex.seedDB.mySeed().hash, "CRAWLING-ROOT", new Date(), 0, pe);

if (reasonString == null) {
// create a bookmark from crawl start url
Set<String> tags=listManager.string2set(bookmarksDB.cleanTagsString(post.get("bookmarkFolder","/crawlStart")));
tags.add("crawlStart");
if (post.get("createBookmark","off").equals("on")) {
bookmarksDB.Bookmark bookmark = sb.bookmarksDB.createBookmark(crawlingStart, "admin");
if(bookmark != null){
bookmark.setProperty(bookmarksDB.Bookmark.BOOKMARK_TITLE, crawlingStart);
bookmark.setOwner("admin");
bookmark.setPublic(false);
bookmark.setTags(tags, true);
sb.bookmarksDB.saveBookmark(bookmark);
}
}
// liftoff!
prop.put("info", "8");//start msg
prop.putHTML("info_crawlingURL", (post.get("crawlingURL")));
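For illustration, a crawl start request that triggers the bookmark creation above would carry form fields roughly like the following (field names come from the HTML form earlier in this diff; the URL and folder values are placeholders):

    crawlingURL=http://example.net/
    createBookmark=on
    bookmarkFolder=/autoReCrawl/daily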
@@ -227,8 +243,7 @@ public class WatchCrawler_p {
m.remove("specificFilter");
m.put("intention", post.get("intention", "").replace(',', '/'));
sb.webIndex.newsPool.publishMyNews(yacyNewsRecord.newRecord(sb.webIndex.seedDB.mySeed(), yacyNewsPool.CATEGORY_CRAWL_START, m));
}

}
} else {
prop.put("info", "5"); //Crawling failed
prop.putHTML("info_crawlingURL", (post.get("crawlingURL")));
@@ -23,8 +23,13 @@

package de.anomic.data;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
@@ -54,6 +59,9 @@ import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

import de.anomic.crawler.CrawlEntry;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.ZURL;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterWriter;
import de.anomic.index.indexWord;
@@ -62,9 +70,14 @@ import de.anomic.kelondro.kelondroCloneableIterator;
import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroMap;
import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverBusyThread;
import de.anomic.server.serverDate;
import de.anomic.server.serverFileUtils;
import de.anomic.server.serverInstantBusyThread;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacyNewsPool;
import de.anomic.yacy.yacyNewsRecord;
import de.anomic.yacy.yacyURL;

public class bookmarksDB {
@@ -75,6 +88,7 @@ public class bookmarksDB {
final static int SORT_ALPHA = 1;
final static int SORT_SIZE = 2;
final static int SHOW_ALL = -1;
final static String SLEEP_TIME = "3600000"; // default sleepTime: check for recrawls every hour

// bookmarks
kelondroMap bookmarksTable; // kelondroMap bookmarksTable;
@@ -85,7 +99,9 @@ public class bookmarksDB {

// dates
kelondroMap datesTable;


// autoReCrawl
private serverBusyThread autoReCrawl;

// ------------------------------------
// bookmarksDB's class constructor
@@ -109,6 +125,14 @@ public class bookmarksDB {
this.datesTable = new kelondroMap(new kelondroBLOBTree(datesFile, true, true, 20, 256, '_', kelondroNaturalOrder.naturalOrder, true, false, false), 500);
if (!datesExisted) rebuildDates();

// autoReCrawl
plasmaSwitchboard sb = plasmaSwitchboard.getSwitchboard();
this.autoReCrawl = new serverInstantBusyThread(this, "autoReCrawl", null, null);
long sleepTime = Long.parseLong(sb.getConfig("autoReCrawl_idlesleep" , SLEEP_TIME));
sb.deployThread("autoReCrawl", "autoReCrawl Scheduler", "simple scheduler for automatic re-crawls of bookmarked urls", null, autoReCrawl, -1,
sleepTime, sleepTime, Long.parseLong(sb.getConfig("autoReCrawl_memprereq" , "-1"))
);
serverLog.logInfo("BOOKMARKS", "autoReCrawl - thread initialized checking every "+(sleepTime/1000/60)+" minutes for recrawls");
}

// -----------------------------------------------------
@@ -122,6 +146,153 @@ public class bookmarksDB {
datesTable.close();
}

// -----------------------------------------------------
// bookmarksDB's functions for autoReCrawl
// -----------------------------------------------------

public boolean autoReCrawl() {

// read crontab
File f = new File (plasmaSwitchboard.getSwitchboard().getRootPath(),"DATA/SETTINGS/autoReCrawl.conf");
String s;
try {
BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(f)));
serverLog.logInfo("BOOKMARKS", "autoReCrawl - reading schedules from " + f);
while( null != (s = in.readLine()) ) {
if (!s.startsWith("#") && s.length()>0) {
String parser[] = s.split("\t");
if (parser.length == 13) {
folderReCrawl(Long.parseLong(parser[0]), parser[1], parser[2], Integer.parseInt(parser[3]), Integer.parseInt(parser[4]),
Integer.parseInt(parser[5]), Integer.parseInt(parser[6]), Boolean.parseBoolean(parser[7]),
Boolean.parseBoolean(parser[8]), Boolean.parseBoolean(parser[9]),
Boolean.parseBoolean(parser[10]), Boolean.parseBoolean(parser[11]),
Boolean.parseBoolean(parser[12])
);
}
}
}
in.close();
} catch( FileNotFoundException ex ) {
try {
serverLog.logInfo("BOOKMARKS", "autoReCrawl - creating new autoReCrawl.conf");
File inputFile = new File(plasmaSwitchboard.getSwitchboard().getRootPath(),"defaults/autoReCrawl.conf");
File outputFile = new File(plasmaSwitchboard.getSwitchboard().getRootPath(),"DATA/SETTINGS/autoReCrawl.conf");
FileReader i = new FileReader(inputFile);
FileWriter o = new FileWriter(outputFile);
int c;
while ((c = i.read()) != -1)
o.write(c);
i.close();
o.close();
autoReCrawl();
return true;
} catch( FileNotFoundException e ) {
serverLog.logSevere("BOOKMARKS", "autoReCrawl - file not found error: defaults/autoReCrawl.conf", e);
return false;
} catch (IOException e) {
serverLog.logSevere("BOOKMARKS", "autoReCrawl - IOException: defaults/autoReCrawl.conf", e);
return false;
}
} catch( Exception ex ) {
serverLog.logSevere("BOOKMARKS", "autoReCrawl - error reading " + f, ex);
return false;
}
return true;
}

public void folderReCrawl (long schedule, String folder, String newcrawlingfilter, int newcrawlingdepth, int crawlingIfOlder,
int crawlingDomFilterDepth, int crawlingDomMaxPages, boolean crawlingQ, boolean indexText, boolean indexMedia,
boolean crawlOrder, boolean xsstopw, boolean storeHTCache) {

plasmaSwitchboard sb = plasmaSwitchboard.getSwitchboard();
Iterator<String> bit=getBookmarksIterator(folder, true);
serverLog.logInfo("BOOKMARKS", "autoReCrawl - processing: "+folder);

boolean xdstopw = xsstopw;
boolean xpstopw = xsstopw;

while(bit.hasNext()) {

Bookmark bm = getBookmark(bit.next());
long sleepTime = Long.parseLong(sb.getConfig("autoReCrawl_idlesleep" , SLEEP_TIME));
long interTime = (System.currentTimeMillis()-bm.getTimeStamp())%schedule;

Date date=new Date(bm.getTimeStamp());
serverLog.logInfo("BOOKMARKS", "autoReCrawl - checking schedule for: "+"["+serverDate.formatISO8601(date)+"] "+bm.getUrl());

if (interTime >= 0 && interTime < sleepTime) {
try {
// check if the crawl filter works correctly
Pattern.compile(newcrawlingfilter);

// set crawlingStart to BookmarkUrl
String crawlingStart = bm.getUrl();

// stack request
// first delete old entry, if exists
yacyURL crawlingStartURL = new yacyURL(crawlingStart, null);
String urlhash = crawlingStartURL.hash();
sb.webIndex.removeURL(urlhash);
sb.crawlQueues.noticeURL.removeByURLHash(urlhash);
sb.crawlQueues.errorURL.remove(urlhash);

// stack url
sb.webIndex.profilesPassiveCrawls.removeEntry(crawlingStartURL.hash()); // if there is an old entry, delete it
CrawlProfile.entry pe = sb.webIndex.profilesActiveCrawls.newEntry(
"autoReCrawl", crawlingStartURL, newcrawlingfilter, newcrawlingfilter,
newcrawlingdepth, newcrawlingdepth,
crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages,
crawlingQ,
indexText, indexMedia,
storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw);
String reasonString = sb.crawlStacker.stackCrawl(crawlingStartURL, null, sb.webIndex.seedDB.mySeed().hash, "CRAWLING-ROOT", new Date(), 0, pe);

if (reasonString == null) {
serverLog.logInfo("BOOKMARKS", "autoReCrawl - adding crawl profile for: " + crawlingStart);
// generate a YaCyNews if the global flag was set
if (crawlOrder) {
Map<String, String> m = new HashMap<String, String>(pe.map()); // must be cloned
m.remove("specificDepth");
m.remove("indexText");
m.remove("indexMedia");
m.remove("remoteIndexing");
m.remove("xsstopw");
m.remove("xpstopw");
m.remove("xdstopw");
m.remove("storeTXCache");
m.remove("storeHTCache");
m.remove("generalFilter");
m.remove("specificFilter");
m.put("intention", "Automatic ReCrawl!");
sb.webIndex.newsPool.publishMyNews(yacyNewsRecord.newRecord(sb.webIndex.seedDB.mySeed(), yacyNewsPool.CATEGORY_CRAWL_START, m));
}
} else {
serverLog.logInfo("BOOKMARKS", "autoReCrawl error adding crawl profile: " + crawlingStart + "- " + reasonString);
ZURL.Entry ee = sb.crawlQueues.errorURL.newEntry(
new CrawlEntry(
sb.webIndex.seedDB.mySeed().hash,
crawlingStartURL,
"",
"",
new Date(),
pe.handle(),
0,
0,
0),
sb.webIndex.seedDB.mySeed().hash,
new Date(),
1,
reasonString);

ee.store();
sb.crawlQueues.errorURL.push(ee);
}
} catch (MalformedURLException e1) {}
} // if
} // while(bit.hasNext())
return;
} // folderReCrawl()
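The due-check above is the heart of the scheduler: a bookmark is re-crawled when the time elapsed since its timestamp, taken modulo the folder's schedule, falls inside one busy-thread sleep interval. A minimal standalone sketch of that check (not part of this commit; class and method names are invented for illustration):

    import java.util.concurrent.TimeUnit;

    // Illustrative only: mirrors the interTime/sleepTime comparison in folderReCrawl().
    public class ReCrawlDueCheck {

        // true if the bookmark falls into the current re-crawl window
        static boolean isDue(long bookmarkTimeStamp, long scheduleMillis, long sleepTimeMillis) {
            long interTime = (System.currentTimeMillis() - bookmarkTimeStamp) % scheduleMillis;
            return interTime >= 0 && interTime < sleepTimeMillis;
        }

        public static void main(String[] args) {
            long hour = TimeUnit.HOURS.toMillis(1);  // busy thread wakes up hourly
            long day = TimeUnit.DAYS.toMillis(1);    // schedule of /autoReCrawl/daily
            long stampedTwoDaysAgo = System.currentTimeMillis() - 2 * day;
            // elapsed time is an exact multiple of the schedule, so the bookmark is due
            System.out.println(isDue(stampedTwoDaysAgo, day, hour)); // prints true
        }
    }

Because the length of the due window equals the busy thread's sleep interval, each folder is normally picked up once per schedule period even though the thread wakes up far more often.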

// -------------------------------------
// bookmarksDB's public helper functions
// -------------------------------------