- added autoReCrawl folders to bookmarks (DATA/SETTINGS/autoReCrawl.conf)

- the serverBusyThread checks folders every 60 min. (==> autoReCrawl_idlesleep in yacy.conf)
- added option to create bookmarks from CrawlStart URL

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5033 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
apfelmaennchen 2008-08-04 20:43:36 +00:00
parent ebb40d324b
commit e1574fe02e
5 changed files with 217 additions and 4 deletions

View File

@ -0,0 +1,8 @@
# YaCy autoReCrawl configuration for bookmark folders
#
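# One schedule per line. The 13 fields must be separated by single TAB characters (shown below with '|' for readability).
# 'schedule' is the re-crawl interval in milliseconds.
# schedule|folder|filter|crawlingdepth|crawlingIfOlder|DomFilterDepth|DomMaxPages|crawlingQ|indexText|indexMedia|crawlOrder|xsstopw|storeHTCache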
3600000 /autoReCrawl/hourly .* 1 59 -1 -1 1 1 1 1 0 0
86400000 /autoReCrawl/daily .* 3 1439 -1 -1 1 1 1 1 0 0
604800000 /autoReCrawl/weekly .* 3 10079 -1 -1 1 1 1 1 0 0
2678400000 /autoReCrawl/monthly .* 4 44639 -1 -1 1 1 1 1 0 0
# eof

View File

@ -134,7 +134,7 @@ public class Bookmarks {
final String pathString = post.get("path");
tagsString=tagsString+","+pathString;
if(tagsString.equals("")){
tagsString="unsorted"; //default tag
tagsString="/unsorted"; //default tag
}
final Set<String> tags=listManager.string2set(bookmarksDB.cleanTagsString(tagsString));
final bookmarksDB.Bookmark bookmark = sb.bookmarksDB.createBookmark(url, username);

View File

@ -56,6 +56,25 @@
Existing start URLs are always re-crawled.
Other already visited URLs are sorted out as "double", if they are not allowed using the re-crawl option.
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td>Create Bookmark</td>
<td>
<label for="createBookmark">Use</label>:
<input type="checkbox" name="createBookmark" id="createBookmark" />&nbsp;&nbsp;&nbsp;
<label for="bookmarkFolder"> Bookmark Folder</label>:
<input name="bookmarkFolder" id="bookmarkFolder" type="text" size="20" maxlength="100" value="/crawlStart" /><br />
<br/><br/>This option works with "Starting Point: From URL" only!
</td>
<td>
This option lets you create a bookmark from your crawl start URL. For automatic re-crawling you can use the following default folders:<br/>
<ul>
<li>/autoReCrawl/hourly</li>
<li>/autoReCrawl/daily</li>
<li>/autoReCrawl/weekly</li>
<li>/autoReCrawl/monthly</li>
</ul>
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td><label for="crawlingDepth">Crawling Depth</label>:</td>

View File

@ -32,6 +32,7 @@ import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
@ -40,6 +41,8 @@ import de.anomic.crawler.CrawlEntry;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.SitemapImporter;
import de.anomic.crawler.ZURL;
import de.anomic.data.bookmarksDB;
import de.anomic.data.listManager;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterWriter;
import de.anomic.http.httpHeader;
@ -207,6 +210,19 @@ public class WatchCrawler_p {
final String reasonString = sb.crawlStacker.stackCrawl(url, null, sb.webIndex.seedDB.mySeed().hash, "CRAWLING-ROOT", new Date(), 0, pe);
if (reasonString == null) {
// create a bookmark from crawl start url
Set<String> tags=listManager.string2set(bookmarksDB.cleanTagsString(post.get("bookmarkFolder","/crawlStart")));
tags.add("crawlStart");
if (post.get("createBookmark","off").equals("on")) {
bookmarksDB.Bookmark bookmark = sb.bookmarksDB.createBookmark(crawlingStart, "admin");
if(bookmark != null){
bookmark.setProperty(bookmarksDB.Bookmark.BOOKMARK_TITLE, crawlingStart);
bookmark.setOwner("admin");
bookmark.setPublic(false);
bookmark.setTags(tags, true);
sb.bookmarksDB.saveBookmark(bookmark);
}
}
// liftoff!
prop.put("info", "8");//start msg
prop.putHTML("info_crawlingURL", (post.get("crawlingURL")));
@ -227,8 +243,7 @@ public class WatchCrawler_p {
m.remove("specificFilter");
m.put("intention", post.get("intention", "").replace(',', '/'));
sb.webIndex.newsPool.publishMyNews(yacyNewsRecord.newRecord(sb.webIndex.seedDB.mySeed(), yacyNewsPool.CATEGORY_CRAWL_START, m));
}
}
} else {
prop.put("info", "5"); //Crawling failed
prop.putHTML("info_crawlingURL", (post.get("crawlingURL")));

View File

@ -23,8 +23,13 @@
package de.anomic.data;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
@ -54,6 +59,9 @@ import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
import de.anomic.crawler.CrawlEntry;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.ZURL;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterWriter;
import de.anomic.index.indexWord;
@ -62,9 +70,14 @@ import de.anomic.kelondro.kelondroCloneableIterator;
import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroMap;
import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverBusyThread;
import de.anomic.server.serverDate;
import de.anomic.server.serverFileUtils;
import de.anomic.server.serverInstantBusyThread;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacyNewsPool;
import de.anomic.yacy.yacyNewsRecord;
import de.anomic.yacy.yacyURL;
public class bookmarksDB {
@ -75,6 +88,7 @@ public class bookmarksDB {
final static int SORT_ALPHA = 1;
final static int SORT_SIZE = 2;
final static int SHOW_ALL = -1;
final static String SLEEP_TIME = "3600000"; // default sleepTime: check for recrawls every hour
// bookmarks
kelondroMap bookmarksTable; // kelondroMap bookmarksTable;
@ -85,7 +99,9 @@ public class bookmarksDB {
// dates
kelondroMap datesTable;
// autoReCrawl
private serverBusyThread autoReCrawl;
// ------------------------------------
// bookmarksDB's class constructor
@ -109,6 +125,14 @@ public class bookmarksDB {
this.datesTable = new kelondroMap(new kelondroBLOBTree(datesFile, true, true, 20, 256, '_', kelondroNaturalOrder.naturalOrder, true, false, false), 500);
if (!datesExisted) rebuildDates();
// autoReCrawl
plasmaSwitchboard sb = plasmaSwitchboard.getSwitchboard();
this.autoReCrawl = new serverInstantBusyThread(this, "autoReCrawl", null, null);
long sleepTime = Long.parseLong(sb.getConfig("autoReCrawl_idlesleep" , SLEEP_TIME));
sb.deployThread("autoReCrawl", "autoReCrawl Scheduler", "simple scheduler for automatic re-crawls of bookmarked urls", null, autoReCrawl, -1,
sleepTime, sleepTime, Long.parseLong(sb.getConfig("autoReCrawl_memprereq" , "-1"))
);
serverLog.logInfo("BOOKMARKS", "autoReCrawl - thread initialized checking every "+(sleepTime/1000/60)+" minutes for recrawls");
}
// -----------------------------------------------------
@ -122,6 +146,153 @@ public class bookmarksDB {
datesTable.close();
}
// -----------------------------------------------------
// bookmarksDB's functions for autoReCrawl
// -----------------------------------------------------
/**
 * Job method of the autoReCrawl scheduler thread: reads
 * DATA/SETTINGS/autoReCrawl.conf and calls {@link #folderReCrawl} once for
 * every well-formed schedule line.
 *
 * If the configuration file does not exist yet, it is created as a copy of
 * defaults/autoReCrawl.conf and then read once.
 *
 * @return true if the configuration was read completely, false if it could
 *         not be read, created or parsed
 */
public boolean autoReCrawl() {
    // read crontab
    final File f = new File(plasmaSwitchboard.getSwitchboard().getRootPath(), "DATA/SETTINGS/autoReCrawl.conf");
    return readAutoReCrawlConf(f, true);
}

/**
 * Reads the schedule table from f and triggers folderReCrawl for each line
 * that is neither empty nor a '#' comment and has exactly 13 TAB-separated fields.
 *
 * @param f               the autoReCrawl configuration file
 * @param createIfMissing if true, a missing file is created from the defaults
 *                        and read again (exactly once, so this cannot loop)
 * @return true on success, false on any error
 */
private boolean readAutoReCrawlConf(final File f, final boolean createIfMissing) {
    BufferedReader in = null;
    try {
        in = new BufferedReader(new InputStreamReader(new FileInputStream(f)));
        serverLog.logInfo("BOOKMARKS", "autoReCrawl - reading schedules from " + f);
        String s;
        while (null != (s = in.readLine())) {
            if (s.length() == 0 || s.startsWith("#")) {
                continue; // blank line or comment
            }
            final String[] parser = s.split("\t");
            if (parser.length != 13) {
                continue; // not a complete schedule line
            }
            folderReCrawl(Long.parseLong(parser[0]), parser[1], parser[2],
                    Integer.parseInt(parser[3]), Integer.parseInt(parser[4]),
                    Integer.parseInt(parser[5]), Integer.parseInt(parser[6]),
                    parseAutoReCrawlFlag(parser[7]), parseAutoReCrawlFlag(parser[8]),
                    parseAutoReCrawlFlag(parser[9]), parseAutoReCrawlFlag(parser[10]),
                    parseAutoReCrawlFlag(parser[11]), parseAutoReCrawlFlag(parser[12]));
        }
        return true;
    } catch (FileNotFoundException ex) {
        if (!createIfMissing) {
            serverLog.logSevere("BOOKMARKS", "autoReCrawl - file not found error: " + f, ex);
            return false;
        }
        // otherwise fall through and create the file from the defaults
    } catch (Exception ex) {
        serverLog.logSevere("BOOKMARKS", "autoReCrawl - error reading " + f, ex);
        return false;
    } finally {
        // always release the file handle, also when a line failed to parse
        if (in != null) {
            try {
                in.close();
            } catch (IOException ignored) {
                // nothing useful can be done if closing a reader fails
            }
        }
    }
    // propagate the result of the second attempt instead of reporting success blindly
    return createDefaultAutoReCrawlConf(f) && readAutoReCrawlConf(f, false);
}

/**
 * Copies defaults/autoReCrawl.conf to outputFile.
 *
 * @param outputFile the configuration file to create
 * @return true if the copy succeeded, false otherwise (the error is logged)
 */
private boolean createDefaultAutoReCrawlConf(final File outputFile) {
    serverLog.logInfo("BOOKMARKS", "autoReCrawl - creating new autoReCrawl.conf");
    final File inputFile = new File(plasmaSwitchboard.getSwitchboard().getRootPath(), "defaults/autoReCrawl.conf");
    FileReader i = null;
    FileWriter o = null;
    try {
        i = new FileReader(inputFile);
        o = new FileWriter(outputFile);
        int c;
        while ((c = i.read()) != -1) {
            o.write(c);
        }
        o.flush(); // surface write errors here, where they are still reported
        return true;
    } catch (FileNotFoundException e) {
        serverLog.logSevere("BOOKMARKS", "autoReCrawl - file not found error: defaults/autoReCrawl.conf", e);
        return false;
    } catch (IOException e) {
        serverLog.logSevere("BOOKMARKS", "autoReCrawl - IOException: defaults/autoReCrawl.conf", e);
        return false;
    } finally {
        if (i != null) {
            try {
                i.close();
            } catch (IOException ignored) {
                // nothing useful can be done if closing a reader fails
            }
        }
        if (o != null) {
            try {
                o.close();
            } catch (IOException ignored) {
                // content was already flushed above
            }
        }
    }
}

/**
 * Parses a boolean column of autoReCrawl.conf. Accepts "1" in addition to
 * "true" (case-insensitive), because the default configuration writes its
 * flags as 1/0, which Boolean.parseBoolean would always read as false.
 *
 * @param value the raw column text
 * @return true for "1" or "true", false for anything else
 */
private static boolean parseAutoReCrawlFlag(final String value) {
    final String v = value.trim();
    return v.equals("1") || v.equalsIgnoreCase("true");
}
/**
 * Re-crawls the bookmarks of one bookmark folder that are due according to
 * the given schedule.
 *
 * A bookmark is due when the time elapsed since its timestamp, taken modulo
 * the schedule interval, falls inside one scheduler cycle
 * (config key autoReCrawl_idlesleep). For each due bookmark the old index,
 * queue and error entries of its URL are removed, a new "autoReCrawl" crawl
 * profile is created and the URL is stacked as crawl root.
 *
 * A non-positive schedule or an invalid filter expression is logged and the
 * folder is skipped, so that other folders are still processed. A bookmark
 * with a malformed URL is logged and skipped.
 *
 * @param schedule               re-crawl interval in milliseconds
 * @param folder                 bookmark folder (tag) whose bookmarks are checked
 * @param newcrawlingfilter      regular expression used as general and specific crawl filter
 * @param newcrawlingdepth       used as general and specific crawl depth
 * @param crawlingIfOlder        passed through to the crawl profile
 * @param crawlingDomFilterDepth passed through to the crawl profile
 * @param crawlingDomMaxPages    passed through to the crawl profile
 * @param crawlingQ              passed through to the crawl profile
 * @param indexText              passed through to the crawl profile
 * @param indexMedia             passed through to the crawl profile
 * @param crawlOrder             passed through to the crawl profile; if true, a crawl-start news record is published
 * @param xsstopw                used for all three stop-word flags of the profile (xsstopw, xdstopw, xpstopw)
 * @param storeHTCache           passed through to the crawl profile
 */
public void folderReCrawl(long schedule, String folder, String newcrawlingfilter, int newcrawlingdepth, int crawlingIfOlder,
        int crawlingDomFilterDepth, int crawlingDomMaxPages, boolean crawlingQ, boolean indexText, boolean indexMedia,
        boolean crawlOrder, boolean xsstopw, boolean storeHTCache) {

    final plasmaSwitchboard sb = plasmaSwitchboard.getSwitchboard();
    final Iterator<String> bit = getBookmarksIterator(folder, true);
    serverLog.logInfo("BOOKMARKS", "autoReCrawl - processing: " + folder);

    // a zero interval would raise an ArithmeticException in the modulo below
    if (schedule <= 0) {
        serverLog.logInfo("BOOKMARKS", "autoReCrawl - ignoring folder " + folder + ": invalid schedule " + schedule);
        return;
    }

    // check once if the crawl filter works correctly
    // (PatternSyntaxException is an IllegalArgumentException)
    try {
        Pattern.compile(newcrawlingfilter);
    } catch (IllegalArgumentException e) {
        serverLog.logSevere("BOOKMARKS", "autoReCrawl - ignoring folder " + folder + ": invalid crawl filter " + newcrawlingfilter, e);
        return;
    }

    final boolean xdstopw = xsstopw;
    final boolean xpstopw = xsstopw;
    // length of one scheduler cycle; the same for every bookmark
    final long sleepTime = Long.parseLong(sb.getConfig("autoReCrawl_idlesleep", SLEEP_TIME));

    while (bit.hasNext()) {
        final Bookmark bm = getBookmark(bit.next());
        if (bm == null) {
            continue; // entry could not be loaded
        }

        final long interTime = (System.currentTimeMillis() - bm.getTimeStamp()) % schedule;
        final Date date = new Date(bm.getTimeStamp());
        serverLog.logInfo("BOOKMARKS", "autoReCrawl - checking schedule for: " + "[" + serverDate.formatISO8601(date) + "] " + bm.getUrl());

        // due only if the interval boundary was passed within the last scheduler cycle
        if (interTime < 0 || interTime >= sleepTime) {
            continue;
        }

        // set crawlingStart to BookmarkUrl
        final String crawlingStart = bm.getUrl();
        try {
            // stack request
            // first delete old entry, if exists
            final yacyURL crawlingStartURL = new yacyURL(crawlingStart, null);
            final String urlhash = crawlingStartURL.hash();
            sb.webIndex.removeURL(urlhash);
            sb.crawlQueues.noticeURL.removeByURLHash(urlhash);
            sb.crawlQueues.errorURL.remove(urlhash);

            // stack url
            sb.webIndex.profilesPassiveCrawls.removeEntry(crawlingStartURL.hash()); // if there is an old entry, delete it
            final CrawlProfile.entry pe = sb.webIndex.profilesActiveCrawls.newEntry(
                    "autoReCrawl", crawlingStartURL, newcrawlingfilter, newcrawlingfilter,
                    newcrawlingdepth, newcrawlingdepth,
                    crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages,
                    crawlingQ,
                    indexText, indexMedia,
                    storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw);
            final String reasonString = sb.crawlStacker.stackCrawl(crawlingStartURL, null, sb.webIndex.seedDB.mySeed().hash, "CRAWLING-ROOT", new Date(), 0, pe);

            if (reasonString == null) {
                serverLog.logInfo("BOOKMARKS", "autoReCrawl - adding crawl profile for: " + crawlingStart);
                // generate a YaCyNews if the global flag was set
                if (crawlOrder) {
                    final Map<String, String> m = new HashMap<String, String>(pe.map()); // must be cloned
                    m.remove("specificDepth");
                    m.remove("indexText");
                    m.remove("indexMedia");
                    m.remove("remoteIndexing");
                    m.remove("xsstopw");
                    m.remove("xpstopw");
                    m.remove("xdstopw");
                    m.remove("storeTXCache");
                    m.remove("storeHTCache");
                    m.remove("generalFilter");
                    m.remove("specificFilter");
                    m.put("intention", "Automatic ReCrawl!");
                    sb.webIndex.newsPool.publishMyNews(yacyNewsRecord.newRecord(sb.webIndex.seedDB.mySeed(), yacyNewsPool.CATEGORY_CRAWL_START, m));
                }
            } else {
                // stacking was rejected: record the reason in the error queue
                serverLog.logInfo("BOOKMARKS", "autoReCrawl error adding crawl profile: " + crawlingStart + "- " + reasonString);
                final ZURL.Entry ee = sb.crawlQueues.errorURL.newEntry(
                        new CrawlEntry(
                                sb.webIndex.seedDB.mySeed().hash,
                                crawlingStartURL,
                                "",
                                "",
                                new Date(),
                                pe.handle(),
                                0,
                                0,
                                0),
                        sb.webIndex.seedDB.mySeed().hash,
                        new Date(),
                        1,
                        reasonString);
                ee.store();
                sb.crawlQueues.errorURL.push(ee);
            }
        } catch (MalformedURLException e1) {
            // do not drop the bookmark silently; continue with the next one
            serverLog.logSevere("BOOKMARKS", "autoReCrawl - malformed bookmark url: " + crawlingStart, e1);
        }
    } // while(bit.hasNext())
} // folderReCrawl()
// -------------------------------------
// bookmarksDB's public helper functions
// -------------------------------------