Mirror of https://github.com/yacy/yacy_search_server.git
enhancement to crawling and remote crawling:

- for redirector and remote crawling, place the crawling URL on the notice queue instead of enqueueing it directly in the crawler queue
- when a request to a remote crawl provider fails, remove the peer from the network to prevent the URL fetcher from getting stuck on it again

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5320 6c8d7289-2bf4-0310-a012-ef5d649a1542
parent 3f746be5d4
commit 1b18d4bcf3
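The first change in the diff below is easiest to read as a producer/consumer split: callers stop invoking stackCrawl(...) synchronously and inspecting the returned reason string, and instead drop the URL onto a notice queue that the stacker thread drains. The following is a minimal, self-contained sketch of that pattern, not YaCy's actual classes: Entry, enqueueEntry, and runStacker are illustrative stand-ins for CrawlEntry, CrawlStacker.enqueueEntry, and the CrawlStacker thread.

    import java.util.concurrent.BlockingQueue;
    import java.util.concurrent.LinkedBlockingQueue;

    // Toy model of the notice-queue pattern adopted by this commit.
    public class NoticeQueueSketch {

        record Entry(String url, String initiator) {} // stand-in for CrawlEntry

        private final BlockingQueue<Entry> noticeQueue = new LinkedBlockingQueue<>();

        // producer side (what rct_p and the redirector now call)
        public void enqueueEntry(String url, String initiator) {
            noticeQueue.offer(new Entry(url, initiator));
        }

        // consumer side (what the stacker thread does asynchronously)
        public void runStacker() throws InterruptedException {
            while (!Thread.currentThread().isInterrupted()) {
                Entry e = noticeQueue.take();
                String reason = stackCrawl(e); // null means the URL was accepted
                if (reason != null) {
                    System.out.println("rejected " + e.url() + ": " + reason);
                }
            }
        }

        // placeholder for the real checks (double occurrence, robots.txt, profile)
        private String stackCrawl(Entry e) {
            return e.url().startsWith("http") ? null : "unsupported protocol";
        }
    }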
@@ -76,17 +76,7 @@ public class rct_p {
             if (urlRejectReason == null) {
                 // stack url
                 if (sb.getLog().isFinest()) sb.getLog().logFinest("crawlOrder: stack: url='" + url + "'");
-                final String reasonString = sb.crawlStacker.stackCrawl(url, referrer, peerhash, "REMOTE-CRAWLING", loaddate, 0, sb.webIndex.defaultRemoteProfile);
-
-                if (reasonString == null) {
-                    // done
-                    env.getLog().logInfo("crawlOrder: added remote crawl url: " + urlToString(url));
-                } else if (reasonString.startsWith("double")) {
-                    // case where we have already the url loaded;
-                    env.getLog().logInfo("crawlOrder: ignored double remote crawl url: " + urlToString(url));
-                } else {
-                    env.getLog().logInfo("crawlOrder: ignored [" + reasonString + "] remote crawl url: " + urlToString(url));
-                }
+                sb.crawlStacker.enqueueEntry(url, (referrer == null) ? null : referrer.hash(), peerhash, "REMOTE-CRAWLING", loaddate, 0, sb.webIndex.defaultRemoteProfile);
             } else {
                 env.getLog().logWarning("crawlOrder: Rejected URL '" + urlToString(url) + "': " + urlRejectReason);
             }
@@ -362,7 +362,13 @@ public class CrawlQueues {
             // we know a peer which should provide remote crawl entries. load them now.
             final RSSFeed feed = yacyClient.queryRemoteCrawlURLs(sb.webIndex.seedDB, seed, 30, 5000);
-            if (feed == null) return true;
+            if (feed == null || feed.size() == 0) {
+                // something is wrong with this provider. To prevent that we get stuck with this peer
+                // we remove it from the peer list
+                sb.webIndex.peerActions.peerDeparture(seed, "no results from provided remote crawls");
+                return true;
+            }
 
             // parse the rss
             yacyURL url, referrer;
             Date loaddate;
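The second change treats an empty or missing feed as a provider failure and evicts the peer, so the fetcher cannot loop on a dead provider. A hedged sketch of that guard, with Peer, queryProvider, and removePeer as hypothetical stand-ins for the yacyClient/peerActions calls in the hunk above:

    import java.util.Collections;
    import java.util.List;

    public class ProviderGuardSketch {

        record Peer(String hash, String name) {} // hypothetical stand-in

        // Returns the provider's URL list, or an empty list after evicting a
        // provider that failed (null feed) or returned nothing (empty feed).
        List<String> fetchRemoteCrawlUrls(Peer seed) {
            List<String> feed = queryProvider(seed); // may be null on failure
            if (feed == null || feed.isEmpty()) {
                removePeer(seed, "no results from provided remote crawls");
                return Collections.emptyList(); // caller moves on to the next peer
            }
            return feed;
        }

        private List<String> queryProvider(Peer seed) { return null; } // stub
        private void removePeer(Peer seed, String cause) {}            // stub
    }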
@@ -389,17 +395,7 @@ public class CrawlQueues {
             if (urlRejectReason == null) {
                 // stack url
                 if (sb.getLog().isFinest()) sb.getLog().logFinest("crawlOrder: stack: url='" + url + "'");
-                final String reasonString = sb.crawlStacker.stackCrawl(url, referrer, hash, item.getDescription(), loaddate, 0, sb.webIndex.defaultRemoteProfile);
-
-                if (reasonString == null) {
-                    // done
-                    log.logInfo("crawlOrder: added remote crawl url: " + urlToString(url));
-                } else if (reasonString.startsWith("double")) {
-                    // case where we have already the url loaded;
-                    log.logInfo("crawlOrder: ignored double remote crawl url: " + urlToString(url));
-                } else {
-                    log.logInfo("crawlOrder: ignored [" + reasonString + "] remote crawl url: " + urlToString(url));
-                }
+                sb.crawlStacker.enqueueEntry(url, (referrer == null) ? null : referrer.hash(), hash, item.getDescription(), loaddate, 0, sb.webIndex.defaultRemoteProfile);
             } else {
                 log.logWarning("crawlOrder: Rejected URL '" + urlToString(url) + "': " + urlRejectReason);
             }
@@ -210,6 +210,33 @@ public final class CrawlStacker extends Thread {
         return true;
     }
 
+    public String stackCrawl(
+            final yacyURL url,
+            final String referrerhash,
+            final String initiatorHash,
+            final String name,
+            final Date loadDate,
+            final int currentdepth,
+            final CrawlProfile.entry profile) {
+        // stacks a crawl item. The position can also be remote
+        // returns null if successful, a reason string if not successful
+        //this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'");
+
+        // add the url into the crawling queue
+        final CrawlEntry entry = new CrawlEntry(
+                initiatorHash,                               // initiator, needed for p2p-feedback
+                url,                                         // url clear text string
+                (referrerhash == null) ? "" : referrerhash,  // last url in crawling queue
+                name,                                        // the anchor name
+                loadDate,                                    // load date
+                (profile == null) ? null : profile.handle(), // profile must not be null!
+                currentdepth,                                // depth so far
+                0,                                           // anchors, default value
+                0                                            // forkfactor, default value
+        );
+        return stackCrawl(entry);
+    }
+
     public void enqueueEntry(
             final yacyURL nexturl,
             final String referrerhash,
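The new overload takes the referrer as a hash string rather than a yacyURL, which is exactly what the queue consumer has at hand. A hypothetical call-site fragment, not self-contained: sb, log, url, referrerHash, and peerHash are assumed from the surrounding YaCy context, and the Date value is a placeholder.

    // hypothetical call using the new String-referrer overload
    final String reason = sb.crawlStacker.stackCrawl(
            url,                              // yacyURL to be crawled
            referrerHash,                     // referrer hash, may be null
            peerHash,                         // initiator, needed for p2p-feedback
            "REMOTE-CRAWLING",                // anchor name / crawl tag
            new java.util.Date(),             // load date (placeholder)
            0,                                // current depth
            sb.webIndex.defaultRemoteProfile);
    if (reason != null) log.logWarning("not stacked: " + reason);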
@@ -342,26 +369,6 @@ public final class CrawlStacker extends Thread {
         return new CrawlEntry(entry);
     }
 
-    public String stackCrawl(final yacyURL url, final yacyURL referrer, final String initiatorHash, final String name, final Date loadDate, final int currentdepth, final CrawlProfile.entry profile) {
-        // stacks a crawl item. The position can also be remote
-        // returns null if successful, a reason string if not successful
-        //this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'");
-
-        // add the url into the crawling queue
-        final CrawlEntry entry = new CrawlEntry(
-                initiatorHash,                               // initiator, needed for p2p-feedback
-                url,                                         // url clear text string
-                (referrer == null) ? "" : referrer.hash(),   // last url in crawling queue
-                name,                                        // the anchor name
-                loadDate,                                    // load date
-                (profile == null) ? null : profile.handle(), // profile must not be null!
-                currentdepth,                                // depth so far
-                0,                                           // anchors, default value
-                0                                            // forkfactor, default value
-        );
-        return stackCrawl(entry);
-    }
-
     public String stackCrawl(final CrawlEntry entry) {
         // stacks a crawl item. The position can also be remote
         // returns null if successful, a reason string if not successful
@@ -195,7 +195,7 @@ public class urlRedirectord implements serverHandler, Cloneable {
                     sb.crawlQueues.errorURL.remove(urlhash);
 
                     // enqueuing URL for crawling
-                    reasonString = sb.crawlStacker.stackCrawl(
+                    sb.crawlStacker.enqueueEntry(
                         reqURL,
                         null,
                         sb.webIndex.seedDB.mySeed().hash,
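Note that enqueueEntry is declared void (see the CrawlStacker hunk above), so the redirector no longer receives a reason string; any rejection is handled when the stacker processes the queued entry. The contrast, sketched as comments with the remaining arguments elided as in the hunk:

    // before: synchronous; the redirector inspected the result itself
    //   reasonString = sb.crawlStacker.stackCrawl(reqURL, /* ... */);
    // after: fire-and-forget; the rejection reason, if any, surfaces when
    //   the CrawlStacker thread processes the queued entry
    sb.crawlStacker.enqueueEntry(reqURL, null, sb.webIndex.seedDB.mySeed().hash, /* ... */);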