From 1b18d4bcf3956ec31d66c794dc70806f29a38576 Mon Sep 17 00:00:00 2001
From: orbiter
Date: Thu, 6 Nov 2008 12:30:55 +0000
Subject: [PATCH] enhancement to crawling and remote crawling:

- for the redirector and for remote crawling, place the crawling url on the
  notice queue instead of enqueueing it directly in the crawler queue
- when a request to a remote crawl provider fails, remove the peer from the
  network so that the url fetcher does not get stuck on that peer again

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5320 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 htroot/rct_p.java                             | 12 +----
 source/de/anomic/crawler/CrawlQueues.java     | 20 ++++----
 source/de/anomic/crawler/CrawlStacker.java    | 47 +++++++++++--------
 .../anomic/urlRedirector/urlRedirectord.java  |  2 +-
 4 files changed, 37 insertions(+), 44 deletions(-)

diff --git a/htroot/rct_p.java b/htroot/rct_p.java
index 034551b72..e056ec725 100644
--- a/htroot/rct_p.java
+++ b/htroot/rct_p.java
@@ -76,17 +76,7 @@ public class rct_p {
                     if (urlRejectReason == null) {
                         // stack url
                         if (sb.getLog().isFinest()) sb.getLog().logFinest("crawlOrder: stack: url='" + url + "'");
-                        final String reasonString = sb.crawlStacker.stackCrawl(url, referrer, peerhash, "REMOTE-CRAWLING", loaddate, 0, sb.webIndex.defaultRemoteProfile);
-
-                        if (reasonString == null) {
-                            // done
-                            env.getLog().logInfo("crawlOrder: added remote crawl url: " + urlToString(url));
-                        } else if (reasonString.startsWith("double")) {
-                            // case where we have already the url loaded;
-                            env.getLog().logInfo("crawlOrder: ignored double remote crawl url: " + urlToString(url));
-                        } else {
-                            env.getLog().logInfo("crawlOrder: ignored [" + reasonString + "] remote crawl url: " + urlToString(url));
-                        }
+                        sb.crawlStacker.enqueueEntry(url, (referrer == null) ? null : referrer.hash(), peerhash, "REMOTE-CRAWLING", loaddate, 0, sb.webIndex.defaultRemoteProfile);
                     } else {
                         env.getLog().logWarning("crawlOrder: Rejected URL '" + urlToString(url) + "': " + urlRejectReason);
                     }
diff --git a/source/de/anomic/crawler/CrawlQueues.java b/source/de/anomic/crawler/CrawlQueues.java
index 938726f18..e51b01d86 100644
--- a/source/de/anomic/crawler/CrawlQueues.java
+++ b/source/de/anomic/crawler/CrawlQueues.java
@@ -362,7 +362,13 @@ public class CrawlQueues {
         // we know a peer which should provide remote crawl entries. load them now.
         final RSSFeed feed = yacyClient.queryRemoteCrawlURLs(sb.webIndex.seedDB, seed, 30, 5000);
-        if (feed == null) return true;
+        if (feed == null || feed.size() == 0) {
+            // something is wrong with this provider. To avoid getting stuck with this peer,
+            // we remove it from the peer list
+            sb.webIndex.peerActions.peerDeparture(seed, "no results from provided remote crawls");
+            return true;
+        }
+
        // parse the rss
        yacyURL url, referrer;
        Date loaddate;
@@ -389,17 +395,7 @@ public class CrawlQueues {
                    if (urlRejectReason == null) {
                        // stack url
                        if (sb.getLog().isFinest()) sb.getLog().logFinest("crawlOrder: stack: url='" + url + "'");
-                        final String reasonString = sb.crawlStacker.stackCrawl(url, referrer, hash, item.getDescription(), loaddate, 0, sb.webIndex.defaultRemoteProfile);
-
-                        if (reasonString == null) {
-                            // done
-                            log.logInfo("crawlOrder: added remote crawl url: " + urlToString(url));
-                        } else if (reasonString.startsWith("double")) {
-                            // case where we have already the url loaded;
-                            log.logInfo("crawlOrder: ignored double remote crawl url: " + urlToString(url));
-                        } else {
-                            log.logInfo("crawlOrder: ignored [" + reasonString + "] remote crawl url: " + urlToString(url));
-                        }
+                        sb.crawlStacker.enqueueEntry(url, (referrer == null) ? null : referrer.hash(), hash, item.getDescription(), loaddate, 0, sb.webIndex.defaultRemoteProfile);
                    } else {
                        log.logWarning("crawlOrder: Rejected URL '" + urlToString(url) + "': " + urlRejectReason);
                    }
diff --git a/source/de/anomic/crawler/CrawlStacker.java b/source/de/anomic/crawler/CrawlStacker.java
index 6b9d76ef8..c0a5f0eed 100644
--- a/source/de/anomic/crawler/CrawlStacker.java
+++ b/source/de/anomic/crawler/CrawlStacker.java
@@ -210,6 +210,33 @@ public final class CrawlStacker extends Thread {
        return true;
    }

+    public String stackCrawl(
+            final yacyURL url,
+            final String referrerhash,
+            final String initiatorHash,
+            final String name,
+            final Date loadDate,
+            final int currentdepth,
+            final CrawlProfile.entry profile) {
+        // stacks a crawl item. The position can also be remote
+        // returns null if successful, a reason string if not successful
+        //this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'");
+
+        // add the url into the crawling queue
+        final CrawlEntry entry = new CrawlEntry(
+                initiatorHash,                               // initiator, needed for p2p-feedback
+                url,                                         // url clear text string
+                (referrerhash == null) ? "" : referrerhash,  // last url in crawling queue
+                name,                                        // the anchor name
+                loadDate,                                    // load date
+                (profile == null) ? null : profile.handle(), // profile must not be null!
+                currentdepth,                                // depth so far
+                0,                                           // anchors, default value
+                0                                            // forkfactor, default value
+        );
+        return stackCrawl(entry);
+    }
+
    public void enqueueEntry(
            final yacyURL nexturl,
            final String referrerhash,
@@ -342,26 +369,6 @@ public final class CrawlStacker extends Thread {
        return new CrawlEntry(entry);
    }

-    public String stackCrawl(final yacyURL url, final yacyURL referrer, final String initiatorHash, final String name, final Date loadDate, final int currentdepth, final CrawlProfile.entry profile) {
-        // stacks a crawl item. The position can also be remote
-        // returns null if successful, a reason string if not successful
-        //this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'");
-
-        // add the url into the crawling queue
-        final CrawlEntry entry = new CrawlEntry(
-                initiatorHash,                               // initiator, needed for p2p-feedback
-                url,                                         // url clear text string
-                (referrer == null) ? "" : referrer.hash(),   // last url in crawling queue
-                name,                                        // load date
-                loadDate,                                    // the anchor name
-                (profile == null) ? null : profile.handle(), // profile must not be null!
-                currentdepth,                                // depth so far
-                0,                                           // anchors, default value
-                0                                            // forkfactor, default value
-        );
-        return stackCrawl(entry);
-    }
-
    public String stackCrawl(final CrawlEntry entry) {
        // stacks a crawl item. The position can also be remote
        // returns null if successful, a reason string if not successful
diff --git a/source/de/anomic/urlRedirector/urlRedirectord.java b/source/de/anomic/urlRedirector/urlRedirectord.java
index 9f0d021b0..a02e86d87 100644
--- a/source/de/anomic/urlRedirector/urlRedirectord.java
+++ b/source/de/anomic/urlRedirector/urlRedirectord.java
@@ -195,7 +195,7 @@ public class urlRedirectord implements serverHandler, Cloneable {
                    sb.crawlQueues.errorURL.remove(urlhash);

                    // enqueuing URL for crawling
-                    reasonString = sb.crawlStacker.stackCrawl(
+                    sb.crawlStacker.enqueueEntry(
                        reqURL,
                        null,
                        sb.webIndex.seedDB.mySeed().hash,
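
Illustration only, not part of the patch: all touched call sites follow the same pattern
as in rct_p.java above. A minimal sketch of the new caller side, assuming the local
variables url, referrer, peerhash and loaddate are prepared as in that servlet:

    // hand the URL over to the CrawlStacker's queue; unlike the old stackCrawl(...) call,
    // enqueueEntry(...) is declared void, so there is no reason string to inspect here --
    // the URL is only queued and is validated/stacked later by the CrawlStacker thread
    sb.crawlStacker.enqueueEntry(
            url,                                         // the URL to be crawled
            (referrer == null) ? null : referrer.hash(), // hash of the referrer URL, if known
            peerhash,                                    // hash of the initiating peer
            "REMOTE-CRAWLING",                           // name/label of the crawl entry
            loaddate,                                    // load date reported by the provider
            0,                                           // current crawl depth
            sb.webIndex.defaultRemoteProfile);           // crawl profile for remote crawls

As a consequence, the per-call-site handling of the returned reason string (the "double"
check and the logInfo calls) is removed; presumably such rejections are reported by the
stacker itself when it processes the queue.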