Mirror of https://github.com/yacy/yacy_search_server.git
enhancement to crawling and remote crawling:

- for redirector and remote crawling, place the crawling URL on the notice queue instead of enqueueing it directly in the crawler queue
- when a request to a remote crawl provider fails, remove the peer from the network to prevent the URL fetcher from getting stuck on it again

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5320 6c8d7289-2bf4-0310-a012-ef5d649a1542
parent 3f746be5d4
commit 1b18d4bcf3
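The first change in the diff below is easiest to read as a producer/consumer split: callers stop invoking stackCrawl(...) synchronously and inspecting the returned reason string, and instead drop the URL onto a notice queue that the stacker thread drains. The following is a minimal, self-contained sketch of that pattern, not YaCy's actual classes: Entry, enqueueEntry, and runStacker are illustrative stand-ins for CrawlEntry, CrawlStacker.enqueueEntry, and the CrawlStacker thread.

    import java.util.concurrent.BlockingQueue;
    import java.util.concurrent.LinkedBlockingQueue;

    // Toy model of the notice-queue pattern adopted by this commit.
    public class NoticeQueueSketch {

        record Entry(String url, String initiator) {} // stand-in for CrawlEntry

        private final BlockingQueue<Entry> noticeQueue = new LinkedBlockingQueue<>();

        // producer side (what rct_p and the redirector now call)
        public void enqueueEntry(String url, String initiator) {
            noticeQueue.offer(new Entry(url, initiator));
        }

        // consumer side (what the stacker thread does asynchronously)
        public void runStacker() throws InterruptedException {
            while (!Thread.currentThread().isInterrupted()) {
                Entry e = noticeQueue.take();
                String reason = stackCrawl(e); // null means the URL was accepted
                if (reason != null) {
                    System.out.println("rejected " + e.url() + ": " + reason);
                }
            }
        }

        // placeholder for the real checks (double occurrence, robots.txt, profile)
        private String stackCrawl(Entry e) {
            return e.url().startsWith("http") ? null : "unsupported protocol";
        }
    }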
@@ -76,17 +76,7 @@ public class rct_p {
             if (urlRejectReason == null) {
                 // stack url
                 if (sb.getLog().isFinest()) sb.getLog().logFinest("crawlOrder: stack: url='" + url + "'");
-                final String reasonString = sb.crawlStacker.stackCrawl(url, referrer, peerhash, "REMOTE-CRAWLING", loaddate, 0, sb.webIndex.defaultRemoteProfile);
-
-                if (reasonString == null) {
-                    // done
-                    env.getLog().logInfo("crawlOrder: added remote crawl url: " + urlToString(url));
-                } else if (reasonString.startsWith("double")) {
-                    // case where we have already the url loaded;
-                    env.getLog().logInfo("crawlOrder: ignored double remote crawl url: " + urlToString(url));
-                } else {
-                    env.getLog().logInfo("crawlOrder: ignored [" + reasonString + "] remote crawl url: " + urlToString(url));
-                }
+                sb.crawlStacker.enqueueEntry(url, (referrer == null) ? null : referrer.hash(), peerhash, "REMOTE-CRAWLING", loaddate, 0, sb.webIndex.defaultRemoteProfile);
             } else {
                 env.getLog().logWarning("crawlOrder: Rejected URL '" + urlToString(url) + "': " + urlRejectReason);
             }
@@ -362,7 +362,13 @@ public class CrawlQueues {
             // we know a peer which should provide remote crawl entries. load them now.
             final RSSFeed feed = yacyClient.queryRemoteCrawlURLs(sb.webIndex.seedDB, seed, 30, 5000);
-            if (feed == null) return true;
+            if (feed == null || feed.size() == 0) {
+                // something is wrong with this provider. To prevent that we get stuck with this peer
+                // we remove it from the peer list
+                sb.webIndex.peerActions.peerDeparture(seed, "no results from provided remote crawls");
+                return true;
+            }
 
             // parse the rss
             yacyURL url, referrer;
             Date loaddate;
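The second change treats an empty or missing feed as a provider failure and evicts the peer, so the fetcher cannot loop on a dead provider. A hedged sketch of that guard, with Peer, queryProvider, and removePeer as hypothetical stand-ins for the yacyClient/peerActions calls in the hunk above:

    import java.util.Collections;
    import java.util.List;

    public class ProviderGuardSketch {

        record Peer(String hash, String name) {} // hypothetical stand-in

        // Returns the provider's URL list, or an empty list after evicting a
        // provider that failed (null feed) or returned nothing (empty feed).
        List<String> fetchRemoteCrawlUrls(Peer seed) {
            List<String> feed = queryProvider(seed); // may be null on failure
            if (feed == null || feed.isEmpty()) {
                removePeer(seed, "no results from provided remote crawls");
                return Collections.emptyList(); // caller moves on to the next peer
            }
            return feed;
        }

        private List<String> queryProvider(Peer seed) { return null; } // stub
        private void removePeer(Peer seed, String cause) {}            // stub
    }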
@@ -389,17 +395,7 @@ public class CrawlQueues {
             if (urlRejectReason == null) {
                 // stack url
                 if (sb.getLog().isFinest()) sb.getLog().logFinest("crawlOrder: stack: url='" + url + "'");
-                final String reasonString = sb.crawlStacker.stackCrawl(url, referrer, hash, item.getDescription(), loaddate, 0, sb.webIndex.defaultRemoteProfile);
-
-                if (reasonString == null) {
-                    // done
-                    log.logInfo("crawlOrder: added remote crawl url: " + urlToString(url));
-                } else if (reasonString.startsWith("double")) {
-                    // case where we have already the url loaded;
-                    log.logInfo("crawlOrder: ignored double remote crawl url: " + urlToString(url));
-                } else {
-                    log.logInfo("crawlOrder: ignored [" + reasonString + "] remote crawl url: " + urlToString(url));
-                }
+                sb.crawlStacker.enqueueEntry(url, (referrer == null) ? null : referrer.hash(), hash, item.getDescription(), loaddate, 0, sb.webIndex.defaultRemoteProfile);
             } else {
                 log.logWarning("crawlOrder: Rejected URL '" + urlToString(url) + "': " + urlRejectReason);
             }
@@ -210,6 +210,33 @@ public final class CrawlStacker extends Thread {
         return true;
     }
 
+    public String stackCrawl(
+            final yacyURL url,
+            final String referrerhash,
+            final String initiatorHash,
+            final String name,
+            final Date loadDate,
+            final int currentdepth,
+            final CrawlProfile.entry profile) {
+        // stacks a crawl item. The position can also be remote
+        // returns null if successful, a reason string if not successful
+        //this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'");
+
+        // add the url into the crawling queue
+        final CrawlEntry entry = new CrawlEntry(
+                initiatorHash,                               // initiator, needed for p2p-feedback
+                url,                                         // url clear text string
+                (referrerhash == null) ? "" : referrerhash,  // last url in crawling queue
+                name,                                        // the anchor name
+                loadDate,                                    // load date
+                (profile == null) ? null : profile.handle(), // profile must not be null!
+                currentdepth,                                // depth so far
+                0,                                           // anchors, default value
+                0                                            // forkfactor, default value
+        );
+        return stackCrawl(entry);
+    }
+
     public void enqueueEntry(
             final yacyURL nexturl,
             final String referrerhash,
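The new overload takes the referrer as a hash string rather than a yacyURL, which is exactly what the queue consumer has at hand. A hypothetical call-site fragment, not self-contained: sb, log, url, referrerHash, and peerHash are assumed from the surrounding YaCy context, and the Date value is a placeholder.

    // hypothetical call using the new String-referrer overload
    final String reason = sb.crawlStacker.stackCrawl(
            url,                              // yacyURL to be crawled
            referrerHash,                     // referrer hash, may be null
            peerHash,                         // initiator, needed for p2p-feedback
            "REMOTE-CRAWLING",                // anchor name / crawl tag
            new java.util.Date(),             // load date (placeholder)
            0,                                // current depth
            sb.webIndex.defaultRemoteProfile);
    if (reason != null) log.logWarning("not stacked: " + reason);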
@@ -342,26 +369,6 @@ public final class CrawlStacker extends Thread {
         return new CrawlEntry(entry);
     }
 
-    public String stackCrawl(final yacyURL url, final yacyURL referrer, final String initiatorHash, final String name, final Date loadDate, final int currentdepth, final CrawlProfile.entry profile) {
-        // stacks a crawl item. The position can also be remote
-        // returns null if successful, a reason string if not successful
-        //this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'");
-
-        // add the url into the crawling queue
-        final CrawlEntry entry = new CrawlEntry(
-                initiatorHash,                               // initiator, needed for p2p-feedback
-                url,                                         // url clear text string
-                (referrer == null) ? "" : referrer.hash(),   // last url in crawling queue
-                name,                                        // the anchor name
-                loadDate,                                    // load date
-                (profile == null) ? null : profile.handle(), // profile must not be null!
-                currentdepth,                                // depth so far
-                0,                                           // anchors, default value
-                0                                            // forkfactor, default value
-        );
-        return stackCrawl(entry);
-    }
-
     public String stackCrawl(final CrawlEntry entry) {
         // stacks a crawl item. The position can also be remote
         // returns null if successful, a reason string if not successful
@@ -195,7 +195,7 @@ public class urlRedirectord implements serverHandler, Cloneable {
                     sb.crawlQueues.errorURL.remove(urlhash);
 
                     // enqueuing URL for crawling
-                    reasonString = sb.crawlStacker.stackCrawl(
+                    sb.crawlStacker.enqueueEntry(
                         reqURL,
                         null,
                         sb.webIndex.seedDB.mySeed().hash,
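Note that enqueueEntry is declared void (see the CrawlStacker hunk above), so the redirector no longer receives a reason string; any rejection is handled when the stacker processes the queued entry. The contrast, sketched as comments with the remaining arguments elided as in the hunk:

    // before: synchronous; the redirector inspected the result itself
    //   reasonString = sb.crawlStacker.stackCrawl(reqURL, /* ... */);
    // after: fire-and-forget; the rejection reason, if any, surfaces when
    //   the CrawlStacker thread processes the queued entry
    sb.crawlStacker.enqueueEntry(reqURL, null, sb.webIndex.seedDB.mySeed().hash, /* ... */);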