From 1b18d4bcf3956ec31d66c794dc70806f29a38576 Mon Sep 17 00:00:00 2001
From: orbiter
Date: Thu, 6 Nov 2008 12:30:55 +0000
Subject: [PATCH] enhancement to crawling and remote crawling:

- for the redirector and for remote crawling, place the crawling url on the
  notice queue instead of enqueueing it directly in the crawler queue
- when a request to a remote crawl provider fails, remove the peer from the
  network so that the url fetcher does not get stuck on that peer again

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5320 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 htroot/rct_p.java                             | 12 +----
 source/de/anomic/crawler/CrawlQueues.java     | 20 ++++----
 source/de/anomic/crawler/CrawlStacker.java    | 47 +++++++++++--------
 .../anomic/urlRedirector/urlRedirectord.java  |  2 +-
 4 files changed, 37 insertions(+), 44 deletions(-)

diff --git a/htroot/rct_p.java b/htroot/rct_p.java
index 034551b72..e056ec725 100644
--- a/htroot/rct_p.java
+++ b/htroot/rct_p.java
@@ -76,17 +76,7 @@ public class rct_p {
                     if (urlRejectReason == null) {
                         // stack url
                         if (sb.getLog().isFinest()) sb.getLog().logFinest("crawlOrder: stack: url='" + url + "'");
-                        final String reasonString = sb.crawlStacker.stackCrawl(url, referrer, peerhash, "REMOTE-CRAWLING", loaddate, 0, sb.webIndex.defaultRemoteProfile);
-
-                        if (reasonString == null) {
-                            // done
-                            env.getLog().logInfo("crawlOrder: added remote crawl url: " + urlToString(url));
-                        } else if (reasonString.startsWith("double")) {
-                            // case where we have already the url loaded;
-                            env.getLog().logInfo("crawlOrder: ignored double remote crawl url: " + urlToString(url));
-                        } else {
-                            env.getLog().logInfo("crawlOrder: ignored [" + reasonString + "] remote crawl url: " + urlToString(url));
-                        }
+                        sb.crawlStacker.enqueueEntry(url, (referrer == null) ? null : referrer.hash(), peerhash, "REMOTE-CRAWLING", loaddate, 0, sb.webIndex.defaultRemoteProfile);
                     } else {
                         env.getLog().logWarning("crawlOrder: Rejected URL '" + urlToString(url) + "': " + urlRejectReason);
                     }
diff --git a/source/de/anomic/crawler/CrawlQueues.java b/source/de/anomic/crawler/CrawlQueues.java
index 938726f18..e51b01d86 100644
--- a/source/de/anomic/crawler/CrawlQueues.java
+++ b/source/de/anomic/crawler/CrawlQueues.java
@@ -362,7 +362,13 @@ public class CrawlQueues {
         // we know a peer which should provide remote crawl entries. load them now.
         final RSSFeed feed = yacyClient.queryRemoteCrawlURLs(sb.webIndex.seedDB, seed, 30, 5000);
-        if (feed == null) return true;
+        if (feed == null || feed.size() == 0) {
+            // something is wrong with this provider. To avoid getting stuck with this peer,
+            // we remove it from the peer list
+            sb.webIndex.peerActions.peerDeparture(seed, "no results from provided remote crawls");
+            return true;
+        }
+
        // parse the rss
        yacyURL url, referrer;
        Date loaddate;
@@ -389,17 +395,7 @@ public class CrawlQueues {
                    if (urlRejectReason == null) {
                        // stack url
                        if (sb.getLog().isFinest()) sb.getLog().logFinest("crawlOrder: stack: url='" + url + "'");
-                        final String reasonString = sb.crawlStacker.stackCrawl(url, referrer, hash, item.getDescription(), loaddate, 0, sb.webIndex.defaultRemoteProfile);
-
-                        if (reasonString == null) {
-                            // done
-                            log.logInfo("crawlOrder: added remote crawl url: " + urlToString(url));
-                        } else if (reasonString.startsWith("double")) {
-                            // case where we have already the url loaded;
-                            log.logInfo("crawlOrder: ignored double remote crawl url: " + urlToString(url));
-                        } else {
-                            log.logInfo("crawlOrder: ignored [" + reasonString + "] remote crawl url: " + urlToString(url));
-                        }
+                        sb.crawlStacker.enqueueEntry(url, (referrer == null) ? null : referrer.hash(), hash, item.getDescription(), loaddate, 0, sb.webIndex.defaultRemoteProfile);
                    } else {
                        log.logWarning("crawlOrder: Rejected URL '" + urlToString(url) + "': " + urlRejectReason);
                    }
diff --git a/source/de/anomic/crawler/CrawlStacker.java b/source/de/anomic/crawler/CrawlStacker.java
index 6b9d76ef8..c0a5f0eed 100644
--- a/source/de/anomic/crawler/CrawlStacker.java
+++ b/source/de/anomic/crawler/CrawlStacker.java
@@ -210,6 +210,33 @@ public final class CrawlStacker extends Thread {
        return true;
    }

+    public String stackCrawl(
+            final yacyURL url,
+            final String referrerhash,
+            final String initiatorHash,
+            final String name,
+            final Date loadDate,
+            final int currentdepth,
+            final CrawlProfile.entry profile) {
+        // stacks a crawl item. The position can also be remote
+        // returns null if successful, a reason string if not successful
+        //this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'");
+
+        // add the url into the crawling queue
+        final CrawlEntry entry = new CrawlEntry(
+                initiatorHash,                               // initiator, needed for p2p-feedback
+                url,                                         // url clear text string
+                (referrerhash == null) ? "" : referrerhash,  // last url in crawling queue
+                name,                                        // the anchor name
+                loadDate,                                    // load date
+                (profile == null) ? null : profile.handle(), // profile must not be null!
+                currentdepth,                                // depth so far
+                0,                                           // anchors, default value
+                0                                            // forkfactor, default value
+        );
+        return stackCrawl(entry);
+    }
+
    public void enqueueEntry(
            final yacyURL nexturl,
            final String referrerhash,
@@ -342,26 +369,6 @@ public final class CrawlStacker extends Thread {
        return new CrawlEntry(entry);
    }

-    public String stackCrawl(final yacyURL url, final yacyURL referrer, final String initiatorHash, final String name, final Date loadDate, final int currentdepth, final CrawlProfile.entry profile) {
-        // stacks a crawl item. The position can also be remote
-        // returns null if successful, a reason string if not successful
-        //this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'");
-
-        // add the url into the crawling queue
-        final CrawlEntry entry = new CrawlEntry(
-                initiatorHash,                               // initiator, needed for p2p-feedback
-                url,                                         // url clear text string
-                (referrer == null) ? "" : referrer.hash(),   // last url in crawling queue
-                name,                                        // load date
-                loadDate,                                    // the anchor name
-                (profile == null) ? null : profile.handle(), // profile must not be null!
-                currentdepth,                                // depth so far
-                0,                                           // anchors, default value
-                0                                            // forkfactor, default value
-        );
-        return stackCrawl(entry);
-    }
-
    public String stackCrawl(final CrawlEntry entry) {
        // stacks a crawl item. The position can also be remote
        // returns null if successful, a reason string if not successful
diff --git a/source/de/anomic/urlRedirector/urlRedirectord.java b/source/de/anomic/urlRedirector/urlRedirectord.java
index 9f0d021b0..a02e86d87 100644
--- a/source/de/anomic/urlRedirector/urlRedirectord.java
+++ b/source/de/anomic/urlRedirector/urlRedirectord.java
@@ -195,7 +195,7 @@ public class urlRedirectord implements serverHandler, Cloneable {
                    sb.crawlQueues.errorURL.remove(urlhash);

                    // enqueuing URL for crawling
-                    reasonString = sb.crawlStacker.stackCrawl(
+                    sb.crawlStacker.enqueueEntry(
                        reqURL,
                        null,
                        sb.webIndex.seedDB.mySeed().hash,
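
Illustration only, not part of the patch: all touched call sites follow the same pattern
as in rct_p.java above. A minimal sketch of the new caller side, assuming the local
variables url, referrer, peerhash and loaddate are prepared as in that servlet:

    // hand the URL over to the CrawlStacker's queue; unlike the old stackCrawl(...) call,
    // enqueueEntry(...) is declared void, so there is no reason string to inspect here --
    // the URL is only queued and is validated/stacked later by the CrawlStacker thread
    sb.crawlStacker.enqueueEntry(
            url,                                         // the URL to be crawled
            (referrer == null) ? null : referrer.hash(), // hash of the referrer URL, if known
            peerhash,                                    // hash of the initiating peer
            "REMOTE-CRAWLING",                           // name/label of the crawl entry
            loaddate,                                    // load date reported by the provider
            0,                                           // current crawl depth
            sb.webIndex.defaultRemoteProfile);           // crawl profile for remote crawls

As a consequence, the per-call-site handling of the returned reason string (the "double"
check and the logInfo calls) is removed; presumably such rejections are reported by the
stacker itself when it processes the queue.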