mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
Fixed processing of redirected URLs used as crawl start points.
See mantis 699 (http://mantis.tokeek.de/view.php?id=699) for details.
This commit is contained in:
parent
68217465fe
commit
6f49ece22f
|
@ -162,8 +162,20 @@ public final class HTTPLoader {
|
|||
if (profile != null && !CrawlSwitchboard.DEFAULT_PROFILES.contains(profile.name())) {
|
||||
// put redirect url on the crawler queue to repeat a
|
||||
// double-check
|
||||
request.redirectURL(redirectionUrl);
|
||||
this.sb.crawlStacker.stackCrawl(request);
|
||||
/* We have to clone the request instance and not to modify directly its URL,
|
||||
* otherwise the stackCrawl() function would reject it, because detecting it as already in the activeWorkerEntries */
|
||||
Request redirectedRequest = new Request(request.initiator(),
|
||||
redirectionUrl,
|
||||
request.referrerhash(),
|
||||
request.name(),
|
||||
request.appdate(),
|
||||
request.profileHandle(),
|
||||
request.depth(),
|
||||
request.timezoneOffset());
|
||||
String rejectReason = this.sb.crawlStacker.stackCrawl(redirectedRequest);
|
||||
if(rejectReason != null) {
|
||||
throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " aborted. Reason : " + rejectReason);
|
||||
}
|
||||
// in the end we must throw an exception (even if this is
|
||||
// not an error, just to abort the current process
|
||||
throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " to "
|
||||
|
@ -349,10 +361,24 @@ public final class HTTPLoader {
|
|||
// we have two use cases here: loading from a crawl or just loading the url. Check this:
|
||||
if (profile != null && !CrawlSwitchboard.DEFAULT_PROFILES.contains(profile.name())) {
|
||||
// put redirect url on the crawler queue to repeat a double-check
|
||||
request.redirectURL(redirectionUrl);
|
||||
this.sb.crawlStacker.stackCrawl(request);
|
||||
/* We have to clone the request instance and not to modify directly its URL,
|
||||
* otherwise the stackCrawl() function would reject it, because detecting it as already in the activeWorkerEntries */
|
||||
Request redirectedRequest = new Request(request.initiator(),
|
||||
redirectionUrl,
|
||||
request.referrerhash(),
|
||||
request.name(),
|
||||
request.appdate(),
|
||||
request.profileHandle(),
|
||||
request.depth(),
|
||||
request.timezoneOffset());
|
||||
String rejectReason = this.sb.crawlStacker.stackCrawl(redirectedRequest);
|
||||
// in the end we must throw an exception (even if this is not an error, just to abort the current process
|
||||
if(rejectReason != null) {
|
||||
throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " aborted. Reason : " + rejectReason);
|
||||
}
|
||||
throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " to " + redirectionUrl.toNormalform(false) + " placed on crawler queue for double-check");
|
||||
|
||||
|
||||
}
|
||||
|
||||
// if we are already doing a shutdown we don't need to retry crawling
|
||||
|
|
Loading…
Reference in New Issue
Block a user