Fixed processing of redirected URLs used as a crawl start point.

See mantis 699 (http://mantis.tokeek.de/view.php?id=699) for details.
This commit is contained in:
luccioman 2016-10-20 12:12:26 +02:00
parent 68217465fe
commit 6f49ece22f

View File

@@ -162,8 +162,20 @@ public final class HTTPLoader {
         if (profile != null && !CrawlSwitchboard.DEFAULT_PROFILES.contains(profile.name())) {
             // put redirect url on the crawler queue to repeat a
             // double-check
-            request.redirectURL(redirectionUrl);
-            this.sb.crawlStacker.stackCrawl(request);
+            /* We have to clone the request instance and not to modify directly its URL,
+             * otherwise the stackCrawl() function would reject it, because detecting it as already in the activeWorkerEntries */
+            Request redirectedRequest = new Request(request.initiator(),
+                    redirectionUrl,
+                    request.referrerhash(),
+                    request.name(),
+                    request.appdate(),
+                    request.profileHandle(),
+                    request.depth(),
+                    request.timezoneOffset());
+            String rejectReason = this.sb.crawlStacker.stackCrawl(redirectedRequest);
+            if(rejectReason != null) {
+                throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " aborted. Reason : " + rejectReason);
+            }
             // in the end we must throw an exception (even if this is
             // not an error, just to abort the current process
             throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " to "
@@ -349,10 +361,24 @@ public final class HTTPLoader {
         // we have two use cases here: loading from a crawl or just loading the url. Check this:
         if (profile != null && !CrawlSwitchboard.DEFAULT_PROFILES.contains(profile.name())) {
             // put redirect url on the crawler queue to repeat a double-check
-            request.redirectURL(redirectionUrl);
-            this.sb.crawlStacker.stackCrawl(request);
+            /* We have to clone the request instance and not to modify directly its URL,
+             * otherwise the stackCrawl() function would reject it, because detecting it as already in the activeWorkerEntries */
+            Request redirectedRequest = new Request(request.initiator(),
+                    redirectionUrl,
+                    request.referrerhash(),
+                    request.name(),
+                    request.appdate(),
+                    request.profileHandle(),
+                    request.depth(),
+                    request.timezoneOffset());
+            String rejectReason = this.sb.crawlStacker.stackCrawl(redirectedRequest);
             // in the end we must throw an exception (even if this is not an error, just to abort the current process
+            if(rejectReason != null) {
+                throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " aborted. Reason : " + rejectReason);
+            }
             throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " to " + redirectionUrl.toNormalform(false) + " placed on crawler queue for double-check");
         }
         // if we are already doing a shutdown we don't need to retry crawling