mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
Fixed processing of redirected URLs used as crawl start points.
See mantis 699 (http://mantis.tokeek.de/view.php?id=699) for details.
This commit is contained in:
parent
68217465fe
commit
6f49ece22f
|
@ -162,8 +162,20 @@ public final class HTTPLoader {
|
|||
if (profile != null && !CrawlSwitchboard.DEFAULT_PROFILES.contains(profile.name())) {
|
||||
// put redirect url on the crawler queue to repeat a
|
||||
// double-check
|
||||
request.redirectURL(redirectionUrl);
|
||||
this.sb.crawlStacker.stackCrawl(request);
|
||||
/* We have to clone the request instance and not to modify directly its URL,
|
||||
* otherwise the stackCrawl() function would reject it, because detecting it as already in the activeWorkerEntries */
|
||||
Request redirectedRequest = new Request(request.initiator(),
|
||||
redirectionUrl,
|
||||
request.referrerhash(),
|
||||
request.name(),
|
||||
request.appdate(),
|
||||
request.profileHandle(),
|
||||
request.depth(),
|
||||
request.timezoneOffset());
|
||||
String rejectReason = this.sb.crawlStacker.stackCrawl(redirectedRequest);
|
||||
if(rejectReason != null) {
|
||||
throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " aborted. Reason : " + rejectReason);
|
||||
}
|
||||
// in the end we must throw an exception (even if this is
|
||||
// not an error, just to abort the current process
|
||||
throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " to "
|
||||
|
@ -349,10 +361,24 @@ public final class HTTPLoader {
|
|||
// we have two use cases here: loading from a crawl or just loading the url. Check this:
|
||||
if (profile != null && !CrawlSwitchboard.DEFAULT_PROFILES.contains(profile.name())) {
|
||||
// put redirect url on the crawler queue to repeat a double-check
|
||||
request.redirectURL(redirectionUrl);
|
||||
this.sb.crawlStacker.stackCrawl(request);
|
||||
/* We have to clone the request instance and not to modify directly its URL,
|
||||
* otherwise the stackCrawl() function would reject it, because detecting it as already in the activeWorkerEntries */
|
||||
Request redirectedRequest = new Request(request.initiator(),
|
||||
redirectionUrl,
|
||||
request.referrerhash(),
|
||||
request.name(),
|
||||
request.appdate(),
|
||||
request.profileHandle(),
|
||||
request.depth(),
|
||||
request.timezoneOffset());
|
||||
String rejectReason = this.sb.crawlStacker.stackCrawl(redirectedRequest);
|
||||
// in the end we must throw an exception (even if this is not an error, just to abort the current process
|
||||
if(rejectReason != null) {
|
||||
throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " aborted. Reason : " + rejectReason);
|
||||
}
|
||||
throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " to " + redirectionUrl.toNormalform(false) + " placed on crawler queue for double-check");
|
||||
|
||||
|
||||
}
|
||||
|
||||
// if we are already doing a shutdown we don't need to retry crawling
|
||||
|
|
Loading…
Reference in New Issue
Block a user