mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
Fixed processing of redirected URLs used as crawl start points.
See mantis 699 (http://mantis.tokeek.de/view.php?id=699) for details.
This commit is contained in:
parent
68217465fe
commit
6f49ece22f
|
@ -162,8 +162,20 @@ public final class HTTPLoader {
|
||||||
if (profile != null && !CrawlSwitchboard.DEFAULT_PROFILES.contains(profile.name())) {
|
if (profile != null && !CrawlSwitchboard.DEFAULT_PROFILES.contains(profile.name())) {
|
||||||
// put redirect url on the crawler queue to repeat a
|
// put redirect url on the crawler queue to repeat a
|
||||||
// double-check
|
// double-check
|
||||||
request.redirectURL(redirectionUrl);
|
/* We have to clone the request instance rather than modify its URL directly,
|
||||||
this.sb.crawlStacker.stackCrawl(request);
|
* otherwise the stackCrawl() function would reject it, because it would be detected as already in the activeWorkerEntries */
|
||||||
|
Request redirectedRequest = new Request(request.initiator(),
|
||||||
|
redirectionUrl,
|
||||||
|
request.referrerhash(),
|
||||||
|
request.name(),
|
||||||
|
request.appdate(),
|
||||||
|
request.profileHandle(),
|
||||||
|
request.depth(),
|
||||||
|
request.timezoneOffset());
|
||||||
|
String rejectReason = this.sb.crawlStacker.stackCrawl(redirectedRequest);
|
||||||
|
if(rejectReason != null) {
|
||||||
|
throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " aborted. Reason : " + rejectReason);
|
||||||
|
}
|
||||||
// in the end we must throw an exception (even if this is
|
// in the end we must throw an exception (even if this is
|
||||||
// not an error, just to abort the current process
|
// not an error, just to abort the current process
|
||||||
throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " to "
|
throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " to "
|
||||||
|
@ -349,10 +361,24 @@ public final class HTTPLoader {
|
||||||
// we have two use cases here: loading from a crawl or just loading the url. Check this:
|
// we have two use cases here: loading from a crawl or just loading the url. Check this:
|
||||||
if (profile != null && !CrawlSwitchboard.DEFAULT_PROFILES.contains(profile.name())) {
|
if (profile != null && !CrawlSwitchboard.DEFAULT_PROFILES.contains(profile.name())) {
|
||||||
// put redirect url on the crawler queue to repeat a double-check
|
// put redirect url on the crawler queue to repeat a double-check
|
||||||
request.redirectURL(redirectionUrl);
|
/* We have to clone the request instance rather than modify its URL directly,
|
||||||
this.sb.crawlStacker.stackCrawl(request);
|
* otherwise the stackCrawl() function would reject it, because it would be detected as already in the activeWorkerEntries */
|
||||||
|
Request redirectedRequest = new Request(request.initiator(),
|
||||||
|
redirectionUrl,
|
||||||
|
request.referrerhash(),
|
||||||
|
request.name(),
|
||||||
|
request.appdate(),
|
||||||
|
request.profileHandle(),
|
||||||
|
request.depth(),
|
||||||
|
request.timezoneOffset());
|
||||||
|
String rejectReason = this.sb.crawlStacker.stackCrawl(redirectedRequest);
|
||||||
// in the end we must throw an exception (even if this is not an error, just to abort the current process
|
// in the end we must throw an exception (even if this is not an error, just to abort the current process
|
||||||
|
if(rejectReason != null) {
|
||||||
|
throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " aborted. Reason : " + rejectReason);
|
||||||
|
}
|
||||||
throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " to " + redirectionUrl.toNormalform(false) + " placed on crawler queue for double-check");
|
throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " to " + redirectionUrl.toNormalform(false) + " placed on crawler queue for double-check");
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// if we are already doing a shutdown we don't need to retry crawling
|
// if we are already doing a shutdown we don't need to retry crawling
|
||||||
|
|
Loading…
Reference in New Issue
Block a user