Mirror of https://github.com/yacy/yacy_search_server.git (synced 2024-09-19 00:01:41 +02:00)
* don't index pages that redirect twice
* therefore, auto-redirection of HTTPClient is disabled for crawling and the old redirect-handling code is reactivated

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7332 6c8d7289-2bf4-0310-a012-ef5d649a1542
parent eb79b952ef
commit dca9e16f51
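The change below turns off the HTTP client's automatic redirect following and inspects the 30x status code and Location header manually, so a redirecting page is requeued under its target URL instead of having its content indexed twice. As a standalone illustration of that technique, here is a minimal sketch assuming Apache HttpClient 4.x (the library YaCy's HTTPClient wraps); the class name and printed output are illustrative, not YaCy code:

    import java.io.IOException;

    import org.apache.http.HttpResponse;
    import org.apache.http.client.methods.HttpGet;
    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.impl.client.HttpClients;
    import org.apache.http.util.EntityUtils;

    public final class ManualRedirectFetch {

        public static void fetch(final String url) throws IOException {
            // disableRedirectHandling() is the stock HttpClient 4.x switch that
            // plays the role of client.setRedirecting(false) in the patch below
            try (CloseableHttpClient client = HttpClients.custom()
                    .disableRedirectHandling()
                    .build()) {
                final HttpResponse response = client.execute(new HttpGet(url));
                final int code = response.getStatusLine().getStatusCode();
                if (code > 299 && code < 310) {
                    // redirection: the body may be empty, so look at the
                    // Location header first (same branch order as the patch)
                    if (response.getFirstHeader("Location") != null) {
                        final String target = response.getFirstHeader("Location").getValue();
                        System.out.println(url + " redirects to " + target
                                + "; requeue it instead of indexing the content twice");
                    } else {
                        throw new IOException("redirect without Location header for " + url);
                    }
                } else if (code == 200 || code == 203) {
                    // the transfer is ok; read the body for indexing
                    final byte[] body = EntityUtils.toByteArray(response.getEntity());
                    System.out.println("fetched " + body.length + " bytes from " + url);
                } else {
                    throw new IOException("unexpected status " + code + " for " + url);
                }
            }
        }

        public static void main(final String[] args) throws IOException {
            fetch("http://example.com/");
        }
    }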
@@ -109,45 +109,16 @@ public final class HTTPLoader {
         // HTTP-Client
         final HTTPClient client = new HTTPClient();
+        client.setRedirecting(false); // we want to handle redirection ourselves, so we don't index pages twice
         client.setTimout(socketTimeout);
         client.setHeader(requestHeader.entrySet());
         // send request
         final byte[] responseBody = client.GETbytes(request.url().toString(), maxFileSize);
         final ResponseHeader header = new ResponseHeader(client.getHttpResponse().getAllHeaders());
         final int code = client.getHttpResponse().getStatusLine().getStatusCode();
-        // FIXME: 30*-handling (bottom) is never reached
-        // we always get the final content because httpClient.followRedirects = true
-
-        if (responseBody == null) {
-            // no response, reject file
-            sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "no response body (you may increase the maxmimum file size)");
-            throw new IOException("REJECTED EMPTY RESPONSE BODY '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString());
-        } else if (code == 200 || code == 203) {
-            // the transfer is ok
-
-            // we write the new cache entry to file system directly
-            long contentLength = responseBody.length;
-            ByteCount.addAccountCount(ByteCount.CRAWLER, contentLength);
-
-            // check length again in case it was not possible to get the length before loading
-            if (maxFileSize > 0 && contentLength > maxFileSize) {
-                sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "file size limit exceeded");
-                throw new IOException("REJECTED URL " + request.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes. (GET)");
-            }
-
-            // create a new cache entry
-            final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes());
-            response = new Response(
-                    request,
-                    requestHeader,
-                    header,
-                    Integer.toString(code),
-                    mp == null ? null : new CrawlProfile(mp),
-                    responseBody
-            );
-
-            return response;
-        } else if (code > 299 && code < 310) {
+        if (code > 299 && code < 310) {
             // redirection (content may be empty)
             if (header.containsKey(HeaderFramework.LOCATION)) {
                 // getting redirection URL
                 String redirectionUrlString = header.get(HeaderFramework.LOCATION);
@@ -181,13 +152,45 @@ public final class HTTPLoader {
                 // retry crawling with new url
                 request.redirectURL(redirectionUrl);
                 return load(request, retryCount - 1, maxFileSize);
             } else {
                 // no redirection url provided
                 sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "no redirection url provided");
                 throw new IOException("REJECTED EMTPY REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString());
             }
-        } else {
+        } else if (responseBody == null) {
+            // no response, reject file
+            sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "no response body (you may increase the maxmimum file size)");
+            throw new IOException("REJECTED EMPTY RESPONSE BODY '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString());
+        } else if (code == 200 || code == 203) {
+            // the transfer is ok
+
+            // we write the new cache entry to file system directly
+            long contentLength = responseBody.length;
+            ByteCount.addAccountCount(ByteCount.CRAWLER, contentLength);
+
+            // check length again in case it was not possible to get the length before loading
+            if (maxFileSize > 0 && contentLength > maxFileSize) {
+                sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "file size limit exceeded");
+                throw new IOException("REJECTED URL " + request.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes. (GET)");
+            }
+
+            // create a new cache entry
+            final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes());
+            response = new Response(
+                    request,
+                    requestHeader,
+                    header,
+                    Integer.toString(code),
+                    mp == null ? null : new CrawlProfile(mp),
+                    responseBody
+            );
+
+            return response;
+        } else {
             // if the response has not the right response type then reject file
             sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "wrong http status code " + code);
             throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString());
         }
         return response;
     }

     public static Response load(final Request request) throws IOException {
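Note the retryCount - 1 in the recursive load(...) call above: with auto-redirection off, each manually followed redirect consumes one retry, which bounds redirect chains and breaks redirect loops. A hypothetical sketch of that guard in isolation (Fetched and fetchOnce are stand-in names, not YaCy's API):

    import java.io.IOException;

    final class BoundedRedirects {

        // Minimal result holder for one non-redirecting HTTP round trip
        // (hypothetical helper, not part of YaCy).
        static final class Fetched {
            final int statusCode;
            final String location; // value of the Location header, or null
            final byte[] body;
            Fetched(final int statusCode, final String location, final byte[] body) {
                this.statusCode = statusCode;
                this.location = location;
                this.body = body;
            }
        }

        // Stand-in for a single request with auto-redirect disabled,
        // e.g. the fetch() sketch shown earlier.
        static Fetched fetchOnce(final String url) throws IOException {
            throw new UnsupportedOperationException("plug in a real HTTP client here");
        }

        // Each manual redirect consumes one retry, so a redirect loop
        // terminates with an error instead of recursing forever -- the same
        // role retryCount - 1 plays in the patched load(...).
        static byte[] loadWithRedirects(final String url, final int retriesLeft) throws IOException {
            if (retriesLeft < 0) {
                throw new IOException("redirect limit exceeded for " + url);
            }
            final Fetched f = fetchOnce(url);
            if (f.statusCode > 299 && f.statusCode < 310) {
                if (f.location == null) {
                    throw new IOException("redirect without target for " + url);
                }
                return loadWithRedirects(f.location, retriesLeft - 1);
            }
            return f.body;
        }
    }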