mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-21 00:00:13 +02:00
87087f12fe
- removed concurrency overhead for small numbers of index normalizations, as they occur during remote search
- removed the 'load only parseable' constraint for snippet fetching, because some resources have no URL file extension and were therefore treated as not parseable and not searchable, even though they may become parseable after loading, once their MIME type is known
- this partly fixes some problems reported in http://forum.yacy-websuche.de/viewtopic.php?p=20300#p20300 but more changes are necessary to get all expected search results

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6926 6c8d7289-2bf4-0310-a012-ef5d649a1542
300 lines
14 KiB
Java
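The second point of the commit message — that a resource without a URL file extension can only be judged parseable after loading, once its MIME type is known — is worth illustrating. The following is a minimal, self-contained sketch of that decision; the helper names and the extension/MIME sets are illustrative assumptions, not the actual YaCy parser API.

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

// Hypothetical sketch, not YaCy API: why extension-based filtering misses
// resources that a MIME-based check after loading would accept.
public class ParseabilityCheck {

    private static final Set<String> EXTENSIONS =
            new HashSet<String>(Arrays.asList("html", "htm", "txt", "pdf"));
    private static final Set<String> MIMES =
            new HashSet<String>(Arrays.asList("text/html", "text/plain", "application/pdf"));

    // pre-fetch test: fails for extension-less paths such as /page
    static boolean parseableByExtension(final String path) {
        final int dot = path.lastIndexOf('.');
        final int slash = path.lastIndexOf('/');
        return dot > slash && EXTENSIONS.contains(path.substring(dot + 1).toLowerCase());
    }

    // post-fetch test: the Content-Type header is only available after the
    // resource has been loaded, so this can accept what the test above rejected
    static boolean parseableByMime(final String contentType) {
        if (contentType == null) return false;
        final int semicolon = contentType.indexOf(';');
        final String mime = (semicolon < 0 ? contentType : contentType.substring(0, semicolon)).trim();
        return MIMES.contains(mime.toLowerCase());
    }

    public static void main(final String[] args) {
        System.out.println(parseableByExtension("/page"));               // false
        System.out.println(parseableByMime("text/html; charset=UTF-8")); // true
    }
}

Dropping the pre-fetch constraint therefore lets the snippet fetcher load such resources first and decide parseability from the response headers.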
// HTTPLoader.java
// ---------------
// (C) by Michael Peter Christen; mc@yacy.net
// first published on http://yacy.net
// Frankfurt, Germany, 2006
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.crawler.retrieval;

import java.io.IOException;
import java.util.Date;

import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.repository.Blacklist;

import de.anomic.crawler.Latency;
import de.anomic.http.client.Client;
import de.anomic.http.server.HeaderFramework;
import de.anomic.http.server.RequestHeader;
import de.anomic.http.server.ResponseContainer;
import de.anomic.search.Segments;
import de.anomic.search.Switchboard;
public final class HTTPLoader {

    private static final String DEFAULT_ENCODING = "gzip,deflate";
    private static final String DEFAULT_LANGUAGE = "en-us,en;q=0.5";
    private static final String DEFAULT_CHARSET = "ISO-8859-1,utf-8;q=0.7,*;q=0.7";
    public static final long DEFAULT_MAXFILESIZE = 1024 * 1024 * 10;
    public static final int DEFAULT_CRAWLING_RETRY_COUNT = 5;
    public static final String crawlerUserAgent = "yacybot (" + Client.getSystemOST() + ") http://yacy.net/bot.html";
    public static final String yacyUserAgent = "yacy (" + Client.getSystemOST() + ") yacy.net";

    /**
     * The socket timeout that should be used
     */
    private final int socketTimeout;

    /**
     * The maximum allowed file size
     */
    //private long maxFileSize = -1;

    //private String acceptEncoding;
    //private String acceptLanguage;
    //private String acceptCharset;
    private final Switchboard sb;
    private final Log log;

    public HTTPLoader(final Switchboard sb, final Log theLog) {
        this.sb = sb;
        this.log = theLog;

        // refreshing timeout value
        this.socketTimeout = (int) sb.getConfigLong("crawler.clientTimeout", 10000);
    }
    public Response load(final Request entry, long maxFileSize) throws IOException {
        long start = System.currentTimeMillis();
        Response doc = load(entry, DEFAULT_CRAWLING_RETRY_COUNT, maxFileSize);
        Latency.update(entry.url(), System.currentTimeMillis() - start);
        return doc;
    }
    private Response load(final Request request, final int retryCount, final long maxFileSize) throws IOException {

        if (retryCount < 0) {
            sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "redirection counter exceeded");
            throw new IOException("Redirection counter exceeded for URL " + request.url().toString() + ". Processing aborted.");
        }

        final String host = request.url().getHost();
        if (host == null || host.length() < 2) throw new IOException("host is not well-formed: '" + host + "'");
        final String path = request.url().getFile();
        int port = request.url().getPort();
        final boolean ssl = request.url().getProtocol().equals("https");
        if (port < 0) port = (ssl) ? 443 : 80;

        // check if url is in blacklist
        final String hostlow = host.toLowerCase();
        if (Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, hostlow, path)) {
            sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "url in blacklist");
            throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.");
        }

        // take a file from the net
        Response response = null;

        // create a request header
        final RequestHeader requestHeader = new RequestHeader();
        requestHeader.put(HeaderFramework.USER_AGENT, crawlerUserAgent);
        DigestURI refererURL = null;
        if (request.referrerhash() != null) refererURL = sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash());
        if (refererURL != null) requestHeader.put(RequestHeader.REFERER, refererURL.toNormalform(true, true));
        requestHeader.put(HeaderFramework.ACCEPT_LANGUAGE, sb.getConfig("crawler.http.acceptLanguage", DEFAULT_LANGUAGE));
        requestHeader.put(HeaderFramework.ACCEPT_CHARSET, sb.getConfig("crawler.http.acceptCharset", DEFAULT_CHARSET));
        requestHeader.put(HeaderFramework.ACCEPT_ENCODING, sb.getConfig("crawler.http.acceptEncoding", DEFAULT_ENCODING));

        // HTTP-Client
        final Client client = new Client(socketTimeout, requestHeader);
        ResponseContainer res = null;
        try {
            // send request
            res = client.GET(request.url().toString(), maxFileSize);
            // FIXME: 30*-handling (bottom) is never reached
            // we always get the final content because httpClient.followRedirects = true

            if (res.getStatusCode() == 200 || res.getStatusCode() == 203) {
                // the transfer is ok

                // we write the new cache entry to file system directly
                res.setAccountingName("CRAWLER");
                final byte[] responseBody = res.getData();
                long contentLength = responseBody.length;

                // check length again in case it was not possible to get the length before loading
                if (maxFileSize > 0 && contentLength > maxFileSize) {
                    sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "file size limit exceeded");
                    throw new IOException("REJECTED URL " + request.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes. (GET)");
                }

                // create a new cache entry
                response = new Response(
                        request,
                        requestHeader,
                        res.getResponseHeader(),
                        res.getStatusLine(),
                        sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()),
                        responseBody
                );

                return response;
            } else if (res.getStatusLine().startsWith("30")) {
                if (res.getResponseHeader().containsKey(HeaderFramework.LOCATION)) {
                    // getting redirection URL
                    String redirectionUrlString = res.getResponseHeader().get(HeaderFramework.LOCATION);
                    redirectionUrlString = redirectionUrlString.trim();

                    if (redirectionUrlString.length() == 0) {
                        sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "redirection header empty");
                        throw new IOException("CRAWLER Redirection of URL=" + request.url().toString() + " aborted. Location header is empty.");
                    }
                    // normalizing URL
                    final DigestURI redirectionUrl = new DigestURI(MultiProtocolURI.newURL(request.url(), redirectionUrlString));

                    // restart crawling with new url
                    this.log.logInfo("CRAWLER Redirection detected ('" + res.getStatusLine() + "') for URL " + request.url().toString());
                    this.log.logInfo("CRAWLER ..Redirecting request to: " + redirectionUrl);

                    // if we are already doing a shutdown we don't need to retry crawling
                    if (Thread.currentThread().isInterrupted()) {
                        sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "server shutdown");
                        throw new IOException("CRAWLER Retry of URL=" + request.url().toString() + " aborted because of server shutdown.");
                    }

                    // check if the url was already indexed
                    final String dbname = sb.urlExists(Segments.Process.LOCALCRAWLING, redirectionUrl.hash());
                    if (dbname != null) {
                        sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "redirection to double content");
                        throw new IOException("CRAWLER Redirection of URL=" + request.url().toString() + " ignored. The url appears already in db " + dbname);
                    }

                    // retry crawling with new url
                    request.redirectURL(redirectionUrl);
                    return load(request, retryCount - 1, maxFileSize);
                }
            } else {
                // if the response does not have the right status code then reject the file
                sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "wrong http status code (" + res.getStatusCode() + ")");
                throw new IOException("REJECTED WRONG STATUS TYPE '" + res.getStatusLine() + "' for URL " + request.url().toString());
            }
        } finally {
            if (res != null) {
                // release connection
                res.closeStream();
            }
        }
        return response;
    }
    public static Response load(final Request request) throws IOException {
        return load(request, 3);
    }
    private static Response load(final Request request, int retryCount) throws IOException {

        if (retryCount < 0) {
            throw new IOException("Redirection counter exceeded for URL " + request.url().toString() + ". Processing aborted.");
        }

        final String host = request.url().getHost();
        if (host == null || host.length() < 2) throw new IOException("host is not well-formed: '" + host + "'");
        final String path = request.url().getFile();
        int port = request.url().getPort();
        final boolean ssl = request.url().getProtocol().equals("https");
        if (port < 0) port = (ssl) ? 443 : 80;

        // check if url is in blacklist
        final String hostlow = host.toLowerCase();
        if (Switchboard.urlBlacklist != null && Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, hostlow, path)) {
            throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.");
        }

        // take a file from the net
        Response response = null;

        // create a request header
        final RequestHeader requestHeader = new RequestHeader();
        requestHeader.put(HeaderFramework.USER_AGENT, crawlerUserAgent);
        requestHeader.put(HeaderFramework.ACCEPT_LANGUAGE, DEFAULT_LANGUAGE);
        requestHeader.put(HeaderFramework.ACCEPT_CHARSET, DEFAULT_CHARSET);
        requestHeader.put(HeaderFramework.ACCEPT_ENCODING, DEFAULT_ENCODING);

        // HTTP-Client
        final Client client = new Client(20000, requestHeader);

        ResponseContainer res = null;
        try {
            // send request
            res = client.GET(request.url().toString(), Long.MAX_VALUE);
            // FIXME: 30*-handling (bottom) is never reached
            // we always get the final content because httpClient.followRedirects = true

            if (res.getStatusCode() == 200 || res.getStatusCode() == 203) {
                // the transfer is ok

                // we write the new cache entry to file system directly
                res.setAccountingName("CRAWLER");
                final byte[] responseBody = res.getData();

                // create a new cache entry
                response = new Response(
                        request,
                        requestHeader,
                        res.getResponseHeader(),
                        res.getStatusLine(),
                        null,
                        responseBody
                );

                return response;
            } else if (res.getStatusLine().startsWith("30")) {
                if (res.getResponseHeader().containsKey(HeaderFramework.LOCATION)) {
                    // getting redirection URL
                    String redirectionUrlString = res.getResponseHeader().get(HeaderFramework.LOCATION);
                    redirectionUrlString = redirectionUrlString.trim();

                    if (redirectionUrlString.length() == 0) {
                        throw new IOException("CRAWLER Redirection of URL=" + request.url().toString() + " aborted. Location header is empty.");
                    }

                    // normalizing URL
                    final DigestURI redirectionUrl = new DigestURI(MultiProtocolURI.newURL(request.url(), redirectionUrlString));

                    // if we are already doing a shutdown we don't need to retry crawling
                    if (Thread.currentThread().isInterrupted()) {
                        throw new IOException("CRAWLER Retry of URL=" + request.url().toString() + " aborted because of server shutdown.");
                    }

                    // retry crawling with new url
                    request.redirectURL(redirectionUrl);
                    return load(request, retryCount - 1);
                }
            } else {
                // if the response does not have the right status code then reject the file
                throw new IOException("REJECTED WRONG STATUS TYPE '" + res.getStatusLine() + "' for URL " + request.url().toString());
            }
        } finally {
            if (res != null) {
                // release connection
                res.closeStream();
            }
        }
        return response;
    }

}
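For orientation, here is a hedged sketch of how a caller might drive this class. It assumes the same package (de.anomic.crawler.retrieval) so the imports above apply; the Request object is taken as a parameter because its construction by the crawl queue is not shown in this file, and the helper name fetchForCrawler is hypothetical.

// Hypothetical caller, not part of the original file.
static Response fetchForCrawler(final Switchboard sb, final Log log, final Request request) throws IOException {
    final HTTPLoader loader = new HTTPLoader(sb, log); // reads crawler.clientTimeout from the config
    // follows up to DEFAULT_CRAWLING_RETRY_COUNT redirects, records latency,
    // and rejects bodies larger than the given byte limit (here the 10 MB default)
    return loader.load(request, HTTPLoader.DEFAULT_MAXFILESIZE);
}

By contrast, the static load(Request) variant is the lightweight path: it uses a fixed 20-second timeout, no file size limit (Long.MAX_VALUE), a fixed retry budget of 3, and skips both the errorURL accounting and the duplicate-content check.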