// HTTPLoader.java
// ---------------
// (C) by Michael Peter Christen; mc@yacy.net
// first published on http://yacy.net
// Frankfurt, Germany, 2006
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

package de.anomic.crawler.retrieval ;
2005-04-21 12:31:40 +02:00
import java.io.IOException ;
import java.util.Date ;
2010-08-31 17:47:47 +02:00
import java.util.Map ;
2006-09-04 08:09:20 +02:00
2010-05-25 14:54:57 +02:00
import net.yacy.cora.document.MultiProtocolURI ;
2010-08-23 14:32:02 +02:00
import net.yacy.cora.protocol.HeaderFramework ;
import net.yacy.cora.protocol.RequestHeader ;
import net.yacy.cora.protocol.ResponseHeader ;
2010-08-23 00:32:39 +02:00
import net.yacy.cora.protocol.http.HTTPClient ;
2009-10-11 02:12:19 +02:00
import net.yacy.kelondro.data.meta.DigestURI ;
2010-09-11 17:58:15 +02:00
import net.yacy.kelondro.io.ByteCount ;
2009-10-10 01:13:30 +02:00
import net.yacy.kelondro.logging.Log ;
2009-10-21 22:14:30 +02:00
import net.yacy.repository.Blacklist ;
2009-10-10 01:13:30 +02:00
2010-08-31 17:47:47 +02:00
import de.anomic.crawler.CrawlProfile ;
2009-07-15 23:07:46 +02:00
import de.anomic.crawler.Latency ;
2009-10-11 02:12:19 +02:00
import de.anomic.search.Segments ;
2009-07-19 22:37:44 +02:00
import de.anomic.search.Switchboard ;
2005-04-21 12:31:40 +02:00
2008-05-06 02:32:41 +02:00
public final class HTTPLoader {
2005-11-04 14:41:51 +01:00
2008-05-24 13:04:44 +02:00
private static final String DEFAULT_ENCODING = " gzip,deflate " ;
private static final String DEFAULT_LANGUAGE = " en-us,en;q=0.5 " ;
private static final String DEFAULT_CHARSET = " ISO-8859-1,utf-8;q=0.7,*;q=0.7 " ;
2010-05-14 20:30:11 +02:00
public static final long DEFAULT_MAXFILESIZE = 1024 * 1024 * 10 ;
2008-05-24 13:04:44 +02:00
public static final int DEFAULT_CRAWLING_RETRY_COUNT = 5 ;
2006-09-04 11:00:18 +02:00
/ * *
* The socket timeout that should be used
* /
2008-08-02 14:12:04 +02:00
private final int socketTimeout ;
2009-07-19 22:37:44 +02:00
private final Switchboard sb ;
2009-01-31 00:33:47 +01:00
private final Log log ;
2006-09-04 11:00:18 +02:00
2009-07-19 22:37:44 +02:00
public HTTPLoader ( final Switchboard sb , final Log theLog ) {
2007-10-29 02:43:20 +01:00
this . sb = sb ;
this . log = theLog ;
2007-06-07 17:26:41 +02:00
// refreshing timeout value
2007-10-29 02:43:20 +01:00
this . socketTimeout = ( int ) sb . getConfigLong ( " crawler.clientTimeout " , 10000 ) ;
2009-07-19 23:59:29 +02:00
}
2007-10-29 02:43:20 +01:00
2010-06-17 13:59:40 +02:00
public Response load ( final Request entry , long maxFileSize ) throws IOException {
2009-03-20 11:21:23 +01:00
long start = System . currentTimeMillis ( ) ;
2010-06-17 13:59:40 +02:00
Response doc = load ( entry , DEFAULT_CRAWLING_RETRY_COUNT , maxFileSize ) ;
2010-05-26 02:01:16 +02:00
Latency . update ( entry . url ( ) , System . currentTimeMillis ( ) - start ) ;
2009-03-20 11:21:23 +01:00
return doc ;
2007-10-29 02:43:20 +01:00
}
2006-09-04 11:00:18 +02:00
2010-06-17 13:59:40 +02:00
private Response load ( final Request request , final int retryCount , final long maxFileSize ) throws IOException {
2005-11-04 14:41:51 +01:00
2007-10-29 02:43:20 +01:00
if ( retryCount < 0 ) {
2010-04-08 02:11:32 +02:00
sb . crawlQueues . errorURL . push ( request , sb . peers . mySeed ( ) . hash . getBytes ( ) , new Date ( ) , 1 , " redirection counter exceeded " ) ;
2009-07-19 23:59:29 +02:00
throw new IOException ( " Redirection counter exceeded for URL " + request . url ( ) . toString ( ) + " . Processing aborted. " ) ;
2007-10-29 02:43:20 +01:00
}
2009-07-19 23:59:29 +02:00
final String host = request . url ( ) . getHost ( ) ;
2009-09-22 16:39:06 +02:00
if ( host = = null | | host . length ( ) < 2 ) throw new IOException ( " host is not well-formed: ' " + host + " ' " ) ;
2009-07-19 23:59:29 +02:00
final String path = request . url ( ) . getFile ( ) ;
int port = request . url ( ) . getPort ( ) ;
final boolean ssl = request . url ( ) . getProtocol ( ) . equals ( " https " ) ;
2005-06-02 03:33:10 +02:00
if ( port < 0 ) port = ( ssl ) ? 443 : 80 ;
2006-08-07 17:11:14 +02:00
2005-09-02 14:09:45 +02:00
// check if url is in blacklist
2008-08-02 14:12:04 +02:00
final String hostlow = host . toLowerCase ( ) ;
2009-07-19 22:37:44 +02:00
if ( Switchboard . urlBlacklist . isListed ( Blacklist . BLACKLIST_CRAWLER , hostlow , path ) ) {
2010-04-08 02:11:32 +02:00
sb . crawlQueues . errorURL . push ( request , sb . peers . mySeed ( ) . hash . getBytes ( ) , new Date ( ) , 1 , " url in blacklist " ) ;
2009-07-19 23:59:29 +02:00
throw new IOException ( " CRAWLER Rejecting URL ' " + request . url ( ) . toString ( ) + " '. URL is in blacklist. " ) ;
2005-11-04 14:41:51 +01:00
}
2007-10-29 02:43:20 +01:00
2005-06-02 03:33:10 +02:00
// take a file from the net
2009-07-19 23:59:29 +02:00
Response response = null ;
2010-05-14 20:30:11 +02:00
2009-07-19 23:59:29 +02:00
// create a request header
final RequestHeader requestHeader = new RequestHeader ( ) ;
2010-09-27 16:54:32 +02:00
requestHeader . put ( HeaderFramework . USER_AGENT , MultiProtocolURI . yacybotUserAgent ) ;
2009-10-11 02:12:19 +02:00
DigestURI refererURL = null ;
2009-10-09 16:44:20 +02:00
if ( request . referrerhash ( ) ! = null ) refererURL = sb . getURL ( Segments . Process . LOCALCRAWLING , request . referrerhash ( ) ) ;
2009-07-19 23:59:29 +02:00
if ( refererURL ! = null ) requestHeader . put ( RequestHeader . REFERER , refererURL . toNormalform ( true , true ) ) ;
requestHeader . put ( HeaderFramework . ACCEPT_LANGUAGE , sb . getConfig ( " crawler.http.acceptLanguage " , DEFAULT_LANGUAGE ) ) ;
requestHeader . put ( HeaderFramework . ACCEPT_CHARSET , sb . getConfig ( " crawler.http.acceptCharset " , DEFAULT_CHARSET ) ) ;
requestHeader . put ( HeaderFramework . ACCEPT_ENCODING , sb . getConfig ( " crawler.http.acceptEncoding " , DEFAULT_ENCODING ) ) ;
2005-11-04 14:41:51 +01:00
2009-07-19 23:59:29 +02:00
// HTTP-Client
2010-08-23 00:32:39 +02:00
final HTTPClient client = new HTTPClient ( ) ;
2010-11-21 23:46:12 +01:00
client . setRedirecting ( false ) ; // we want to handle redirection ourselves, so we don't index pages twice
2010-07-27 03:16:26 +02:00
client . setTimout ( socketTimeout ) ;
client . setHeader ( requestHeader . entrySet ( ) ) ;
2009-07-19 23:59:29 +02:00
// send request
2010-07-27 03:16:26 +02:00
final byte [ ] responseBody = client . GETbytes ( request . url ( ) . toString ( ) , maxFileSize ) ;
2010-08-10 23:22:30 +02:00
final ResponseHeader header = new ResponseHeader ( client . getHttpResponse ( ) . getAllHeaders ( ) ) ;
2010-07-27 03:16:26 +02:00
final int code = client . getHttpResponse ( ) . getStatusLine ( ) . getStatusCode ( ) ;
2009-07-13 21:55:13 +02:00
2010-11-21 23:46:12 +01:00
if ( code > 299 & & code < 310 ) {
// redirection (content may be empty)
2010-07-27 03:16:26 +02:00
if ( header . containsKey ( HeaderFramework . LOCATION ) ) {
2009-07-19 23:59:29 +02:00
// getting redirection URL
2010-07-27 03:16:26 +02:00
String redirectionUrlString = header . get ( HeaderFramework . LOCATION ) ;
2009-07-19 23:59:29 +02:00
redirectionUrlString = redirectionUrlString . trim ( ) ;
if ( redirectionUrlString . length ( ) = = 0 ) {
2010-04-08 02:11:32 +02:00
sb . crawlQueues . errorURL . push ( request , sb . peers . mySeed ( ) . hash . getBytes ( ) , new Date ( ) , 1 , " redirection header empy " ) ;
2009-07-19 23:59:29 +02:00
throw new IOException ( " CRAWLER Redirection of URL= " + request . url ( ) . toString ( ) + " aborted. Location header is empty. " ) ;
2009-07-13 21:55:13 +02:00
}
2009-07-19 23:59:29 +02:00
// normalizing URL
2010-05-25 14:54:57 +02:00
final DigestURI redirectionUrl = new DigestURI ( MultiProtocolURI . newURL ( request . url ( ) , redirectionUrlString ) ) ;
2009-07-13 21:55:13 +02:00
2009-07-19 23:59:29 +02:00
// restart crawling with new url
2010-07-27 03:16:26 +02:00
this . log . logInfo ( " CRAWLER Redirection detected (' " + client . getHttpResponse ( ) . getStatusLine ( ) + " ') for URL " + request . url ( ) . toString ( ) ) ;
2009-07-19 23:59:29 +02:00
this . log . logInfo ( " CRAWLER ..Redirecting request to: " + redirectionUrl ) ;
2009-07-13 21:55:13 +02:00
2009-07-19 23:59:29 +02:00
// if we are already doing a shutdown we don't need to retry crawling
if ( Thread . currentThread ( ) . isInterrupted ( ) ) {
2010-04-08 02:11:32 +02:00
sb . crawlQueues . errorURL . push ( request , sb . peers . mySeed ( ) . hash . getBytes ( ) , new Date ( ) , 1 , " server shutdown " ) ;
2009-07-19 23:59:29 +02:00
throw new IOException ( " CRAWLER Retry of URL= " + request . url ( ) . toString ( ) + " aborted because of server shutdown. " ) ;
}
// check if the url was already indexed
2010-04-08 02:11:32 +02:00
final String dbname = sb . urlExists ( Segments . Process . LOCALCRAWLING , redirectionUrl . hash ( ) ) ;
2009-07-19 23:59:29 +02:00
if ( dbname ! = null ) {
2010-04-08 02:11:32 +02:00
sb . crawlQueues . errorURL . push ( request , sb . peers . mySeed ( ) . hash . getBytes ( ) , new Date ( ) , 1 , " redirection to double content " ) ;
2009-07-19 23:59:29 +02:00
throw new IOException ( " CRAWLER Redirection of URL= " + request . url ( ) . toString ( ) + " ignored. The url appears already in db " + dbname ) ;
}
// retry crawling with new url
request . redirectURL ( redirectionUrl ) ;
2010-06-17 13:59:40 +02:00
return load ( request , retryCount - 1 , maxFileSize ) ;
2010-11-21 23:46:12 +01:00
} else {
// no redirection url provided
sb . crawlQueues . errorURL . push ( request , sb . peers . mySeed ( ) . hash . getBytes ( ) , new Date ( ) , 1 , " no redirection url provided " ) ;
throw new IOException ( " REJECTED EMTPY REDIRECTION ' " + client . getHttpResponse ( ) . getStatusLine ( ) + " ' for URL " + request . url ( ) . toString ( ) ) ;
2008-04-05 15:17:16 +02:00
}
2010-11-21 23:46:12 +01:00
} else if ( responseBody = = null ) {
// no response, reject file
sb . crawlQueues . errorURL . push ( request , sb . peers . mySeed ( ) . hash . getBytes ( ) , new Date ( ) , 1 , " no response body (you may increase the maxmimum file size) " ) ;
throw new IOException ( " REJECTED EMPTY RESPONSE BODY ' " + client . getHttpResponse ( ) . getStatusLine ( ) + " ' for URL " + request . url ( ) . toString ( ) ) ;
} else if ( code = = 200 | | code = = 203 ) {
// the transfer is ok
// we write the new cache entry to file system directly
long contentLength = responseBody . length ;
ByteCount . addAccountCount ( ByteCount . CRAWLER , contentLength ) ;
// check length again in case it was not possible to get the length before loading
if ( maxFileSize > 0 & & contentLength > maxFileSize ) {
sb . crawlQueues . errorURL . push ( request , sb . peers . mySeed ( ) . hash . getBytes ( ) , new Date ( ) , 1 , " file size limit exceeded " ) ;
throw new IOException ( " REJECTED URL " + request . url ( ) + " because file size ' " + contentLength + " ' exceeds max filesize limit of " + maxFileSize + " bytes. (GET) " ) ;
}
// create a new cache entry
final Map < String , String > mp = sb . crawler . profilesActiveCrawls . get ( request . profileHandle ( ) . getBytes ( ) ) ;
response = new Response (
request ,
requestHeader ,
header ,
Integer . toString ( code ) ,
mp = = null ? null : new CrawlProfile ( mp ) ,
responseBody
) ;
return response ;
} else {
2009-07-19 23:59:29 +02:00
// if the response has not the right response type then reject file
2010-10-01 01:57:58 +02:00
sb . crawlQueues . errorURL . push ( request , sb . peers . mySeed ( ) . hash . getBytes ( ) , new Date ( ) , 1 , " wrong http status code " + code ) ;
2010-07-27 03:16:26 +02:00
throw new IOException ( " REJECTED WRONG STATUS TYPE ' " + client . getHttpResponse ( ) . getStatusLine ( ) + " ' for URL " + request . url ( ) . toString ( ) ) ;
2009-04-13 23:21:47 +02:00
}
2005-06-02 03:33:10 +02:00
}
2006-08-07 17:11:14 +02:00
2009-10-01 00:11:00 +02:00
public static Response load ( final Request request ) throws IOException {
return load ( request , 3 ) ;
}
private static Response load ( final Request request , int retryCount ) throws IOException {
if ( retryCount < 0 ) {
throw new IOException ( " Redirection counter exceeded for URL " + request . url ( ) . toString ( ) + " . Processing aborted. " ) ;
}
final String host = request . url ( ) . getHost ( ) ;
if ( host = = null | | host . length ( ) < 2 ) throw new IOException ( " host is not well-formed: ' " + host + " ' " ) ;
final String path = request . url ( ) . getFile ( ) ;
int port = request . url ( ) . getPort ( ) ;
final boolean ssl = request . url ( ) . getProtocol ( ) . equals ( " https " ) ;
if ( port < 0 ) port = ( ssl ) ? 443 : 80 ;
// check if url is in blacklist
final String hostlow = host . toLowerCase ( ) ;
if ( Switchboard . urlBlacklist ! = null & & Switchboard . urlBlacklist . isListed ( Blacklist . BLACKLIST_CRAWLER , hostlow , path ) ) {
throw new IOException ( " CRAWLER Rejecting URL ' " + request . url ( ) . toString ( ) + " '. URL is in blacklist. " ) ;
}
// take a file from the net
Response response = null ;
// create a request header
final RequestHeader requestHeader = new RequestHeader ( ) ;
2010-09-27 16:54:32 +02:00
requestHeader . put ( HeaderFramework . USER_AGENT , MultiProtocolURI . yacybotUserAgent ) ;
2009-10-01 00:11:00 +02:00
requestHeader . put ( HeaderFramework . ACCEPT_LANGUAGE , DEFAULT_LANGUAGE ) ;
requestHeader . put ( HeaderFramework . ACCEPT_CHARSET , DEFAULT_CHARSET ) ;
requestHeader . put ( HeaderFramework . ACCEPT_ENCODING , DEFAULT_ENCODING ) ;
2010-08-23 00:32:39 +02:00
final HTTPClient client = new HTTPClient ( ) ;
2010-07-27 03:16:26 +02:00
client . setTimout ( 20000 ) ;
client . setHeader ( requestHeader . entrySet ( ) ) ;
final byte [ ] responseBody = client . GETbytes ( request . url ( ) . toString ( ) , Long . MAX_VALUE ) ;
2010-08-10 23:22:30 +02:00
final ResponseHeader header = new ResponseHeader ( client . getHttpResponse ( ) . getAllHeaders ( ) ) ;
2010-07-27 03:16:26 +02:00
final int code = client . getHttpResponse ( ) . getStatusLine ( ) . getStatusCode ( ) ;
2009-10-01 00:11:00 +02:00
// FIXME: 30*-handling (bottom) is never reached
// we always get the final content because httpClient.followRedirects = true
2010-07-27 03:16:26 +02:00
if ( responseBody ! = null & & ( code = = 200 | | code = = 203 ) ) {
2009-10-01 00:11:00 +02:00
// the transfer is ok
2010-09-11 17:58:15 +02:00
//statistics:
ByteCount . addAccountCount ( ByteCount . CRAWLER , responseBody . length ) ;
2009-10-01 00:11:00 +02:00
// we write the new cache entry to file system directly
// create a new cache entry
response = new Response (
request ,
requestHeader ,
2010-07-27 03:16:26 +02:00
header ,
Integer . toString ( code ) ,
2009-10-01 00:11:00 +02:00
null ,
responseBody
) ;
return response ;
2010-07-27 03:16:26 +02:00
} else if ( code > 299 & & code < 310 ) {
if ( header . containsKey ( HeaderFramework . LOCATION ) ) {
2009-10-01 00:11:00 +02:00
// getting redirection URL
2010-07-27 03:16:26 +02:00
String redirectionUrlString = header . get ( HeaderFramework . LOCATION ) ;
2009-10-01 00:11:00 +02:00
redirectionUrlString = redirectionUrlString . trim ( ) ;
if ( redirectionUrlString . length ( ) = = 0 ) {
throw new IOException ( " CRAWLER Redirection of URL= " + request . url ( ) . toString ( ) + " aborted. Location header is empty. " ) ;
}
// normalizing URL
2010-05-25 14:54:57 +02:00
final DigestURI redirectionUrl = new DigestURI ( MultiProtocolURI . newURL ( request . url ( ) , redirectionUrlString ) ) ;
2009-10-01 00:11:00 +02:00
// if we are already doing a shutdown we don't need to retry crawling
if ( Thread . currentThread ( ) . isInterrupted ( ) ) {
throw new IOException ( " CRAWLER Retry of URL= " + request . url ( ) . toString ( ) + " aborted because of server shutdown. " ) ;
}
// retry crawling with new url
request . redirectURL ( redirectionUrl ) ;
return load ( request , retryCount - 1 ) ;
}
} else {
// if the response has not the right response type then reject file
2010-07-27 03:16:26 +02:00
throw new IOException ( " REJECTED WRONG STATUS TYPE ' " + client . getHttpResponse ( ) . getStatusLine ( ) + " ' for URL " + request . url ( ) . toString ( ) ) ;
2009-10-01 00:11:00 +02:00
}
return response ;
}
2007-06-07 17:26:41 +02:00
}