2005-04-21 12:31:40 +02:00
//plasmaCrawlWorker.java
//------------------------
//part of YaCy
//(C) by Michael Peter Christen; mc@anomic.de
//first published on http://www.anomic.de
2006-09-04 16:38:29 +02:00
//Frankfurt, Germany, 2006
2005-09-07 15:18:34 +02:00
//
2006-09-04 08:09:20 +02:00
// $LastChangedDate: 2006-08-12 16:28:14 +0200 (Sa, 12 Aug 2006) $
// $LastChangedRevision: 2397 $
// $LastChangedBy: theli $
2005-04-21 12:31:40 +02:00
//
//This program is free software; you can redistribute it and/or modify
//it under the terms of the GNU General Public License as published by
//the Free Software Foundation; either version 2 of the License, or
//(at your option) any later version.
//
//This program is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//GNU General Public License for more details.
//
//You should have received a copy of the GNU General Public License
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
//Using this software in any meaning (reading, learning, copying, compiling,
//running) means that you agree that the Author(s) is (are) not responsible
//for cost, loss of data or any harm that may be caused directly or indirectly
//by usage of this software or this documentation. The usage of this software
//is on your own risk. The installation and usage (starting/running) of this
//software may allow other people or application to access your computer and
//any attached devices and is highly dependent on the configuration of the
//software which must be done by the user of the software; the author(s) is
//(are) also not responsible for proper configuration and usage of the
//software, even if provoked by documentation provided together with
//the software.
//
//Any changes to this file according to the GPL as documented in the file
//gpl.txt aside this file in the shipment you received can be done to the
//lines that follows this copyright notice here, but changes must not be
//done inside the copyright notice above. A re-distribution must contain
//the intact and unchanged copyright notice.
//Contributions and changes to the program code must be marked as such.
2008-05-06 02:32:41 +02:00
package de.anomic.crawler ;
2005-04-21 12:31:40 +02:00
2006-09-04 08:09:20 +02:00
import java.io.File ;
2005-04-21 12:31:40 +02:00
import java.io.FileOutputStream ;
import java.io.IOException ;
2006-09-11 10:26:39 +02:00
import java.io.OutputStream ;
2005-08-30 22:25:07 +02:00
import java.net.MalformedURLException ;
2005-09-11 22:21:38 +02:00
import java.net.NoRouteToHostException ;
2005-04-21 12:31:40 +02:00
import java.net.SocketException ;
2005-08-28 07:08:26 +02:00
import java.net.UnknownHostException ;
2005-04-21 12:31:40 +02:00
import java.util.Date ;
2006-09-04 08:09:20 +02:00
2008-04-05 15:17:16 +02:00
import de.anomic.http.HttpClient ;
2008-04-10 01:35:20 +02:00
import de.anomic.http.JakartaCommonsHttpClient ;
2008-04-05 15:17:16 +02:00
import de.anomic.http.JakartaCommonsHttpResponse ;
2005-04-21 12:31:40 +02:00
import de.anomic.http.httpHeader ;
2006-09-11 10:26:39 +02:00
import de.anomic.http.httpdBoundedSizeOutputStream ;
import de.anomic.http.httpdLimitExceededException ;
2008-03-26 16:37:49 +01:00
import de.anomic.index.indexReferenceBlacklist ;
2006-09-04 08:09:20 +02:00
import de.anomic.plasma.plasmaHTCache ;
import de.anomic.plasma.plasmaParser ;
import de.anomic.plasma.plasmaSwitchboard ;
2006-09-06 16:31:17 +02:00
import de.anomic.plasma.cache.IResourceInfo ;
import de.anomic.plasma.cache.http.ResourceInfo ;
2006-03-19 01:29:27 +01:00
import de.anomic.server.serverSystem ;
2005-06-09 12:44:55 +02:00
import de.anomic.server.logging.serverLog ;
2007-09-05 11:01:35 +02:00
import de.anomic.yacy.yacyURL ;
2005-04-21 12:31:40 +02:00
2008-05-06 02:32:41 +02:00
public final class HTTPLoader {
2005-11-04 14:41:51 +01:00
2008-05-24 13:04:44 +02:00
private static final String DEFAULT_ENCODING = " gzip,deflate " ;
private static final String DEFAULT_LANGUAGE = " en-us,en;q=0.5 " ;
private static final String DEFAULT_CHARSET = " ISO-8859-1,utf-8;q=0.7,*;q=0.7 " ;
private static final long DEFAULT_MAXFILESIZE = 1024 * 1024 * 10 ;
public static final int DEFAULT_CRAWLING_RETRY_COUNT = 5 ;
2008-04-05 15:17:16 +02:00
private static final String crawlerUserAgent = " yacybot ( " + HttpClient . getSystemOST ( ) + " ) http://yacy.net/bot.html " ;
2006-09-04 11:00:18 +02:00
/ * *
* The socket timeout that should be used
* /
2005-10-22 15:28:04 +02:00
private int socketTimeout ;
2006-09-04 11:00:18 +02:00
2006-09-09 17:06:49 +02:00
/ * *
* The maximum allowed file size
* /
2008-05-24 13:04:44 +02:00
//private long maxFileSize = -1;
2006-09-09 17:06:49 +02:00
2008-05-24 13:04:44 +02:00
//private String acceptEncoding;
//private String acceptLanguage;
//private String acceptCharset;
2007-10-29 02:43:20 +01:00
private plasmaSwitchboard sb ;
private serverLog log ;
2006-09-04 11:00:18 +02:00
2008-05-06 02:32:41 +02:00
public HTTPLoader ( plasmaSwitchboard sb , serverLog theLog ) {
2007-10-29 02:43:20 +01:00
this . sb = sb ;
this . log = theLog ;
2007-06-07 17:26:41 +02:00
// refreshing timeout value
2007-10-29 02:43:20 +01:00
this . socketTimeout = ( int ) sb . getConfigLong ( " crawler.clientTimeout " , 10000 ) ;
2008-05-24 13:04:44 +02:00
}
2006-09-04 11:00:18 +02:00
2008-04-05 15:17:16 +02:00
/ * *
* @param entry
* @param requestDate
* @param requestHeader
* @param responseHeader
* @param responseStatus Status - Code SPACE Reason - Phrase
* @return
* /
2008-05-06 02:32:41 +02:00
protected plasmaHTCache . Entry createCacheEntry ( CrawlEntry entry , Date requestDate , httpHeader requestHeader , httpHeader responseHeader , final String responseStatus ) {
2008-04-05 15:17:16 +02:00
IResourceInfo resourceInfo = new ResourceInfo ( entry . url ( ) , requestHeader , responseHeader ) ;
2007-08-15 23:31:31 +02:00
return plasmaHTCache . newEntry (
2006-09-04 11:00:18 +02:00
requestDate ,
2007-10-29 02:43:20 +01:00
entry . depth ( ) ,
entry . url ( ) ,
entry . name ( ) ,
2008-04-05 15:17:16 +02:00
responseStatus ,
2006-09-06 16:31:17 +02:00
resourceInfo ,
2007-10-29 02:43:20 +01:00
entry . initiator ( ) ,
2008-05-14 23:36:02 +02:00
sb . webIndex . profilesActiveCrawls . getEntry ( entry . profileHandle ( ) )
2005-06-28 10:01:26 +02:00
) ;
2006-09-04 11:00:18 +02:00
}
2007-10-29 02:43:20 +01:00
2008-05-06 02:32:41 +02:00
public plasmaHTCache . Entry load ( CrawlEntry entry , String parserMode ) {
2007-11-22 02:34:29 +01:00
return load ( entry , parserMode , DEFAULT_CRAWLING_RETRY_COUNT ) ;
2007-10-29 02:43:20 +01:00
}
2006-09-04 11:00:18 +02:00
2008-05-06 02:32:41 +02:00
    /**
     * Downloads the resource addressed by the crawl entry via HTTP.
     * Handles blacklist filtering, redirections (recursively, decrementing
     * retryCount), maximum file size enforcement and a long chain of
     * error diagnostics. Every failure path records the URL in the error-URL
     * database with a DENIED_* reason code.
     *
     * @param entry      the crawl queue entry to fetch
     * @param parserMode parser mode constant, used to decide whether the
     *                   response MIME type is supported
     * @param retryCount remaining redirection budget; aborts when negative
     * @return the cache entry of the stored resource, or null on any failure
     */
    private plasmaHTCache.Entry load(CrawlEntry entry, String parserMode, int retryCount) {

        // too many redirections for this URL: give up and record the error
        if (retryCount < 0) {
            this.log.logInfo("Redirection counter exceeded for URL " + entry.url().toString() + ". Processing aborted.");
            sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, ErrorURL.DENIED_REDIRECTION_COUNTER_EXCEEDED).store();
            return null;
        }

        Date requestDate = new Date(); // remember the time...
        String host = entry.url().getHost();
        String path = entry.url().getFile();
        int port = entry.url().getPort();
        boolean ssl = entry.url().getProtocol().equals("https");
        // NOTE(review): port is derived here but never used below (the client
        // is handed the full URL string) — confirm whether it can be removed
        if (port < 0) port = (ssl) ? 443 : 80;

        // check if url is in blacklist
        String hostlow = host.toLowerCase();
        if (plasmaSwitchboard.urlBlacklist.isListed(indexReferenceBlacklist.BLACKLIST_CRAWLER, hostlow, path)) {
            this.log.logInfo("CRAWLER Rejecting URL '" + entry.url().toString() + "'. URL is in blacklist.");
            sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, ErrorURL.DENIED_URL_IN_BLACKLIST).store();
            return null;
        }

        // take a file from the net
        plasmaHTCache.Entry htCache = null;
        try {
            // create a request header with the crawler identity and the
            // configured (or default) Accept-* values
            httpHeader requestHeader = new httpHeader();
            requestHeader.put(httpHeader.USER_AGENT, crawlerUserAgent);
            yacyURL refererURL = null;
            if (entry.referrerhash() != null) refererURL = sb.getURL(entry.referrerhash());
            if (refererURL != null) requestHeader.put(httpHeader.REFERER, refererURL.toNormalform(true, true));
            requestHeader.put(httpHeader.ACCEPT_LANGUAGE, sb.getConfig("crawler.http.acceptLanguage", DEFAULT_LANGUAGE));
            requestHeader.put(httpHeader.ACCEPT_CHARSET, sb.getConfig("crawler.http.acceptCharset", DEFAULT_CHARSET));
            requestHeader.put(httpHeader.ACCEPT_ENCODING, sb.getConfig("crawler.http.acceptEncoding", DEFAULT_ENCODING));

            // HTTP-Client
            JakartaCommonsHttpClient client = new JakartaCommonsHttpClient(socketTimeout, requestHeader, null);

            JakartaCommonsHttpResponse res = null;
            try {
                // send request
                res = client.GET(entry.url().toString());

                if (res.getStatusCode() == 200 || res.getStatusCode() == 203) {
                    // the transfer is ok

                    // create a new cache entry
                    htCache = createCacheEntry(entry, requestDate, requestHeader, res.getResponseHeader(), res.getStatusLine());

                    // aborting download if content is to long ...
                    // (the cache file path would exceed the OS path length limit)
                    if (htCache.cacheFile().getAbsolutePath().length() > serverSystem.maxPathLength) {
                        this.log.logInfo("REJECTED URL " + entry.url().toString() + " because path too long '" + plasmaHTCache.cachePath.getAbsolutePath() + "'");
                        sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, ErrorURL.DENIED_CACHEFILE_PATH_TOO_LONG);
                        return (htCache = null);
                    }

                    // reserve cache entry
                    // defend against path traversal: the cache file must stay
                    // inside the cache directory
                    if (!htCache.cacheFile().getCanonicalPath().startsWith(plasmaHTCache.cachePath.getCanonicalPath())) {
                        // if the response has not the right file type then reject file
                        this.log.logInfo("REJECTED URL " + entry.url().toString() + " because of an invalid file path ('" +
                                htCache.cacheFile().getCanonicalPath() + "' does not start with '" +
                                plasmaHTCache.cachePath.getAbsolutePath() + "').");
                        sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, ErrorURL.DENIED_INVALID_CACHEFILE_PATH);
                        return (htCache = null);
                    }

                    // request has been placed and result has been returned. work off response
                    File cacheFile = plasmaHTCache.getCachePath(entry.url());
                    try {
                        if (plasmaParser.supportedContent(parserMode, entry.url(), res.getResponseHeader().mime())) {
                            // delete old content
                            if (cacheFile.isFile()) {
                                plasmaHTCache.deleteURLfromCache(entry.url());
                            }

                            // create parent directories
                            cacheFile.getParentFile().mkdirs();

                            OutputStream fos = null;
                            try {
                                // creating an output stream
                                fos = new FileOutputStream(cacheFile);

                                // getting content length
                                long contentLength = res.getResponseHeader().contentLength();

                                // check the maximum allowed file size:
                                // unknown length -> wrap the stream so it aborts at the limit;
                                // known length over the limit -> reject immediately
                                if (contentLength == -1) {
                                    fos = new httpdBoundedSizeOutputStream(fos, sb.getConfigLong("crawler.http.maxFileSize", DEFAULT_MAXFILESIZE));
                                } else if (contentLength > sb.getConfigLong("crawler.http.maxFileSize", DEFAULT_MAXFILESIZE)) {
                                    this.log.logInfo("REJECTED URL " + entry.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + sb.getConfigLong("crawler.http.maxFileSize", DEFAULT_MAXFILESIZE) + " bytes.");
                                    sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, ErrorURL.DENIED_FILESIZE_LIMIT_EXCEEDED);
                                    return null;
                                }
                                // we write the new cache entry to file system directly
                                (res).setAccountingName("CRAWLER");
                                byte[] responseBody = res.getData();
                                fos.write(responseBody);
                                // keep the body in memory on the cache entry as well
                                htCache.setCacheArray(responseBody);
                                plasmaHTCache.writeFileAnnouncement(cacheFile);
                            } finally {
                                if (fos != null) try { fos.close(); } catch (Exception e) { /* ignore this */ }
                            }

                            return htCache;
                        } else {
                            // if the response has not the right file type then reject file
                            this.log.logInfo("REJECTED WRONG MIME/EXT TYPE " + res.getResponseHeader().mime() + " for URL " + entry.url().toString());
                            sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, ErrorURL.DENIED_WRONG_MIMETYPE_OR_EXT);
                            return null;
                        }
                    } catch (SocketException e) {
                        // this may happen if the client suddenly closes its connection
                        // maybe the user has stopped loading
                        // in that case, we are not responsible and just forget it
                        // but we clean the cache also, since it may be only partial
                        // and most possible corrupted
                        if (cacheFile.exists()) cacheFile.delete();
                        this.log.logSevere("CRAWLER LOADER ERROR1: with URL=" + entry.url().toString() + ": " + e.toString());
                        sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, ErrorURL.DENIED_CONNECTION_ERROR);
                        htCache = null;
                    }
                } else if (res.getStatusLine().startsWith("30")) {
                    // 3xx: redirection
                    // NOTE(review): a 3xx response WITHOUT a Location header is
                    // silently ignored here (falls through to "return htCache"
                    // with htCache == null) — confirm this is intended
                    if (res.getResponseHeader().containsKey(httpHeader.LOCATION)) {
                        // getting redirection URL
                        String redirectionUrlString = res.getResponseHeader().get(httpHeader.LOCATION);
                        redirectionUrlString = redirectionUrlString.trim();

                        if (redirectionUrlString.length() == 0) {
                            this.log.logWarning("CRAWLER Redirection of URL=" + entry.url().toString() + " aborted. Location header is empty.");
                            sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, ErrorURL.DENIED_REDIRECTION_HEADER_EMPTY);
                            return null;
                        }

                        // normalizing URL (resolves relative Location values
                        // against the current URL)
                        yacyURL redirectionUrl = yacyURL.newURL(entry.url(), redirectionUrlString);

                        // restart crawling with new url
                        this.log.logInfo("CRAWLER Redirection detected ('" + res.getStatusLine() + "') for URL " + entry.url().toString());
                        this.log.logInfo("CRAWLER ..Redirecting request to: " + redirectionUrl);

                        // if we are already doing a shutdown we don't need to retry crawling
                        if (Thread.currentThread().isInterrupted()) {
                            this.log.logSevere("CRAWLER Retry of URL=" + entry.url().toString() + " aborted because of server shutdown.");
                            sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, ErrorURL.DENIED_SERVER_SHUTDOWN);
                            return null;
                        }

                        // generating url hash
                        String urlhash = redirectionUrl.hash();

                        // check if the url was already indexed
                        String dbname = sb.urlExists(urlhash);
                        if (dbname != null) {
                            this.log.logWarning("CRAWLER Redirection of URL=" + entry.url().toString() + " ignored. The url appears already in db " + dbname);
                            sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, ErrorURL.DENIED_REDIRECTION_TO_DOUBLE_CONTENT);
                            return null;
                        }

                        // retry crawling with new url, consuming one unit of
                        // the redirection budget
                        entry.redirectURL(redirectionUrl);
                        return load(entry, plasmaParser.PARSER_MODE_URLREDIRECTOR, retryCount - 1);
                    }
                } else {
                    // if the response has not the right response type then reject file
                    this.log.logInfo("REJECTED WRONG STATUS TYPE '" + res.getStatusLine() + "' for URL " + entry.url().toString());

                    // not processed any further
                    sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, ErrorURL.DENIED_WRONG_HTTP_STATUSCODE + res.getStatusCode() + ")");
                }

            } finally {
                if (res != null) {
                    // release connection
                    res.closeStream();
                }
            }
            return htCache;
        } catch (Exception e) {
            // classify the failure by exception type and message text and map
            // it to a DENIED_* reason code for the error-URL database
            String errorMsg = e.getMessage();
            String failreason = null;

            if ((e instanceof IOException) &&
                (errorMsg != null) &&
                (errorMsg.indexOf("socket closed") >= 0) &&
                (Thread.currentThread().isInterrupted())
            ) {
                this.log.logInfo("CRAWLER Interruption detected because of server shutdown.");
                failreason = ErrorURL.DENIED_SERVER_SHUTDOWN;
            } else if (e instanceof httpdLimitExceededException) {
                // thrown by the httpdBoundedSizeOutputStream wrapper above
                this.log.logWarning("CRAWLER Max file size limit '" + sb.getConfigLong("crawler.http.maxFileSize", DEFAULT_MAXFILESIZE) + "' exceeded while downloading URL " + entry.url());
                failreason = ErrorURL.DENIED_FILESIZE_LIMIT_EXCEEDED;
            } else if (e instanceof MalformedURLException) {
                this.log.logWarning("CRAWLER Malformed URL '" + entry.url().toString() + "' detected. ");
                failreason = ErrorURL.DENIED_MALFORMED_URL;
            } else if (e instanceof NoRouteToHostException) {
                this.log.logWarning("CRAWLER No route to host found while trying to crawl URL '" + entry.url().toString() + "'.");
                failreason = ErrorURL.DENIED_NO_ROUTE_TO_HOST;
            } else if ((e instanceof UnknownHostException) ||
                       ((errorMsg != null) && (errorMsg.indexOf("unknown host") >= 0))) {
                yacyURL u = (entry.referrerhash() == null) ? null : sb.getURL(entry.referrerhash());
                this.log.logWarning("CRAWLER Unknown host in URL '" + entry.url() + "'. " +
                        "Referer URL: " + ((u == null) ? "Unknown" : u.toNormalform(true, true)));
                failreason = ErrorURL.DENIED_UNKNOWN_HOST;
            } else if (e instanceof java.net.BindException) {
                this.log.logWarning("CRAWLER BindException detected while trying to download content from '" + entry.url().toString() +
                        "'. Retrying request.");
                failreason = ErrorURL.DENIED_CONNECTION_BIND_EXCEPTION;
            } else if ((errorMsg != null) && (
                    (errorMsg.indexOf("Corrupt GZIP trailer") >= 0) ||
                    (errorMsg.indexOf("Not in GZIP format") >= 0) ||
                    (errorMsg.indexOf("Unexpected end of ZLIB") >= 0)
            )) {
                this.log.logWarning("CRAWLER Problems detected while receiving gzip encoded content from '" + entry.url().toString() +
                        "'. Retrying request without using gzip content encoding.");
                failreason = ErrorURL.DENIED_CONTENT_DECODING_ERROR;
            } else if ((errorMsg != null) && (errorMsg.indexOf("The host did not accept the connection within timeout of") >= 0)) {
                this.log.logWarning("CRAWLER Timeout while trying to connect to '" + entry.url().toString() +
                        "'. Retrying request.");
                failreason = ErrorURL.DENIED_CONNECTION_TIMEOUT;
            } else if ((errorMsg != null) && (errorMsg.indexOf("Read timed out") >= 0)) {
                this.log.logWarning("CRAWLER Read timeout while receiving content from '" + entry.url().toString() +
                        "'. Retrying request.");
                failreason = ErrorURL.DENIED_CONNECTION_TIMEOUT;
            } else if ((errorMsg != null) && (errorMsg.indexOf("connect timed out") >= 0)) {
                this.log.logWarning("CRAWLER Timeout while trying to connect to '" + entry.url().toString() +
                        "'. Retrying request.");
                failreason = ErrorURL.DENIED_CONNECTION_TIMEOUT;
            } else if ((errorMsg != null) && (errorMsg.indexOf("Connection timed out") >= 0)) {
                this.log.logWarning("CRAWLER Connection timeout while receiving content from '" + entry.url().toString() +
                        "'. Retrying request.");
                failreason = ErrorURL.DENIED_CONNECTION_TIMEOUT;
            } else if ((errorMsg != null) && (errorMsg.indexOf("Connection refused") >= 0)) {
                this.log.logWarning("CRAWLER Connection refused while trying to connect to '" + entry.url().toString() + "'.");
                failreason = ErrorURL.DENIED_CONNECTION_REFUSED;
            } else if ((errorMsg != null) && (errorMsg.indexOf("Circular redirect to '") >= 0)) {
                this.log.logWarning("CRAWLER Redirect Error with URL '" + entry.url().toString() + "': " + e.toString());
                failreason = ErrorURL.DENIED_REDIRECTION_COUNTER_EXCEEDED;
            } else if ((errorMsg != null) && (errorMsg.indexOf("There is not enough space on the disk") >= 0)) {
                this.log.logSevere("CRAWLER Not enough space on the disk detected while crawling '" + entry.url().toString() + "'. " +
                        "Pausing crawlers. ");
                // stop both crawl jobs until an operator frees disk space
                sb.pauseCrawlJob(plasmaSwitchboard.CRAWLJOB_LOCAL_CRAWL);
                sb.pauseCrawlJob(plasmaSwitchboard.CRAWLJOB_REMOTE_TRIGGERED_CRAWL);
                failreason = ErrorURL.DENIED_OUT_OF_DISK_SPACE;
            } else if ((errorMsg != null) && (errorMsg.indexOf("Network is unreachable") >= 0)) {
                this.log.logSevere("CRAWLER Network is unreachable while trying to crawl URL '" + entry.url().toString() + "'. ");
                failreason = ErrorURL.DENIED_NETWORK_IS_UNREACHABLE;
            } else if ((errorMsg != null) && (errorMsg.indexOf("No trusted certificate found") >= 0)) {
                this.log.logSevere("CRAWLER No trusted certificate found for URL '" + entry.url().toString() + "'. ");
                failreason = ErrorURL.DENIED_SSL_UNTRUSTED_CERT;
            } else {
                this.log.logSevere("CRAWLER Unexpected Error with URL '" + entry.url().toString() + "': " + e.toString(), e);
                failreason = ErrorURL.DENIED_CONNECTION_ERROR;
            }

            if (failreason != null) {
                // add url into error db
                sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, failreason);
            }
            return null;
        }
    }
2006-08-07 17:11:14 +02:00
2007-06-07 17:26:41 +02:00
}