//RobotsTxt.java
//-------------------------------------
//part of YaCy
//(C) by Michael Peter Christen; mc@anomic.de
//first published on http://www.anomic.de
//Frankfurt, Germany, 2004
//
//This file is contributed by Martin Thelian
//[MC] moved some methods from the robotsParser file, created by Alexander Schier, into this class
//last major change: $LastChangedDate$ by $LastChangedBy$
//Revision: $LastChangedRevision$
//
//This program is free software; you can redistribute it and/or modify
//it under the terms of the GNU General Public License as published by
//the Free Software Foundation; either version 2 of the License, or
//(at your option) any later version.
//
//This program is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//GNU General Public License for more details.
//
//You should have received a copy of the GNU General Public License
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
//Using this software in any meaning (reading, learning, copying, compiling,
//running) means that you agree that the Author(s) is (are) not responsible
//for cost, loss of data or any harm that may be caused directly or indirectly
//by usage of this software or this documentation. The usage of this software
//is at your own risk. The installation and usage (starting/running) of this
//software may allow other people or applications to access your computer and
//any attached devices and is highly dependent on the configuration of the
//software which must be done by the user of the software; the author(s) is
//(are) also not responsible for proper configuration and usage of the
//software, even if provoked by documentation provided together with
//the software.
//
//Any changes to this file according to the GPL as documented in the file
//gpl.txt aside this file in the shipment you received can be done to the
//lines that follow this copyright notice here, but changes must not be
//done inside the copyright notice above. A re-distribution must contain
//the intact and unchanged copyright notice.
//Contributions and changes to the program code must be marked as such.
package de.anomic.crawler;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;

import de.anomic.http.HttpClient;
import de.anomic.http.JakartaCommonsHttpClient;
import de.anomic.http.JakartaCommonsHttpResponse;
import de.anomic.http.httpHeader;
import de.anomic.kelondro.kelondroBLOB;
import de.anomic.kelondro.kelondroBLOBHeap;
import de.anomic.kelondro.kelondroBLOBTree;
import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroMapDataMining;
import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.server.serverByteBuffer;
import de.anomic.server.serverFileUtils;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacyURL;

public class RobotsTxt {
    
    public static final String ROBOTS_DB_PATH_SEPARATOR = ";";
    
    kelondroMapDataMining robotsTable;
    private final File robotsTableFile;
    
    public RobotsTxt(File robotsTableFile) {
        this.robotsTableFile = robotsTableFile;
        robotsTableFile.getParentFile().mkdirs();
        kelondroBLOB blob = null;
        if (robotsTableFile.getName().endsWith(".heap")) {
            try {
                blob = new kelondroBLOBHeap(robotsTableFile, 64, kelondroNaturalOrder.naturalOrder);
            } catch (IOException e) {
                e.printStackTrace();
            }
        } else {
            blob = new kelondroBLOBTree(robotsTableFile, true, true, 256, 512, '_', kelondroNaturalOrder.naturalOrder, false, false, true);
        }
        robotsTable = new kelondroMapDataMining(blob, 100);
    }
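    
    // Hypothetical usage sketch (file name and host:port key are examples
    // only): a "*.heap" file selects the kelondroBLOBHeap backend, any other
    // name falls back to the kelondroBLOBTree backend.
    //
    //   RobotsTxt robots = new RobotsTxt(new File("DATA/PLASMADB/crawlRobotsTxt.heap"));
    //   int delay = robots.crawlDelay("www.example.net:80");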
    
    private void resetDatabase() {
        // deletes the robots.txt database and creates a new one
        if (robotsTable != null) robotsTable.close();
        if (!(robotsTableFile.delete())) throw new RuntimeException("cannot delete robots.txt database");
        robotsTableFile.getParentFile().mkdirs();
        robotsTable = new kelondroMapDataMining(new kelondroBLOBTree(robotsTableFile, true, true, 256, 512, '_', kelondroNaturalOrder.naturalOrder, false, false, true), 100);
    }
    
    public void clear() throws IOException {
        this.robotsTable.clear();
    }
    
    public void close() {
        this.robotsTable.close();
    }
    
    public int size() {
        return this.robotsTable.size();
    }
    
    private Entry getEntry(String hostName) {
        try {
            HashMap<String, String> record = this.robotsTable.get(hostName);
            if (record == null) return null;
            return new Entry(hostName, record);
        } catch (kelondroException e) {
            resetDatabase();
            return null;
        } catch (IOException e) {
            resetDatabase();
            return null;
        }
    }
    
    public int crawlDelay(String hostname) {
        RobotsTxt.Entry robotsEntry = getEntry(hostname);
        Integer hostDelay = (robotsEntry == null) ? null : robotsEntry.getCrawlDelay();
        return (hostDelay == null) ? 0 : hostDelay.intValue();
    }
    
    private Entry addEntry(
            String hostName,
            ArrayList<String> disallowPathList,
            Date loadedDate,
            Date modDate,
            String eTag,
            String sitemap,
            Integer crawlDelay) {
        Entry entry = new Entry(
                hostName, disallowPathList, loadedDate, modDate,
                eTag, sitemap, crawlDelay);
        addEntry(entry);
        return entry;
    }
    
    private String addEntry(Entry entry) {
        // writes a new page and returns the key
        try {
            this.robotsTable.put(entry.hostName, entry.mem);
            return entry.hostName;
        } catch (IOException e) {
            return null;
        }
    }
    
    public class Entry {
        public static final String DISALLOW_PATH_LIST = "disallow";
        public static final String LOADED_DATE = "date";
        public static final String MOD_DATE = "modDate";
        public static final String ETAG = "etag";
        public static final String SITEMAP = "sitemap";
        public static final String CRAWL_DELAY = "crawlDelay";
        
        // this is a simple record structure that holds all properties of a single robots.txt entry
        HashMap<String, String> mem;
        private LinkedList<String> disallowPathList;
        String hostName;
        
        public Entry(String hostName, HashMap<String, String> mem) {
            this.hostName = hostName.toLowerCase();
            this.mem = mem;
            this.disallowPathList = new LinkedList<String>();
            if (this.mem.containsKey(DISALLOW_PATH_LIST)) {
                String csPl = this.mem.get(DISALLOW_PATH_LIST);
                if (csPl.length() > 0) {
                    String[] pathArray = csPl.split(ROBOTS_DB_PATH_SEPARATOR);
                    if ((pathArray != null) && (pathArray.length > 0)) {
                        this.disallowPathList.addAll(Arrays.asList(pathArray));
                    }
                }
            }
        }
        
        public Entry(
                String hostName,
                ArrayList<String> disallowPathList,
                Date loadedDate,
                Date modDate,
                String eTag,
                String sitemap,
                Integer crawlDelay) {
            if ((hostName == null) || (hostName.length() == 0)) throw new IllegalArgumentException("The hostname is missing");
            
            this.hostName = hostName.trim().toLowerCase();
            this.disallowPathList = new LinkedList<String>();
            
            this.mem = new HashMap<String, String>(5);
            if (loadedDate != null) this.mem.put(LOADED_DATE, Long.toString(loadedDate.getTime()));
            if (modDate != null) this.mem.put(MOD_DATE, Long.toString(modDate.getTime()));
            if (eTag != null) this.mem.put(ETAG, eTag);
            if (sitemap != null) this.mem.put(SITEMAP, sitemap);
            if (crawlDelay != null) this.mem.put(CRAWL_DELAY, crawlDelay.toString());
            
            if ((disallowPathList != null) && (disallowPathList.size() > 0)) {
                this.disallowPathList.addAll(disallowPathList);
                StringBuffer pathListStr = new StringBuffer();
                for (int i = 0; i < disallowPathList.size(); i++) {
                    pathListStr.append(disallowPathList.get(i))
                               .append(ROBOTS_DB_PATH_SEPARATOR);
                }
                this.mem.put(DISALLOW_PATH_LIST, pathListStr.substring(0, pathListStr.length() - 1));
            }
        }
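        
        // Illustration (values are examples): a disallowPathList of
        // ["/cgi-bin/", "/tmp/"] is serialized into mem as
        //   "disallow" -> "/cgi-bin/;/tmp/"
        // and split at ';' again by the Entry(String, HashMap) constructor above.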
        
        public String toString() {
            StringBuffer str = new StringBuffer();
            str.append((this.hostName == null) ? "null" : this.hostName)
               .append(": ");
            if (this.mem != null) {
                str.append(this.mem.toString());
            }
            return str.toString();
        }
        
        public String getSitemap() {
            return this.mem.containsKey(SITEMAP) ? this.mem.get(SITEMAP) : null;
        }
        
        public Date getLoadedDate() {
            if (this.mem.containsKey(LOADED_DATE)) {
                return new Date(Long.valueOf(this.mem.get(LOADED_DATE)).longValue());
            }
            return null;
        }
        
        public void setLoadedDate(Date newLoadedDate) {
            if (newLoadedDate != null) {
                this.mem.put(LOADED_DATE, Long.toString(newLoadedDate.getTime()));
            }
        }
        
        public Date getModDate() {
            if (this.mem.containsKey(MOD_DATE)) {
                return new Date(Long.valueOf(this.mem.get(MOD_DATE)).longValue());
            }
            return null;
        }
        
        public String getETag() {
            if (this.mem.containsKey(ETAG)) {
                return this.mem.get(ETAG);
            }
            return null;
        }
        
        public Integer getCrawlDelay() {
            if (this.mem.containsKey(CRAWL_DELAY)) {
                return Integer.valueOf(this.mem.get(CRAWL_DELAY));
            }
            return null;
        }
        
        public boolean isDisallowed(String path) {
            if ((this.mem == null) || (this.disallowPathList.size() == 0)) return false;
            
            // if the path is null or empty we set it to /
            if ((path == null) || (path.length() == 0)) path = "/";
            // escape all occurrences of ';' because this char is used as the separator in the robots DB
            else path = path.replaceAll(ROBOTS_DB_PATH_SEPARATOR, "%3B");
            
            Iterator<String> pathIter = this.disallowPathList.iterator();
            while (pathIter.hasNext()) {
                String nextPath = pathIter.next();
                // allow rule
                if (nextPath.startsWith("!") && nextPath.length() > 1 && path.startsWith(nextPath.substring(1))) {
                    return false;
                }
                // disallow rule
                if (path.startsWith(nextPath)) {
                    return true;
                }
            }
            return false;
        }
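        
        // Illustration of the prefix matching above (values are examples):
        // an entry starting with '!' is an allow rule that overrides a
        // disallow entry appearing later in the list.
        //   disallowPathList = ["!/public/", "/"]
        //   isDisallowed("/public/index.html") -> false  (allow rule matches)
        //   isDisallowed("/private/secret")    -> true   ("/" prefix matches)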
    }
    
    // methods that had been in robotsParser.java:
    
    // indices into the Object[] returned by downloadRobotsTxt()
    public static final int DOWNLOAD_ACCESS_RESTRICTED = 0;
    public static final int DOWNLOAD_ROBOTS_TXT = 1;
    public static final int DOWNLOAD_ETAG = 2;
    public static final int DOWNLOAD_MODDATE = 3;
    
    private static final String getHostPort(yacyURL theURL) {
        String urlHostPort = null;
        int port = getPort(theURL);
        urlHostPort = theURL.getHost() + ":" + port;
        urlHostPort = urlHostPort.toLowerCase().intern();
        return urlHostPort;
    }
    
    private static final int getPort(yacyURL theURL) {
        int port = theURL.getPort();
        if (port == -1) {
            if (theURL.getProtocol().equalsIgnoreCase("http")) {
                port = 80;
            } else if (theURL.getProtocol().equalsIgnoreCase("https")) {
                port = 443;
            }
        }
        return port;
    }
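    
    // For illustration (URLs are examples): getPort() substitutes the protocol
    // default when the URL carries no explicit port, so
    //   "http://www.example.net/robots.txt"  -> 80
    //   "https://www.example.net/robots.txt" -> 443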
    
    public yacyURL getSitemapURL(yacyURL theURL) {
        if (theURL == null) throw new IllegalArgumentException();
        yacyURL sitemapURL = null;
        
        // generating the hostname:port string needed for the DB lookup
        String urlHostPort = getHostPort(theURL);
        
        RobotsTxt.Entry robotsTxt4Host = this.getEntry(urlHostPort);
        if (robotsTxt4Host == null) return null;
        
        try {
            String sitemapUrlStr = robotsTxt4Host.getSitemap();
            if (sitemapUrlStr != null) sitemapURL = new yacyURL(sitemapUrlStr, null);
        } catch (MalformedURLException e) { /* ignore this */ }
        
        return sitemapURL;
    }
    
    public Integer getCrawlDelay(yacyURL theURL) {
        if (theURL == null) throw new IllegalArgumentException();
        Integer crawlDelay = null;
        
        // generating the hostname:port string needed for the DB lookup
        String urlHostPort = getHostPort(theURL);
        
        RobotsTxt.Entry robotsTxt4Host = getEntry(urlHostPort);
        if (robotsTxt4Host == null) return null;
        
        try {
            crawlDelay = robotsTxt4Host.getCrawlDelay();
        } catch (NumberFormatException e) { /* ignore this */ }
        
        return crawlDelay;
    }
    
    //private static final HashSet<String> loadedRobots = new HashSet<String>(); // only for debugging
    
    @SuppressWarnings("unchecked")
    public boolean isDisallowed(yacyURL nexturl) {
        if (nexturl == null) throw new IllegalArgumentException();
        
        // generating the hostname:port string needed for the DB lookup
        String urlHostPort = getHostPort(nexturl);
        
        RobotsTxt.Entry robotsTxt4Host = null;
        synchronized (this) {
            // do a DB lookup to determine if the robots data is already available
            robotsTxt4Host = getEntry(urlHostPort);
            
            // if we have not found any data or the data is older than 7 days, we need to load it from the remote server
            if ((robotsTxt4Host == null) ||
                (robotsTxt4Host.getLoadedDate() == null) ||
                (System.currentTimeMillis() - robotsTxt4Host.getLoadedDate().getTime() > 7 * 24 * 60 * 60 * 1000)) {
                
                // generating the proper url to download the robots.txt
                yacyURL robotsURL = null;
                try {
                    robotsURL = new yacyURL(nexturl.getProtocol(), nexturl.getHost(), getPort(nexturl), "/robots.txt");
                } catch (MalformedURLException e) {
                    serverLog.logSevere("ROBOTS", "Unable to generate robots.txt URL for URL '" + nexturl.toString() + "'.");
                    return false;
                }
                
                Object[] result = null;
                serverLog.logFine("ROBOTS", "Trying to download the robots.txt file from URL '" + robotsURL + "'.");
                try {
                    result = downloadRobotsTxt(robotsURL, 5, robotsTxt4Host);
                } catch (Exception e) {
                    result = null;
                }
                
                /*
                assert !loadedRobots.contains(robotsURL.toNormalform(false, false)) :
                    "robots-url=" + robotsURL.toString() +
                    ", robots=" + ((result == null || result[DOWNLOAD_ROBOTS_TXT] == null) ? "NULL" : new String((byte[]) result[DOWNLOAD_ROBOTS_TXT])) +
                    ", robotsTxt4Host=" + ((robotsTxt4Host == null) ? "NULL" : robotsTxt4Host.getLoadedDate().toString());
                loadedRobots.add(robotsURL.toNormalform(false, false));
                */
                
                if (result == null) {
                    // no robots.txt available, make an entry to prevent that the robots loading is done twice
                    if (robotsTxt4Host == null) {
                        // generate artificial entry
                        robotsTxt4Host = new Entry(
                                urlHostPort,
                                new ArrayList<String>(),
                                new Date(),
                                new Date(),
                                null,
                                null,
                                Integer.valueOf(0));
                    } else {
                        robotsTxt4Host.setLoadedDate(new Date());
                    }
                    
                    // store the data into the robots DB
                    addEntry(robotsTxt4Host);
                } else {
                    Object[] parserResult = robotsParser.parse((byte[]) result[DOWNLOAD_ROBOTS_TXT]);
                    ArrayList<String> denyPath = (ArrayList<String>) parserResult[0];
                    if (((Boolean) result[DOWNLOAD_ACCESS_RESTRICTED]).booleanValue()) {
                        denyPath = new ArrayList<String>();
                        denyPath.add("/");
                    }
                    
                    // store the data into the robots DB
                    robotsTxt4Host = addEntry(
                            urlHostPort,
                            denyPath,
                            new Date(),
                            (Date) result[DOWNLOAD_MODDATE],
                            (String) result[DOWNLOAD_ETAG],
                            (String) parserResult[1],
                            (Integer) parserResult[2]);
                }
            }
        }
        
        return robotsTxt4Host.isDisallowed(nexturl.getFile());
    }
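    
    // Caller's view (illustrative; 'robots' is a RobotsTxt instance and the
    // URL is an example): the first check for a host triggers the robots.txt
    // download above; further checks within the next 7 days are answered from
    // the robots DB without network access.
    //
    //   yacyURL url = new yacyURL("http://www.example.net/private/page.html", null);
    //   if (!robots.isDisallowed(url)) { /* safe to fetch the page */ }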
    
    private static Object[] downloadRobotsTxt(yacyURL robotsURL, int redirectionCount, RobotsTxt.Entry entry) throws Exception {
        if (redirectionCount < 0) return new Object[]{Boolean.FALSE, null, null, null};
        redirectionCount--;
        
        boolean accessCompletelyRestricted = false;
        byte[] robotsTxt = null;
        long downloadStart, downloadEnd;
        String eTag = null, oldEtag = null;
        Date lastMod = null;
        downloadStart = System.currentTimeMillis();
        
        // assemble the request headers
        httpHeader reqHeaders = new httpHeader();
        
        // add yacybot user agent
        reqHeaders.put(httpHeader.USER_AGENT, HTTPLoader.crawlerUserAgent);
        
        // adding referer
        reqHeaders.put(httpHeader.REFERER, (yacyURL.newURL(robotsURL, "/")).toNormalform(true, true));
        
        // if we have previously downloaded this robots.txt we can set the if-modified-since header
        if (entry != null) {
            oldEtag = entry.getETag();
            Date modDate = entry.getModDate();
            if (modDate != null) reqHeaders.put(httpHeader.IF_MODIFIED_SINCE, HttpClient.dateString(modDate));
        }
        
        // setup http-client
        // TODO: adding traffic statistics for the robots download?
        JakartaCommonsHttpClient client = new JakartaCommonsHttpClient(10000, reqHeaders, null);
        JakartaCommonsHttpResponse res = null;
        try {
            // sending the get request
            res = client.GET(robotsURL.toString());
            
            // check for interruption
            if (Thread.currentThread().isInterrupted()) throw new InterruptedException("Shutdown in progress.");
            
            // check the response status
            if (res.getStatusLine().startsWith("2")) {
                if (!res.getResponseHeader().mime().startsWith("text/plain")) {
                    robotsTxt = null;
                    serverLog.logFinest("ROBOTS", "Robots.txt from URL '" + robotsURL + "' has wrong mimetype '" + res.getResponseHeader().mime() + "'.");
                } else {
                    // getting some metadata
                    eTag = res.getResponseHeader().containsKey(httpHeader.ETAG) ? (res.getResponseHeader().get(httpHeader.ETAG)).trim() : null;
                    lastMod = res.getResponseHeader().lastModified();
                    
                    // if the robots.txt file was not changed we break here
                    if ((eTag != null) && (oldEtag != null) && (eTag.equals(oldEtag))) {
                        serverLog.logFinest("ROBOTS", "Robots.txt from URL '" + robotsURL + "' was not modified. Abort downloading of new version.");
                        return null;
                    }
                    
                    // downloading the content
                    serverByteBuffer sbb = new serverByteBuffer();
                    try {
                        serverFileUtils.copyToStream(new BufferedInputStream(res.getDataAsStream()), new BufferedOutputStream(sbb));
                    } finally {
                        res.closeStream();
                    }
                    robotsTxt = sbb.getBytes();
                    
                    downloadEnd = System.currentTimeMillis();
                    serverLog.logFinest("ROBOTS", "Robots.txt successfully loaded from URL '" + robotsURL + "' in " + (downloadEnd - downloadStart) + " ms.");
                }
            } else if (res.getStatusCode() == 304) {
                return null;
            } else if (res.getStatusLine().startsWith("3")) {
                // getting redirection URL
                String redirectionUrlString = res.getResponseHeader().get(httpHeader.LOCATION);
                if (redirectionUrlString == null) {
                    serverLog.logFinest("ROBOTS", "robots.txt could not be downloaded from URL '" + robotsURL + "' because of missing redirection header. [" + res.getStatusLine() + "].");
                    robotsTxt = null;
                } else {
                    redirectionUrlString = redirectionUrlString.trim();
                    
                    // generating the new URL object
                    yacyURL redirectionUrl = yacyURL.newURL(robotsURL, redirectionUrlString);
                    
                    // following the redirection
                    serverLog.logFinest("ROBOTS", "Redirection detected for robots.txt with URL '" + robotsURL + "'." +
                            "\nRedirecting request to: " + redirectionUrl);
                    return downloadRobotsTxt(redirectionUrl, redirectionCount, entry);
                }
            } else if (res.getStatusCode() == 401 || res.getStatusCode() == 403) {
                accessCompletelyRestricted = true;
                serverLog.logFinest("ROBOTS", "Access to Robots.txt not allowed on URL '" + robotsURL + "'.");
            } else {
                serverLog.logFinest("ROBOTS", "robots.txt could not be downloaded from URL '" + robotsURL + "'. [" + res.getStatusLine() + "].");
                robotsTxt = null;
            }
        } finally {
            if (res != null) {
                // release connection
                res.closeStream();
            }
        }
        return new Object[]{Boolean.valueOf(accessCompletelyRestricted), robotsTxt, eTag, lastMod};
    }
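    
    // The Object[] returned by downloadRobotsTxt() is indexed by the
    // DOWNLOAD_* constants declared above:
    //   result[DOWNLOAD_ACCESS_RESTRICTED] : Boolean
    //   result[DOWNLOAD_ROBOTS_TXT]        : byte[]  (may be null)
    //   result[DOWNLOAD_ETAG]              : String  (may be null)
    //   result[DOWNLOAD_MODDATE]           : Date    (may be null)
    // A null return value signals "robots.txt not modified" (HTTP 304 or
    // unchanged ETag).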
}