//RobotsTxt.java
//-------------------------------------
//part of YACY
//(C) by Michael Peter Christen; mc@yacy.net
//first published on http://www.anomic.de
//Frankfurt, Germany, 2004
//
//This file is contributed by Martin Thelian
//[MC] moved some methods from robotsParser file that had been created by Alexander Schier to this class
//last major change: $LastChangedDate$ by $LastChangedBy$
//Revision: $LastChangedRevision$
//
//This program is free software; you can redistribute it and/or modify
//it under the terms of the GNU General Public License as published by
//the Free Software Foundation; either version 2 of the License, or
//(at your option) any later version.
//
//This program is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//GNU General Public License for more details.
//
//You should have received a copy of the GNU General Public License
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
2008-05-06 02:32:41 +02:00
package de.anomic.crawler ;
2005-09-07 13:17:21 +02:00
import java.io.IOException ;
2008-07-05 02:35:20 +02:00
import java.net.MalformedURLException ;
2005-09-07 13:17:21 +02:00
import java.util.ArrayList ;
import java.util.Date ;
2008-10-10 10:39:11 +02:00
import java.util.Map ;
2011-04-04 01:39:45 +02:00
import java.util.Set ;
2008-07-11 14:03:18 +02:00
import java.util.concurrent.ConcurrentHashMap ;
2010-09-05 15:19:42 +02:00
import java.util.regex.Pattern ;
2006-09-30 00:27:20 +02:00
2010-05-25 14:54:57 +02:00
import net.yacy.cora.document.MultiProtocolURI ;
2011-11-16 02:03:49 +01:00
import net.yacy.cora.document.UTF8 ;
2011-04-26 15:35:29 +02:00
import net.yacy.cora.protocol.ClientIdentification ;
2010-08-23 14:32:02 +02:00
import net.yacy.cora.protocol.HeaderFramework ;
import net.yacy.cora.protocol.RequestHeader ;
import net.yacy.cora.protocol.ResponseHeader ;
2010-08-23 00:32:39 +02:00
import net.yacy.cora.protocol.http.HTTPClient ;
2010-03-04 12:58:07 +01:00
import net.yacy.kelondro.blob.BEncodedHeap ;
2010-06-15 21:44:05 +02:00
import net.yacy.kelondro.index.RowSpaceExceededException ;
2010-09-11 17:58:15 +02:00
import net.yacy.kelondro.io.ByteCount ;
2009-10-10 01:13:30 +02:00
2011-11-14 16:11:57 +01:00
import org.apache.log4j.Logger ;
2011-11-16 16:33:55 +01:00
import de.anomic.crawler.retrieval.HTTPLoader ;
2011-11-16 02:03:49 +01:00
import de.anomic.data.WorkTables ;
2008-05-06 02:32:41 +02:00
public class RobotsTxt {
2011-11-14 16:11:57 +01:00
2010-08-23 14:32:02 +02:00
private static Logger log = Logger . getLogger ( RobotsTxt . class ) ;
2011-05-02 16:05:51 +02:00
protected static final String ROBOTS_DB_PATH_SEPARATOR = " ; " ;
protected static final Pattern ROBOTS_DB_PATH_SEPARATOR_MATCHER = Pattern . compile ( ROBOTS_DB_PATH_SEPARATOR ) ;
2011-11-14 16:11:57 +01:00
2009-02-17 10:12:47 +01:00
private final ConcurrentHashMap < String , DomSync > syncObjects ;
2008-07-11 11:12:54 +02:00
//private static final HashSet<String> loadedRobots = new HashSet<String>(); // only for debugging
2011-11-16 02:03:49 +01:00
private final WorkTables tables ;
2011-11-14 16:11:57 +01:00
2009-02-17 10:12:47 +01:00
private static class DomSync {
2011-05-02 16:05:51 +02:00
private DomSync ( ) { }
2009-02-17 10:12:47 +01:00
}
2011-11-14 16:11:57 +01:00
2011-11-16 02:03:49 +01:00
public RobotsTxt ( final WorkTables worktables ) {
2011-11-14 16:11:57 +01:00
this . syncObjects = new ConcurrentHashMap < String , DomSync > ( ) ;
2011-11-16 02:03:49 +01:00
this . tables = worktables ;
try {
log . info ( " initiated robots table: " + this . tables . getHeap ( WorkTables . TABLE_ROBOTS_NAME ) . getFile ( ) ) ;
} catch ( final IOException e ) {
try {
this . tables . getHeap ( WorkTables . TABLE_ROBOTS_NAME ) . clear ( ) ;
} catch ( final IOException e1 ) {
}
}
2006-05-09 12:03:12 +02:00
}
2011-11-14 16:11:57 +01:00
2011-11-16 02:03:49 +01:00
public void clear ( ) throws IOException {
2010-08-23 14:32:02 +02:00
log . info ( " clearing robots table " ) ;
2011-11-16 02:03:49 +01:00
this . tables . getHeap ( WorkTables . TABLE_ROBOTS_NAME ) . clear ( ) ;
2011-11-14 16:11:57 +01:00
this . syncObjects . clear ( ) ;
2005-09-07 13:17:21 +02:00
}
2011-11-14 16:11:57 +01:00
2011-11-16 02:03:49 +01:00
public int size ( ) throws IOException {
return this . tables . getHeap ( WorkTables . TABLE_ROBOTS_NAME ) . size ( ) ;
2005-12-05 15:24:13 +01:00
}
2011-11-14 16:11:57 +01:00
2011-05-02 16:05:51 +02:00
public RobotsTxtEntry getEntry ( final MultiProtocolURI theURL , final Set < String > thisAgents ) throws IOException {
2011-04-04 01:39:45 +02:00
if ( theURL = = null ) throw new IllegalArgumentException ( ) ;
if ( ! theURL . getProtocol ( ) . startsWith ( " http " ) ) return null ;
return getEntry ( theURL , thisAgents , true ) ;
}
2011-11-14 16:11:57 +01:00
2011-05-02 16:05:51 +02:00
private RobotsTxtEntry getEntry ( final MultiProtocolURI theURL , final Set < String > thisAgents , final boolean fetchOnlineIfNotAvailableOrNotFresh ) throws IOException {
2011-04-04 01:39:45 +02:00
// this method will always return a non-null value
2011-11-14 16:11:57 +01:00
final String urlHostPort = getHostPort ( theURL ) ;
2011-05-02 16:05:51 +02:00
RobotsTxtEntry robotsTxt4Host = null ;
2010-06-15 21:44:05 +02:00
Map < String , byte [ ] > record ;
2011-11-16 02:03:49 +01:00
final BEncodedHeap robotsTable = this . tables . getHeap ( WorkTables . TABLE_ROBOTS_NAME ) ;
2010-06-15 21:44:05 +02:00
try {
2011-11-16 02:03:49 +01:00
record = robotsTable . get ( robotsTable . encodedKey ( urlHostPort ) ) ;
2011-11-14 16:11:57 +01:00
} catch ( final RowSpaceExceededException e ) {
2010-08-23 14:32:02 +02:00
log . warn ( " memory exhausted " , e ) ;
2010-06-15 21:44:05 +02:00
record = null ;
}
2011-05-02 16:05:51 +02:00
if ( record ! = null ) robotsTxt4Host = new RobotsTxtEntry ( urlHostPort , record ) ;
2011-11-14 16:11:57 +01:00
2008-07-11 11:12:54 +02:00
if ( fetchOnlineIfNotAvailableOrNotFresh & & (
2011-11-14 16:11:57 +01:00
robotsTxt4Host = = null | |
2008-07-11 11:12:54 +02:00
robotsTxt4Host . getLoadedDate ( ) = = null | |
System . currentTimeMillis ( ) - robotsTxt4Host . getLoadedDate ( ) . getTime ( ) > 7 * 24 * 60 * 60 * 1000
2008-07-11 14:03:18 +02:00
) ) {
2011-11-14 16:11:57 +01:00
2008-07-11 14:03:18 +02:00
// make or get a synchronization object
2009-02-17 10:12:47 +01:00
DomSync syncObj = this . syncObjects . get ( urlHostPort ) ;
2008-07-11 14:03:18 +02:00
if ( syncObj = = null ) {
2009-02-17 10:12:47 +01:00
syncObj = new DomSync ( ) ;
2008-07-11 14:03:18 +02:00
this . syncObjects . put ( urlHostPort , syncObj ) ;
2008-07-11 11:12:54 +02:00
}
2011-11-14 16:11:57 +01:00
2008-07-11 14:03:18 +02:00
// we can now synchronize for each host separately
synchronized ( syncObj ) {
2011-11-14 16:11:57 +01:00
2008-07-11 14:03:18 +02:00
// if we have not found any data or the data is older than 7 days, we need to load it from the remote server
2011-11-14 16:11:57 +01:00
2008-07-11 14:03:18 +02:00
// check the robots table again for all threads that come here because they waited for another one
// to complete a download
2010-06-15 21:44:05 +02:00
try {
2011-11-16 02:03:49 +01:00
record = robotsTable . get ( robotsTable . encodedKey ( urlHostPort ) ) ;
2011-11-14 16:11:57 +01:00
} catch ( final RowSpaceExceededException e ) {
2010-08-23 14:32:02 +02:00
log . warn ( " memory exhausted " , e ) ;
2010-06-15 21:44:05 +02:00
record = null ;
}
2011-05-02 16:05:51 +02:00
if ( record ! = null ) robotsTxt4Host = new RobotsTxtEntry ( urlHostPort , record ) ;
2008-07-11 14:03:18 +02:00
if ( robotsTxt4Host ! = null & &
robotsTxt4Host . getLoadedDate ( ) ! = null & &
2009-04-02 17:29:36 +02:00
System . currentTimeMillis ( ) - robotsTxt4Host . getLoadedDate ( ) . getTime ( ) < = 1 * 24 * 60 * 60 * 1000 ) {
2008-07-11 14:03:18 +02:00
return robotsTxt4Host ;
}
2011-11-14 16:11:57 +01:00
2008-07-11 14:03:18 +02:00
// generating the proper url to download the robots txt
2010-05-26 02:01:16 +02:00
MultiProtocolURI robotsURL = null ;
2011-11-14 16:11:57 +01:00
try {
2010-05-26 02:01:16 +02:00
robotsURL = new MultiProtocolURI ( " http:// " + urlHostPort + " /robots.txt " ) ;
2008-08-02 14:12:04 +02:00
} catch ( final MalformedURLException e ) {
2010-08-23 14:32:02 +02:00
log . fatal ( " Unable to generate robots.txt URL for host:port ' " + urlHostPort + " '. " , e ) ;
2008-07-11 14:03:18 +02:00
robotsURL = null ;
2008-07-11 11:12:54 +02:00
}
2011-11-14 16:11:57 +01:00
2008-07-11 14:03:18 +02:00
Object [ ] result = null ;
if ( robotsURL ! = null ) {
2010-08-23 14:32:02 +02:00
if ( log . isDebugEnabled ( ) ) log . debug ( " Trying to download the robots.txt file from URL ' " + robotsURL + " '. " ) ;
2008-07-11 14:03:18 +02:00
try {
2011-05-26 12:57:02 +02:00
result = downloadRobotsTxt ( robotsURL , 3 , robotsTxt4Host ) ;
2008-08-02 14:12:04 +02:00
} catch ( final Exception e ) {
2008-07-11 14:03:18 +02:00
result = null ;
}
2008-07-11 11:12:54 +02:00
}
2008-07-11 14:03:18 +02:00
/ *
assert ! loadedRobots . contains ( robotsURL . toNormalform ( false , false ) ) :
" robots-url= " + robotsURL . toString ( ) +
2011-03-07 21:36:40 +01:00
" , robots= " + ( ( result = = null | | result [ DOWNLOAD_ROBOTS_TXT ] = = null ) ? " NULL " : UTF8 . String ( ( byte [ ] ) result [ DOWNLOAD_ROBOTS_TXT ] ) ) +
2008-07-11 14:03:18 +02:00
" , robotsTxt4Host= " + ( ( robotsTxt4Host = = null ) ? " NULL " : robotsTxt4Host . getLoadedDate ( ) . toString ( ) ) ;
loadedRobots . add ( robotsURL . toNormalform ( false , false ) ) ;
* /
2011-11-14 16:11:57 +01:00
2008-07-11 14:03:18 +02:00
if ( result = = null ) {
// no robots.txt available, make an entry to prevent that the robots loading is done twice
if ( robotsTxt4Host = = null ) {
// generate artificial entry
2011-05-02 16:05:51 +02:00
robotsTxt4Host = new RobotsTxtEntry (
2011-11-14 16:11:57 +01:00
robotsURL ,
new ArrayList < String > ( ) ,
new ArrayList < String > ( ) ,
2008-07-11 14:03:18 +02:00
new Date ( ) ,
new Date ( ) ,
null ,
null ,
2011-04-04 11:47:18 +02:00
Integer . valueOf ( 0 ) ,
null ) ;
2008-07-11 14:03:18 +02:00
} else {
robotsTxt4Host . setLoadedDate ( new Date ( ) ) ;
}
2011-11-14 16:11:57 +01:00
2008-07-11 14:03:18 +02:00
// store the data into the robots DB
2011-11-16 02:03:49 +01:00
final int sz = robotsTable . size ( ) ;
2008-07-11 14:03:18 +02:00
addEntry ( robotsTxt4Host ) ;
2011-11-16 02:03:49 +01:00
if ( robotsTable . size ( ) < = sz ) {
2010-08-23 14:32:02 +02:00
log . fatal ( " new entry in robots.txt table failed, resetting database " ) ;
2011-11-14 16:11:57 +01:00
clear ( ) ;
2009-06-15 00:09:08 +02:00
addEntry ( robotsTxt4Host ) ;
}
2008-07-11 14:03:18 +02:00
} else {
2011-11-16 02:03:49 +01:00
final byte [ ] robotsTxt = ( byte [ ] ) result [ DOWNLOAD_ROBOTS_TXT ] ;
2011-11-22 00:10:29 +01:00
//Log.logInfo("RobotsTxt", "robots of " + robotsURL.toNormalform(true, true) + ":\n" + ((robotsTxt == null) ? "null" : UTF8.String(robotsTxt))); // debug TODO remove
2011-11-16 14:06:46 +01:00
RobotsTxtParser parserResult ;
ArrayList < String > denyPath ;
2008-07-11 14:03:18 +02:00
if ( ( ( Boolean ) result [ DOWNLOAD_ACCESS_RESTRICTED ] ) . booleanValue ( ) ) {
2011-11-16 14:06:46 +01:00
parserResult = new RobotsTxtParser ( thisAgents ) ;
// create virtual deny path
2008-07-11 14:03:18 +02:00
denyPath = new ArrayList < String > ( ) ;
denyPath . add ( " / " ) ;
2011-11-16 14:06:46 +01:00
} else {
parserResult = new RobotsTxtParser ( thisAgents , robotsTxt ) ;
denyPath = parserResult . denyList ( ) ;
2008-07-11 14:03:18 +02:00
}
2011-11-14 16:11:57 +01:00
2008-07-11 14:03:18 +02:00
// store the data into the robots DB
robotsTxt4Host = addEntry (
2010-03-04 12:58:07 +01:00
robotsURL ,
2008-07-24 13:54:37 +02:00
parserResult . allowList ( ) ,
2008-07-11 14:03:18 +02:00
denyPath ,
new Date ( ) ,
( Date ) result [ DOWNLOAD_MODDATE ] ,
( String ) result [ DOWNLOAD_ETAG ] ,
2008-07-24 13:54:37 +02:00
parserResult . sitemap ( ) ,
2011-04-04 11:47:18 +02:00
parserResult . crawlDelayMillis ( ) ,
parserResult . agentName ( ) ) ;
2008-07-11 14:03:18 +02:00
}
2008-07-11 11:12:54 +02:00
}
}
return robotsTxt4Host ;
2008-07-05 02:35:20 +02:00
}
2011-11-14 16:11:57 +01:00
2011-05-02 16:05:51 +02:00
private RobotsTxtEntry addEntry (
2011-11-14 16:11:57 +01:00
final MultiProtocolURI theURL ,
final ArrayList < String > allowPathList ,
final ArrayList < String > denyPathList ,
final Date loadedDate ,
final Date modDate ,
final String eTag ,
2008-08-02 14:12:04 +02:00
final String sitemap ,
2011-04-04 11:47:18 +02:00
final long crawlDelayMillis ,
final String agentName
2007-05-18 15:00:42 +02:00
) {
2011-05-02 16:05:51 +02:00
final RobotsTxtEntry entry = new RobotsTxtEntry (
2010-03-04 12:58:07 +01:00
theURL , allowPathList , denyPathList ,
loadedDate , modDate ,
2011-04-04 11:47:18 +02:00
eTag , sitemap , crawlDelayMillis , agentName ) ;
2005-09-07 13:17:21 +02:00
addEntry ( entry ) ;
return entry ;
}
2011-11-14 16:11:57 +01:00
2011-05-02 16:05:51 +02:00
private String addEntry ( final RobotsTxtEntry entry ) {
2005-09-07 13:17:21 +02:00
// writes a new page and returns key
try {
2011-11-16 02:03:49 +01:00
final BEncodedHeap robotsTable = this . tables . getHeap ( WorkTables . TABLE_ROBOTS_NAME ) ;
robotsTable . insert ( robotsTable . encodedKey ( entry . getHostName ( ) ) , entry . getMem ( ) ) ;
2011-04-04 01:39:45 +02:00
return entry . getHostName ( ) ;
2009-12-10 00:27:26 +01:00
} catch ( final Exception e ) {
2010-08-23 14:32:02 +02:00
log . warn ( " cannot write robots.txt entry " , e ) ;
2005-09-07 13:17:21 +02:00
return null ;
}
2011-11-14 16:11:57 +01:00
}
2008-07-05 02:35:20 +02:00
// methods that had been in robotsParser.java:
2011-11-14 16:11:57 +01:00
2011-05-02 16:05:51 +02:00
private static final int DOWNLOAD_ACCESS_RESTRICTED = 0 ;
private static final int DOWNLOAD_ROBOTS_TXT = 1 ;
private static final int DOWNLOAD_ETAG = 2 ;
private static final int DOWNLOAD_MODDATE = 3 ;
2011-11-14 16:11:57 +01:00
2010-05-26 02:01:16 +02:00
static final String getHostPort ( final MultiProtocolURI theURL ) {
2008-07-05 02:35:20 +02:00
String urlHostPort = null ;
2008-08-02 14:12:04 +02:00
final int port = getPort ( theURL ) ;
2008-07-05 02:35:20 +02:00
urlHostPort = theURL . getHost ( ) + " : " + port ;
2011-11-14 16:11:57 +01:00
urlHostPort = urlHostPort . toLowerCase ( ) . intern ( ) ;
2008-07-05 02:35:20 +02:00
return urlHostPort ;
}
2011-11-14 16:11:57 +01:00
2010-05-26 02:01:16 +02:00
private static final int getPort ( final MultiProtocolURI theURL ) {
2008-07-05 02:35:20 +02:00
int port = theURL . getPort ( ) ;
if ( port = = - 1 ) {
if ( theURL . getProtocol ( ) . equalsIgnoreCase ( " http " ) ) {
port = 80 ;
} else if ( theURL . getProtocol ( ) . equalsIgnoreCase ( " https " ) ) {
port = 443 ;
}
2011-11-14 16:11:57 +01:00
2008-07-05 02:35:20 +02:00
}
return port ;
}
2011-04-04 01:39:45 +02:00
2011-05-02 16:05:51 +02:00
private static Object [ ] downloadRobotsTxt ( final MultiProtocolURI robotsURL , int redirectionCount , final RobotsTxtEntry entry ) throws Exception {
2010-03-20 11:15:11 +01:00
if ( robotsURL = = null | | ! robotsURL . getProtocol ( ) . startsWith ( " http " ) ) return null ;
2011-11-14 16:11:57 +01:00
2008-07-05 02:35:20 +02:00
if ( redirectionCount < 0 ) return new Object [ ] { Boolean . FALSE , null , null } ;
redirectionCount - - ;
2011-11-14 16:11:57 +01:00
2008-07-05 02:35:20 +02:00
boolean accessCompletelyRestricted = false ;
byte [ ] robotsTxt = null ;
long downloadStart , downloadEnd ;
String eTag = null , oldEtag = null ;
Date lastMod = null ;
downloadStart = System . currentTimeMillis ( ) ;
2011-11-14 16:11:57 +01:00
2008-07-05 02:35:20 +02:00
// if we previously have downloaded this robots.txt then we can set the if-modified-since header
2009-07-19 22:37:44 +02:00
RequestHeader reqHeaders = new RequestHeader ( ) ;
2011-11-14 16:11:57 +01:00
2008-07-05 02:35:20 +02:00
// add yacybot user agent
2011-04-26 15:35:29 +02:00
reqHeaders . put ( HeaderFramework . USER_AGENT , ClientIdentification . getUserAgent ( ) ) ;
2011-11-14 16:11:57 +01:00
2008-07-05 02:35:20 +02:00
// adding referer
2010-05-25 14:54:57 +02:00
reqHeaders . put ( RequestHeader . REFERER , ( MultiProtocolURI . newURL ( robotsURL , " / " ) ) . toNormalform ( true , true ) ) ;
2011-11-16 16:33:55 +01:00
reqHeaders . put ( RequestHeader . ACCEPT , HTTPLoader . DEFAULT_ACCEPT ) ;
2008-07-05 02:35:20 +02:00
if ( entry ! = null ) {
oldEtag = entry . getETag ( ) ;
2009-07-19 22:37:44 +02:00
reqHeaders = new RequestHeader ( ) ;
2008-08-02 14:12:04 +02:00
final Date modDate = entry . getModDate ( ) ;
2010-08-23 14:32:02 +02:00
if ( modDate ! = null ) reqHeaders . put ( RequestHeader . IF_MODIFIED_SINCE , HeaderFramework . formatRFC1123 ( entry . getModDate ( ) ) ) ;
2011-11-14 16:11:57 +01:00
2008-07-05 02:35:20 +02:00
}
2011-11-14 16:11:57 +01:00
2008-07-05 02:35:20 +02:00
// setup http-client
//TODO: adding Traffic statistic for robots download?
2010-08-23 00:32:39 +02:00
final HTTPClient client = new HTTPClient ( ) ;
2010-07-27 03:16:26 +02:00
client . setHeader ( reqHeaders . entrySet ( ) ) ;
2008-07-05 02:35:20 +02:00
try {
// check for interruption
if ( Thread . currentThread ( ) . isInterrupted ( ) ) throw new InterruptedException ( " Shutdown in progress. " ) ;
2011-11-14 16:11:57 +01:00
2010-07-27 03:16:26 +02:00
// sending the get request
2011-04-21 15:58:49 +02:00
robotsTxt = client . GETbytes ( robotsURL ) ;
2010-09-11 17:58:15 +02:00
// statistics:
if ( robotsTxt ! = null ) {
ByteCount . addAccountCount ( ByteCount . CRAWLER , robotsTxt . length ) ;
}
2010-07-27 03:16:26 +02:00
final int code = client . getHttpResponse ( ) . getStatusLine ( ) . getStatusCode ( ) ;
2010-08-10 23:22:30 +02:00
final ResponseHeader header = new ResponseHeader ( client . getHttpResponse ( ) . getAllHeaders ( ) ) ;
2011-11-14 16:11:57 +01:00
2008-07-05 02:35:20 +02:00
// check the response status
2010-07-27 03:16:26 +02:00
if ( code > 199 & & code < 300 ) {
if ( ! header . mime ( ) . startsWith ( " text/plain " ) ) {
2008-07-05 02:35:20 +02:00
robotsTxt = null ;
2011-03-28 21:55:15 +02:00
log . info ( " Robots.txt from URL ' " + robotsURL + " ' has wrong mimetype ' " + header . mime ( ) + " '. " ) ;
2008-07-05 02:35:20 +02:00
} else {
// getting some metadata
2010-07-27 03:16:26 +02:00
eTag = header . containsKey ( HeaderFramework . ETAG ) ? ( header . get ( HeaderFramework . ETAG ) ) . trim ( ) : null ;
lastMod = header . lastModified ( ) ;
2011-11-14 16:11:57 +01:00
2008-07-05 02:35:20 +02:00
// if the robots.txt file was not changed we break here
if ( ( eTag ! = null ) & & ( oldEtag ! = null ) & & ( eTag . equals ( oldEtag ) ) ) {
2010-08-23 14:32:02 +02:00
if ( log . isDebugEnabled ( ) ) log . debug ( " Robots.txt from URL ' " + robotsURL + " ' was not modified. Abort downloading of new version. " ) ;
2008-07-05 02:35:20 +02:00
return null ;
}
2011-11-14 16:11:57 +01:00
downloadEnd = System . currentTimeMillis ( ) ;
2010-08-23 14:32:02 +02:00
if ( log . isDebugEnabled ( ) ) log . debug ( " Robots.txt successfully loaded from URL ' " + robotsURL + " ' in " + ( downloadEnd - downloadStart ) + " ms. " ) ;
2008-07-05 02:35:20 +02:00
}
2010-07-27 03:16:26 +02:00
} else if ( code = = 304 ) {
2008-07-05 02:35:20 +02:00
return null ;
2010-07-27 03:16:26 +02:00
} else if ( code > 299 & & code < 400 ) {
2008-07-05 02:35:20 +02:00
// getting redirection URL
2010-07-27 03:16:26 +02:00
String redirectionUrlString = header . get ( HeaderFramework . LOCATION ) ;
2008-07-05 02:35:20 +02:00
if ( redirectionUrlString = = null ) {
2010-08-23 14:32:02 +02:00
if ( log . isDebugEnabled ( ) )
log . debug ( " robots.txt could not be downloaded from URL ' " + robotsURL + " ' because of missing redirecton header. [ " + client . getHttpResponse ( ) . getStatusLine ( ) + " ]. " ) ;
2011-11-14 16:11:57 +01:00
robotsTxt = null ;
2008-07-05 02:35:20 +02:00
} else {
2011-11-14 16:11:57 +01:00
2008-07-05 02:35:20 +02:00
redirectionUrlString = redirectionUrlString . trim ( ) ;
2011-11-14 16:11:57 +01:00
2008-07-05 02:35:20 +02:00
// generating the new URL object
2011-11-14 16:11:57 +01:00
final MultiProtocolURI redirectionUrl = MultiProtocolURI . newURL ( robotsURL , redirectionUrlString ) ;
2008-07-05 02:35:20 +02:00
// following the redirection
2011-11-14 16:11:57 +01:00
if ( log . isDebugEnabled ( ) ) log . debug ( " Redirection detected for robots.txt with URL ' " + robotsURL + " '. " +
2008-07-05 02:35:20 +02:00
" \ nRedirecting request to: " + redirectionUrl ) ;
return downloadRobotsTxt ( redirectionUrl , redirectionCount , entry ) ;
}
2010-07-27 03:16:26 +02:00
} else if ( code = = 401 | | code = = 403 ) {
2008-07-05 02:35:20 +02:00
accessCompletelyRestricted = true ;
2011-11-16 15:09:50 +01:00
log . info ( " Access to Robots.txt not allowed on URL ' " + robotsURL + " ', redirectionCount = " + redirectionCount ) ; // since this is a strange case we log it all the time
2008-07-05 02:35:20 +02:00
} else {
2010-08-23 14:32:02 +02:00
if ( log . isDebugEnabled ( ) )
log . debug ( " robots.txt could not be downloaded from URL ' " + robotsURL + " '. [ " + client . getHttpResponse ( ) . getStatusLine ( ) + " ]. " ) ;
2008-07-05 02:35:20 +02:00
robotsTxt = null ;
2011-11-14 16:11:57 +01:00
}
2008-08-02 14:12:04 +02:00
} catch ( final Exception e ) {
2008-07-05 02:35:20 +02:00
throw e ;
}
2008-08-06 21:43:12 +02:00
return new Object [ ] { Boolean . valueOf ( accessCompletelyRestricted ) , robotsTxt , eTag , lastMod } ;
2008-07-05 02:35:20 +02:00
}
2011-11-16 14:56:31 +01:00
public final static void main ( final String [ ] args ) throws Exception {
final String url = " http://www.badelatschen.net/robots.txt " ;
final Object [ ] o = downloadRobotsTxt ( new MultiProtocolURI ( url ) , 0 , null ) ;
if ( o = = null ) {
System . out . println ( " result: null " ) ;
} else {
System . out . println ( " not allowed = " + ( ( Boolean ) o [ 0 ] ) . toString ( ) ) ;
System . out . println ( " robots = " + ( ( o [ 1 ] = = null ) ? " null " : UTF8 . String ( ( byte [ ] ) o [ 1 ] ) ) ) ;
}
System . exit ( 0 ) ;
/ *
final HttpClient httpclient = new DefaultHttpClient ( ) ;
try {
final HttpGet httpget = new HttpGet ( url ) ;
final ResponseHandler < String > responseHandler = new BasicResponseHandler ( ) ;
final String responseBody = httpclient . execute ( httpget , responseHandler ) ;
System . out . println ( responseBody ) ;
} finally {
httpclient . getConnectionManager ( ) . shutdown ( ) ;
}
* /
}
2005-09-07 13:17:21 +02:00
}