2007-10-29 02:43:20 +01:00
// ProtocolLoader.java
// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 24.10.2007 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
2008-05-06 02:32:41 +02:00
package de.anomic.crawler ;
2007-10-29 02:43:20 +01:00
2008-11-11 22:33:40 +01:00
import java.io.IOException ;
2007-10-29 02:43:20 +01:00
import java.util.Arrays ;
import java.util.HashSet ;
2008-05-16 21:50:28 +02:00
import java.util.Iterator ;
import java.util.Map ;
import java.util.concurrent.ConcurrentHashMap ;
2007-10-29 02:43:20 +01:00
2008-08-25 20:11:47 +02:00
import de.anomic.index.indexDocumentMetadata ;
2007-10-29 02:43:20 +01:00
import de.anomic.plasma.plasmaSwitchboard ;
2008-05-20 01:05:19 +02:00
import de.anomic.server.serverCore ;
2008-12-17 23:53:06 +01:00
import de.anomic.server.serverProcessorJob ;
2007-10-29 02:43:20 +01:00
import de.anomic.server.logging.serverLog ;
2008-05-06 02:32:41 +02:00
public final class ProtocolLoader {
2007-10-29 02:43:20 +01:00
2008-05-16 21:50:28 +02:00
private static final long minDelay = 250 ; // milliseconds; 4 accesses per second
private static final ConcurrentHashMap < String , Long > accessTime = new ConcurrentHashMap < String , Long > ( ) ; // to protect targets from DDoS
2008-08-02 14:12:04 +02:00
private final plasmaSwitchboard sb ;
private final serverLog log ;
private final HashSet < String > supportedProtocols ;
private final HTTPLoader httpLoader ;
private final FTPLoader ftpLoader ;
2007-10-29 02:43:20 +01:00
2008-08-02 14:12:04 +02:00
public ProtocolLoader ( final plasmaSwitchboard sb , final serverLog log ) {
2007-10-29 02:43:20 +01:00
this . sb = sb ;
this . log = log ;
2008-01-28 21:08:32 +01:00
this . supportedProtocols = new HashSet < String > ( Arrays . asList ( new String [ ] { " http " , " https " , " ftp " } ) ) ;
2007-10-29 02:43:20 +01:00
// initiate loader objects
2008-05-06 02:32:41 +02:00
httpLoader = new HTTPLoader ( sb , log ) ;
ftpLoader = new FTPLoader ( sb , log ) ;
2007-10-29 02:43:20 +01:00
}
2008-08-02 14:12:04 +02:00
public boolean isSupportedProtocol ( final String protocol ) {
2007-10-29 02:43:20 +01:00
if ( ( protocol = = null ) | | ( protocol . length ( ) = = 0 ) ) return false ;
return this . supportedProtocols . contains ( protocol . trim ( ) . toLowerCase ( ) ) ;
}
2008-01-28 21:08:32 +01:00
@SuppressWarnings ( " unchecked " )
public HashSet < String > getSupportedProtocols ( ) {
return ( HashSet < String > ) this . supportedProtocols . clone ( ) ;
2007-10-29 02:43:20 +01:00
}
2008-11-11 22:33:40 +01:00
public indexDocumentMetadata load ( final CrawlEntry entry , final String parserMode ) throws IOException {
// getting the protocol of the next URL
2008-08-02 14:12:04 +02:00
final String protocol = entry . url ( ) . getProtocol ( ) ;
final String host = entry . url ( ) . getHost ( ) ;
2008-05-16 21:50:28 +02:00
2008-05-20 01:05:19 +02:00
// check if this loads a page from localhost, which must be prevented to protect the server
// against attacks to the administration interface when localhost access is granted
2008-11-11 22:33:40 +01:00
if ( serverCore . isLocalhost ( host ) & & sb . getConfigBool ( " adminAccountForLocalhost " , false ) ) throw new IOException ( " access to localhost not granted for url " + entry . url ( ) ) ;
2008-05-20 01:05:19 +02:00
2008-05-16 21:50:28 +02:00
// check access time
if ( ! entry . url ( ) . isLocal ( ) ) {
2008-08-02 14:12:04 +02:00
final Long lastAccess = accessTime . get ( host ) ;
2008-05-16 21:50:28 +02:00
long wait = 0 ;
if ( lastAccess ! = null ) wait = Math . max ( 0 , minDelay + lastAccess . longValue ( ) - System . currentTimeMillis ( ) ) ;
if ( wait > 0 ) {
// force a sleep here. Instead just sleep we clean up the accessTime map
2008-08-02 14:12:04 +02:00
final long untilTime = System . currentTimeMillis ( ) + wait ;
final Iterator < Map . Entry < String , Long > > i = accessTime . entrySet ( ) . iterator ( ) ;
2008-05-16 21:50:28 +02:00
Map . Entry < String , Long > e ;
while ( i . hasNext ( ) ) {
e = i . next ( ) ;
if ( System . currentTimeMillis ( ) > untilTime ) break ;
if ( System . currentTimeMillis ( ) - e . getValue ( ) . longValue ( ) > minDelay ) i . remove ( ) ;
}
if ( System . currentTimeMillis ( ) < untilTime )
2008-08-02 14:12:04 +02:00
try { Thread . sleep ( untilTime - System . currentTimeMillis ( ) ) ; } catch ( final InterruptedException ee ) { }
2008-05-16 21:50:28 +02:00
}
}
accessTime . put ( host , System . currentTimeMillis ( ) ) ;
2007-10-29 02:43:20 +01:00
2008-05-16 21:50:28 +02:00
// load resource
2007-11-22 02:34:29 +01:00
if ( ( protocol . equals ( " http " ) | | ( protocol . equals ( " https " ) ) ) ) return httpLoader . load ( entry , parserMode ) ;
2007-10-29 02:43:20 +01:00
if ( protocol . equals ( " ftp " ) ) return ftpLoader . load ( entry ) ;
2008-11-11 22:33:40 +01:00
throw new IOException ( " Unsupported protocol ' " + protocol + " ' in url " + entry . url ( ) ) ;
2007-10-29 02:43:20 +01:00
}
2008-08-02 14:12:04 +02:00
public String process ( final CrawlEntry entry , final String parserMode ) {
2007-10-29 02:43:20 +01:00
// load a resource, store it to htcache and push queue entry to switchboard queue
// returns null if everything went fine, a fail reason string if a problem occurred
2008-08-25 20:11:47 +02:00
indexDocumentMetadata h ;
2007-10-29 02:43:20 +01:00
try {
2008-12-17 23:53:06 +01:00
entry . setStatus ( " loading " , serverProcessorJob . STATUS_RUNNING ) ;
2007-11-22 02:34:29 +01:00
h = load ( entry , parserMode ) ;
2008-11-11 22:33:40 +01:00
assert h ! = null ;
2008-12-17 23:53:06 +01:00
entry . setStatus ( " loaded " , serverProcessorJob . STATUS_RUNNING ) ;
2008-08-02 14:12:04 +02:00
final boolean stored = sb . htEntryStoreProcess ( h ) ;
2008-12-17 23:53:06 +01:00
entry . setStatus ( " stored- " + ( ( stored ) ? " ok " : " fail " ) , serverProcessorJob . STATUS_FINISHED ) ;
2007-10-29 02:43:20 +01:00
return ( stored ) ? null : " not stored " ;
2008-11-11 22:33:40 +01:00
} catch ( IOException e ) {
2008-12-17 23:53:06 +01:00
entry . setStatus ( " error " , serverProcessorJob . STATUS_FINISHED ) ;
2008-11-11 22:33:40 +01:00
log . logWarning ( " problem loading " + entry . url ( ) . toString ( ) ) ;
2007-10-29 02:43:20 +01:00
return " load error - " + e . getMessage ( ) ;
}
}
2008-10-19 20:10:42 +02:00
}