// LoaderDispatcher.java
// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 24.10.2007 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.repository;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.Writer;
import java.util.Arrays;
import java.util.Date;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

import net.yacy.document.Document;
import net.yacy.document.TextParser;
import net.yacy.document.ParserException;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.TransformerWriter;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.Domains;
import net.yacy.kelondro.util.FileUtils;

import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.retrieval.FTPLoader;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.crawler.retrieval.Request;
import de.anomic.crawler.retrieval.Response;
import de.anomic.crawler.retrieval.SMBLoader;
import de.anomic.http.client.Cache;
import de.anomic.http.client.Client;
import de.anomic.http.server.HeaderFramework;
import de.anomic.http.server.RequestHeader;
import de.anomic.http.server.ResponseHeader;
import de.anomic.search.Segments;
import de.anomic.search.Switchboard;
public final class LoaderDispatcher {

    private static final long minDelay = 250; // milliseconds; 4 accesses per second
    private static final ConcurrentHashMap<String, Long> accessTime = new ConcurrentHashMap<String, Long>(); // to protect targets from DDoS

    private final Switchboard sb;
    private final HashSet<String> supportedProtocols;
    private final HTTPLoader httpLoader;
    private final FTPLoader ftpLoader;
    private final SMBLoader smbLoader;
    private final Log log;

    public LoaderDispatcher(final Switchboard sb) {
        this.sb = sb;
        this.supportedProtocols = new HashSet<String>(Arrays.asList(new String[]{"http", "https", "ftp", "smb"}));

        // initiate loader objects
        this.log = new Log("LOADER");
        httpLoader = new HTTPLoader(sb, log);
        ftpLoader = new FTPLoader(sb, log);
        smbLoader = new SMBLoader(sb, log);
    }
    public boolean isSupportedProtocol(final String protocol) {
        if ((protocol == null) || (protocol.length() == 0)) return false;
        return this.supportedProtocols.contains(protocol.trim().toLowerCase());
    }

    @SuppressWarnings("unchecked")
    public HashSet<String> getSupportedProtocols() {
        return (HashSet<String>) this.supportedProtocols.clone();
    }
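    /**
     * load a resource using the default snippet crawl profiles;
     * builds a Request via request(url, forText, global) and delegates to
     * load(request, forText)
     * @param url the target url
     * @param forText shows that this was a for-text crawling request
     * @param global shows that this was a global crawling request
     * @return the loaded entity in a Response object
     * @throws IOException
     */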
    public Response load(
            final DigestURI url,
            final boolean forText,
            final boolean global) throws IOException {
        return load(request(url, forText, global), forText);
    }
    /**
     * load a resource from the web, from ftp, from smb or a file
     * @param url the target url
     * @param forText shows that this was a for-text crawling request
     * @param global shows that this was a global crawling request
     * @param cacheStrategy strategy according to CACHE_STRATEGY_NOCACHE, CACHE_STRATEGY_IFFRESH, CACHE_STRATEGY_IFEXIST, CACHE_STRATEGY_CACHEONLY
     * @return the loaded entity in a Response object
     * @throws IOException
     */
    public Response load(
            final DigestURI url,
            final boolean forText,
            final boolean global,
            int cacheStrategy) throws IOException {
        return load(request(url, forText, global), forText, cacheStrategy);
    }
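    /**
     * load a resource and write its content to a local file;
     * the content is first written to a temporary ".tmp" file which is then
     * renamed to the target file, so readers never see a partially written file
     * @param url the target url
     * @param cacheStrategy one of the CrawlProfile CACHE_STRATEGY_* constants
     * @param targetFile the file the content is written to
     * @throws IOException if the resource could not be loaded
     */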
    public void load(final DigestURI url, int cacheStrategy, File targetFile) throws IOException {
        byte[] b = load(request(url, false, true), false, cacheStrategy).getContent();
        if (b == null) throw new IOException("load == null");
        File tmp = new File(targetFile.getAbsolutePath() + ".tmp");

        // transaction-safe writing
        File parent = targetFile.getParentFile();
        if (!parent.exists()) parent.mkdirs();
        FileUtils.copy(b, tmp);
        tmp.renameTo(targetFile);
    }
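    /* Usage sketch (hypothetical caller; the DigestURI construction and IOException
     * handling are omitted, and the strategy choice is only an example):
     *
     *   // fetch a page for text indexing in a local crawl, accepting a cached copy if one exists
     *   Response response = Switchboard.getSwitchboard().loader.load(url, true, false, CrawlProfile.CACHE_STRATEGY_IFEXIST);
     *   byte[] content = (response == null) ? null : response.getContent();
     */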
    /**
     * generate a request object
     * @param url the target url
     * @param forText shows that this was a for-text crawling request
     * @param global shows that this was a global crawling request
     * @return the request object
     */
    public Request request(
            final DigestURI url,
            final boolean forText,
            final boolean global) {
        return new Request(
                sb.peers.mySeed().hash,
                url,
                "",
                "",
                new Date(),
                new Date(),
                (forText) ?
                    ((global) ?
                        sb.crawler.defaultTextSnippetGlobalProfile.handle() :
                        sb.crawler.defaultTextSnippetLocalProfile.handle())
                    :
                    ((global) ?
                        sb.crawler.defaultMediaSnippetGlobalProfile.handle() :
                        sb.crawler.defaultMediaSnippetLocalProfile.handle()), // crawl profile
                0,
                0,
                0);
    }
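    /**
     * load a resource for the given request; the cache strategy is taken from the
     * crawl profile referenced by the request, falling back to
     * CACHE_STRATEGY_IFEXIST if no profile can be found
     * @param request the loader request
     * @param acceptOnlyParseable whether only parseable content shall be accepted
     * @return the loaded entity in a Response object
     * @throws IOException
     */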
    public Response load(final Request request, final boolean acceptOnlyParseable) throws IOException {
        CrawlProfile.entry crawlProfile = sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle());
        int cacheStrategy = CrawlProfile.CACHE_STRATEGY_IFEXIST;
        if (crawlProfile != null) cacheStrategy = crawlProfile.cacheStrategy();
        return load(request, acceptOnlyParseable, cacheStrategy);
    }
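    /**
     * core loading method: checks the localhost protection, tries to serve the
     * resource from the cache according to the given cache strategy, enforces a
     * minimum delay per target host and finally dispatches to the http, ftp or
     * smb loader; a successfully loaded response may be written back to the cache
     * @param request the loader request
     * @param acceptOnlyParseable whether only parseable content shall be accepted
     * @param cacheStrategy one of the CrawlProfile CACHE_STRATEGY_* constants
     * @return the loaded entity in a Response object, or null if the cache-only
     *         strategy was requested and the cache does not contain the resource
     * @throws IOException if the protocol is not supported or loading fails
     */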
    public Response load(final Request request, final boolean acceptOnlyParseable, int cacheStrategy) throws IOException {
        // get the protocol of the next URL
        final String protocol = request.url().getProtocol();
        final String host = request.url().getHost();

        // check if this loads a page from localhost, which must be prevented to protect the server
        // against attacks to the administration interface when localhost access is granted
        if (Domains.isLocal(host) && sb.getConfigBool("adminAccountForLocalhost", false)) throw new IOException("access to localhost not granted for url " + request.url());
        // check if we have the page in the cache
        CrawlProfile.entry crawlProfile = sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle());
        if (crawlProfile != null && cacheStrategy != CrawlProfile.CACHE_STRATEGY_NOCACHE) {
            // we have passed a first test if caching is allowed
            // now see if there is a cache entry
            ResponseHeader cachedResponse = (request.url().isLocal()) ? null : Cache.getResponseHeader(request.url());
            byte[] content = null;
            try {
                content = (cachedResponse == null) ? null : Cache.getContent(request.url());
            } catch (IOException e) {
                Log.logException(e);
                content = null;
            }
            if (cachedResponse != null && content != null) {
                // yes we have the content
                // create request header values and a response object because we need that
                // in case that we want to return the cached content in the next step
                final RequestHeader requestHeader = new RequestHeader();
                requestHeader.put(HeaderFramework.USER_AGENT, HTTPLoader.crawlerUserAgent);
                DigestURI refererURL = null;
                if (request.referrerhash() != null) refererURL = sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash());
                if (refererURL != null) requestHeader.put(RequestHeader.REFERER, refererURL.toNormalform(true, true));
                Response response = new Response(
                        request,
                        requestHeader,
                        cachedResponse,
                        "200",
                        crawlProfile,
                        content);

                // check which caching strategy shall be used
                if (cacheStrategy == CrawlProfile.CACHE_STRATEGY_IFEXIST || cacheStrategy == CrawlProfile.CACHE_STRATEGY_CACHEONLY) {
                    // well, just take the cache and don't care about freshness of the content
                    log.logInfo("cache hit/useall for: " + request.url().toNormalform(true, false));
                    return response;
                }

                // now the cacheStrategy must be CACHE_STRATEGY_IFFRESH, that means we should do a proxy freshness test
                assert cacheStrategy == CrawlProfile.CACHE_STRATEGY_IFFRESH : "cacheStrategy = " + cacheStrategy;
                if (response.isFreshForProxy()) {
                    log.logInfo("cache hit/fresh for: " + request.url().toNormalform(true, false));
                    return response;
                } else {
                    log.logInfo("cache hit/stale for: " + request.url().toNormalform(true, false));
                }
            }
        }
        // check case where we want results from the cache exclusively, and never from the internet (offline mode)
        if (cacheStrategy == CrawlProfile.CACHE_STRATEGY_CACHEONLY) {
            // we had a chance to get the content from the cache .. it's over. We don't have it.
            return null;
        }

        // now forget about the cache, nothing there. Try to load the content from the internet

        // check access time: this is a double-check (we checked possibly already in the balancer)
        // to make sure that we don't DoS the target by mistake
        if (!request.url().isLocal()) {
            final Long lastAccess = accessTime.get(host);
            long wait = 0;
            if (lastAccess != null) wait = Math.max(0, minDelay + lastAccess.longValue() - System.currentTimeMillis());
            if (wait > 0) {
                // force a sleep here. Instead of just sleeping we also clean up the accessTime map
                final long untilTime = System.currentTimeMillis() + wait;
                cleanupAccessTimeTable(untilTime);
                if (System.currentTimeMillis() < untilTime)
                    try { Thread.sleep(untilTime - System.currentTimeMillis()); } catch (final InterruptedException ee) { }
            }
        }

        // now it's for sure that we will access the target. Remember the access time
        accessTime.put(host, System.currentTimeMillis());
        // load resource from the internet
        Response response = null;
        if ((protocol.equals("http") || (protocol.equals("https")))) response = httpLoader.load(request, acceptOnlyParseable);
        if (protocol.equals("ftp")) response = ftpLoader.load(request, true);
        if (protocol.equals("smb")) response = smbLoader.load(request, true);
        if (response != null) {
            // we got something. Now check if we want to store that to the cache
            String storeError = response.shallStoreCacheForCrawler();
            if (storeError == null) {
                try {
                    Cache.store(request.url(), response.getResponseHeader(), response.getContent());
                } catch (IOException e) {
                    log.logWarning("cannot write " + response.url() + " to Cache (3): " + e.getMessage(), e);
                }
            } else {
                log.logWarning("cannot write " + response.url() + " to Cache (4): " + storeError);
            }
            return response;
        }

        throw new IOException("Unsupported protocol '" + protocol + "' in url " + request.url());
    }
    /**
     * load the url as resource from the web or the cache
     * @param url the url of the resource
     * @param fetchOnline if true the resource may be loaded from the web when it is not in the cache
     * @param socketTimeout
     * @param forText shows that this was a for-text crawling request
     * @param reindexing passed as the global flag to the loader
     * @return the content as a byte array, or null if the resource is not available
     * @throws IOException
     */
    public byte[] getResource(final DigestURI url, final boolean fetchOnline, final int socketTimeout, final boolean forText, final boolean reindexing) throws IOException {
        byte[] resource = Cache.getContent(url);
        if (resource != null) return resource;

        if (!fetchOnline) return null;

        // try to download the resource using the loader
        final Response entry = load(url, forText, reindexing);
        if (entry == null) return null; // not found in web

        // read resource body (if it is there)
        return entry.getContent();
    }
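    /* Example (hypothetical caller; the timeout value is only illustrative):
     *
     *   // get the raw bytes of a resource, preferring the cache and falling back to the web
     *   byte[] bytes = Switchboard.getSwitchboard().loader.getResource(url, true, 10000, false, false);
     */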
    /**
     * Tries to load and parse a resource specified by its URL.
     * If the resource is not stored in the cache and fetchOnline is set, then
     * this function tries to download the resource from the web.
     *
     * @param url the URL of the resource
     * @param fetchOnline specifies if the resource should be loaded from the web if it is not available in the cache
     * @param timeout
     * @param forText
     * @param global the domain of the search. If global == true then the content is re-indexed
     * @return the parsed document as {@link Document}
     */
    public static Document retrieveDocument(final DigestURI url, final boolean fetchOnline, final int timeout, final boolean forText, final boolean global) {
        // load resource
        byte[] resContent = null;
        ResponseHeader responseHeader = null;
        try {
            // trying to load the resource from the cache
            resContent = Cache.getContent(url);
            responseHeader = Cache.getResponseHeader(url);
            if (resContent != null) {
                // if the content was found
            } else if (fetchOnline) {
                // if not found try to download it
                // download resource using the crawler and keep resource in memory if possible
                final Response entry = Switchboard.getSwitchboard().loader.load(url, forText, global);
                // getting resource metadata (e.g. the http headers for http resources)
                if (entry != null) {
                    // read resource body (if it is there)
                    final byte[] resourceArray = entry.getContent();
                    if (resourceArray != null) {
                        resContent = resourceArray;
                    } else {
                        resContent = Cache.getContent(url);
                    }
                    // read a fresh header
                    responseHeader = entry.getResponseHeader();
                }
                // if it is still not available, report an error
                if (resContent == null) {
                    Log.logFine("snippet fetch", "plasmaHTCache.Entry cache is NULL for url " + url);
                    return null;
                }
            } else {
                Log.logFine("snippet fetch", "no resource available for url " + url);
                return null;
            }
        } catch (final Exception e) {
            Log.logFine("snippet fetch", "error loading resource: " + e.getMessage() + " for url " + url);
            return null;
        }

        // parse resource
        Document document = null;
        try {
            document = parseDocument(url, resContent.length, new ByteArrayInputStream(resContent), responseHeader);
        } catch (final ParserException e) {
            Log.logFine("snippet fetch", "parser error " + e.getMessage() + " for url " + url);
            return null;
        } finally {
            resContent = null;
        }
        return document;
    }
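    /* Example (hypothetical caller; the timeout value is only illustrative):
     *
     *   // load and parse a page, e.g. for snippet generation
     *   Document document = LoaderDispatcher.retrieveDocument(url, true, 5000, true, false);
     */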
    /**
     * Parse the resource
     * @param url the URL of the resource
     * @param contentLength the contentLength of the resource
     * @param resourceStream the resource body as stream
     * @param responseHeader metadata about the resource
     * @return the extracted data
     * @throws ParserException
     */
    public static Document parseDocument(final DigestURI url, final long contentLength, final InputStream resourceStream, ResponseHeader responseHeader) throws ParserException {
        try {
            if (resourceStream == null) return null;

            // STEP 1: if no resource metadata is available, try to load it from cache
            if (responseHeader == null) {
                // try to get the header from the htcache directory
                try {
                    responseHeader = Cache.getResponseHeader(url);
                } catch (final Exception e) {
                    // ignore this. resource info loading failed
                }
            }

            // STEP 2: if the metadata is still null try to download it from web
            if ((responseHeader == null) && (url.getProtocol().startsWith("http"))) {
                // TODO: we need a better solution here
                // e.g. encapsulate this in the crawlLoader class

                // getting URL mimeType
                try {
                    responseHeader = Client.whead(url.toString());
                } catch (final Exception e) {
                    // ignore this. http header download failed
                }
            }

            // STEP 3: if the metadata is still null try to guess the mimeType of the resource
            String supportError = TextParser.supports(url, responseHeader == null ? null : responseHeader.mime());
            if (supportError != null) {
                return null;
            }
            if (responseHeader == null) {
                return TextParser.parseSource(url, null, null, contentLength, resourceStream);
            }
            return TextParser.parseSource(url, responseHeader.mime(), responseHeader.getCharacterEncoding(), contentLength, resourceStream);
        } catch (final InterruptedException e) {
            // interruption of thread detected
            return null;
        }
    }
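    /**
     * load a page with the given cache policy and scrape its content
     * @param loader the loader dispatcher to use
     * @param location the URL of the page
     * @param cachePolicy one of the CrawlProfile CACHE_STRATEGY_* constants
     * @return a ContentScraper filled with the scraped content of the page
     * @throws IOException if no response could be loaded for the location
     */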
    public static ContentScraper parseResource(final LoaderDispatcher loader, final DigestURI location, int cachePolicy) throws IOException {
        // load page
        Response r = loader.load(location, true, false, cachePolicy);
        byte[] page = (r == null) ? null : r.getContent();
        if (page == null) throw new IOException("no response from url " + location.toString());

        // scrape content
        final ContentScraper scraper = new ContentScraper(location);
        final Writer writer = new TransformerWriter(null, null, scraper, null, false);
        writer.write(new String(page, "UTF-8"));
        return scraper;
    }
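    /**
     * remove outdated entries from the per-host access time table;
     * entries older than minDelay are dropped, and the cleanup stops as soon as
     * the given point in time has been reached
     * @param timeout the point in time (epoch milliseconds) at which the cleanup must stop
     */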
    public synchronized void cleanupAccessTimeTable(long timeout) {
        final Iterator<Map.Entry<String, Long>> i = accessTime.entrySet().iterator();
        Map.Entry<String, Long> e;
        while (i.hasNext()) {
            e = i.next();
            if (System.currentTimeMillis() > timeout) break;
            if (System.currentTimeMillis() - e.getValue().longValue() > minDelay) i.remove();
        }
    }
}