// LoaderDispatcher.java
// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 24.10.2007 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.repository;

import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.Arrays;
import java.util.Date;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Semaphore;
import java.util.concurrent.TimeUnit;

import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.data.Cache;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.ZURL.FailCategory;
import net.yacy.crawler.retrieval.FTPLoader;
import net.yacy.crawler.retrieval.FileLoader;
import net.yacy.crawler.retrieval.HTTPLoader;
import net.yacy.crawler.retrieval.Request;
import net.yacy.crawler.retrieval.Response;
import net.yacy.crawler.retrieval.SMBLoader;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.repository.Blacklist.BlacklistType;
import net.yacy.search.Switchboard;
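
/**
 * The LoaderDispatcher routes load requests to the protocol-specific loaders
 * (HTTP/HTTPS, FTP, SMB, file), applies the requested cache strategy against
 * the local cache, enforces per-host minimum access delays so that target
 * servers are not flooded, and serializes concurrent loads of the same URL.
 */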
public final class LoaderDispatcher {

    private final static int accessTimeMaxsize = 1000;
    private static final ConcurrentHashMap<String, Long> accessTime = new ConcurrentHashMap<String, Long>(); // to protect targets from DDoS

    private final Switchboard sb;
    private final HashSet<String> supportedProtocols;
    private final HTTPLoader httpLoader;
    private final FTPLoader ftpLoader;
    private final SMBLoader smbLoader;
    private final FileLoader fileLoader;
    private final ConcurrentHashMap<DigestURI, Semaphore> loaderSteering; // a map that delivers a 'finish' semaphore for urls
    private final ConcurrentLog log;

    public LoaderDispatcher(final Switchboard sb) {
        this.sb = sb;
        this.supportedProtocols = new HashSet<String>(Arrays.asList(new String[]{"http", "https", "ftp", "smb", "file"}));

        // initiate loader objects
        this.log = new ConcurrentLog("LOADER");
        this.httpLoader = new HTTPLoader(sb, this.log);
        this.ftpLoader = new FTPLoader(sb, this.log);
        this.smbLoader = new SMBLoader(sb, this.log);
        this.fileLoader = new FileLoader(sb, this.log);
        this.loaderSteering = new ConcurrentHashMap<DigestURI, Semaphore>();
    }

    public boolean isSupportedProtocol(final String protocol) {
        if (protocol == null || protocol.isEmpty()) return false;
        return this.supportedProtocols.contains(protocol.trim().toLowerCase());
    }

    @SuppressWarnings("unchecked")
    public HashSet<String> getSupportedProtocols() {
        return (HashSet<String>) this.supportedProtocols.clone();
    }

    /**
     * generate a request object
     * @param url the target url
     * @param forText shows that this was a for-text crawling request
     * @param global shows that this was a global crawling request
     * @return the request object
     */
    public Request request(
            final DigestURI url,
            final boolean forText,
            final boolean global) {
        return new Request(
                ASCII.getBytes(this.sb.peers.mySeed().hash),
                url,
                null,
                "",
                new Date(),
                (forText) ?
                        ((global) ?
                                this.sb.crawler.defaultTextSnippetGlobalProfile.handle() :
                                this.sb.crawler.defaultTextSnippetLocalProfile.handle())
                        :
                        ((global) ?
                                this.sb.crawler.defaultMediaSnippetGlobalProfile.handle() :
                                this.sb.crawler.defaultMediaSnippetLocalProfile.handle()), // crawl profile
                0,
                0,
                0,
                0);
    }

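    /**
     * Load a resource and write its content to a target file.
     * The write is transaction-safe: the content is first written to a
     * temporary ".tmp" file which is then renamed to the target file.
     */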
    public void load(final DigestURI url, final CacheStrategy cacheStrategy, final int maxFileSize, final File targetFile, BlacklistType blacklistType, ClientIdentification.Agent agent) throws IOException {

        final byte[] b = load(request(url, false, true), cacheStrategy, maxFileSize, blacklistType, agent).getContent();
        if (b == null) throw new IOException("load == null");
        final File tmp = new File(targetFile.getAbsolutePath() + ".tmp");

        // transaction-safe writing
        final File parent = targetFile.getParentFile();
        if (!parent.exists()) parent.mkdirs();
        FileUtils.copy(b, tmp);
        tmp.renameTo(targetFile);
    }

    public Response load(final Request request, final CacheStrategy cacheStrategy, final BlacklistType blacklistType, ClientIdentification.Agent agent) throws IOException {
        return load(request, cacheStrategy, protocolMaxFileSize(request.url()), blacklistType, agent);
    }

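    /**
     * Load a resource while serializing concurrent loads of the same URL:
     * if another thread is already loading this URL, wait up to five seconds
     * for its 'finish' semaphore, so that this call may afterwards be served
     * faster from the cache. A minimal calling sketch; loader, url,
     * blacklistType and agent are assumed to be provided by the caller:
     *
     *   Response response = loader.load(loader.request(url, true, true),
     *           CacheStrategy.IFFRESH, blacklistType, agent);
     */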
    public Response load(final Request request, final CacheStrategy cacheStrategy, final int maxFileSize, final BlacklistType blacklistType, ClientIdentification.Agent agent) throws IOException {
        Semaphore check = this.loaderSteering.get(request.url());
        if (check != null) {
            // a loading process may be going on for that url
            try { check.tryAcquire(5, TimeUnit.SECONDS); } catch (final InterruptedException e) {}
            // now the process may have terminated and we run a normal loading
            // which may be successful faster because of a cache hit
        }

        this.loaderSteering.put(request.url(), new Semaphore(0));
        try {
            final Response response = loadInternal(request, cacheStrategy, maxFileSize, blacklistType, agent);
            check = this.loaderSteering.remove(request.url());
            if (check != null) check.release(1000);
            return response;
        } catch (final IOException e) {
            // release the semaphore anyway
            check = this.loaderSteering.remove(request.url());
            if (check != null) check.release(1000);
            // Very noisy: ConcurrentLog.logException(e);
            throw new IOException(e);
        }
    }

    /**
     * load a resource from the web, from ftp, from smb or a file
     * @param request the request essentials
     * @param cacheStrategy strategy according to NOCACHE, IFFRESH, IFEXIST, CACHEONLY
     * @return the loaded entity in a Response object
     * @throws IOException
     */
    private Response loadInternal(final Request request, CacheStrategy cacheStrategy, final int maxFileSize, final BlacklistType blacklistType, ClientIdentification.Agent agent) throws IOException {
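        // control flow: check the blacklist first, then try the cache
        // (depending on the cache strategy), then enforce the per-host access
        // delay, and only then dispatch to the protocol-specific loader
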
        // get the protocol of the next URL
        final DigestURI url = request.url();
        if (url.isFile() || url.isSMB()) cacheStrategy = CacheStrategy.NOCACHE; // load just from the file system
        final String protocol = url.getProtocol();
        final String host = url.getHost();
        final CrawlProfile crawlProfile = request.profileHandle() == null ? null : this.sb.crawler.getActive(UTF8.getBytes(request.profileHandle()));

        // check if url is in blacklist
        if (blacklistType != null && host != null && Switchboard.urlBlacklist.isListed(blacklistType, host.toLowerCase(), url.getFile())) {
            this.sb.crawlQueues.errorURL.push(request, crawlProfile, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
            throw new IOException("DISPATCHER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.");
        }

        // check if we have the page in the cache
        if (cacheStrategy != CacheStrategy.NOCACHE && crawlProfile != null) {
            // we have passed a first test if caching is allowed
            // now see if there is a cache entry
            final ResponseHeader cachedResponse = (url.isLocal()) ? null : Cache.getResponseHeader(url.hash());
            if (cachedResponse != null && Cache.hasContent(url.hash())) {
                // yes we have the content

                // create request header values and a response object because we need that
                // in case that we want to return the cached content in the next step
                final RequestHeader requestHeader = new RequestHeader();
                requestHeader.put(HeaderFramework.USER_AGENT, agent.userAgent);
                DigestURI refererURL = null;
                if (request.referrerhash() != null) refererURL = this.sb.getURL(request.referrerhash());
                if (refererURL != null) requestHeader.put(RequestHeader.REFERER, refererURL.toNormalform(true));
                final Response response = new Response(
                        request,
                        requestHeader,
                        cachedResponse,
                        crawlProfile,
                        true,
                        null);

                // check which caching strategy shall be used
                if (cacheStrategy == CacheStrategy.IFEXIST || cacheStrategy == CacheStrategy.CACHEONLY) {
                    // well, just take the cache and don't care about freshness of the content
                    final byte[] content = Cache.getContent(url.hash());
                    if (content != null) {
                        this.log.info("cache hit/useall for: " + url.toNormalform(true));
                        response.setContent(content);
                        return response;
                    }
                }

                // now the cacheStrategy must be CACHE_STRATEGY_IFFRESH, that means we should do a proxy freshness test
                //assert cacheStrategy == CacheStrategy.IFFRESH : "cacheStrategy = " + cacheStrategy;
                if (response.isFreshForProxy()) {
                    final byte[] content = Cache.getContent(url.hash());
                    if (content != null) {
                        this.log.info("cache hit/fresh for: " + url.toNormalform(true));
                        response.setContent(content);
                        return response;
                    }
                }
                this.log.info("cache hit/stale for: " + url.toNormalform(true));
            } else if (cachedResponse != null) {
                this.log.warn("HTCACHE contained response header, but not content for url " + url.toNormalform(true));
            }
        }

        // check case where we want results from the cache exclusively, and never from the internet (offline mode)
        if (cacheStrategy == CacheStrategy.CACHEONLY) {
            // we had a chance to get the content from the cache .. it's over. We don't have it.
            throw new IOException("cache only strategy");
        }

        // now forget about the cache, nothing there. Try to load the content from the internet

        // check access time: this is a double-check (we checked possibly already in the balancer)
        // to make sure that we don't DoS the target by mistake
        if (!url.isLocal()) {
            final Long lastAccess = accessTime.get(host);
            long wait = 0;
            if (lastAccess != null) wait = Math.max(0, agent.minimumDelta + lastAccess.longValue() - System.currentTimeMillis());
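            // e.g. with agent.minimumDelta = 500 ms and a last access 200 ms ago,
            // wait = max(0, 500 - 200) = 300 ms until this host may be contacted again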
            if (wait > 0) {
                // force a sleep here. Instead of just sleeping we clean up the accessTime map
                final long untilTime = System.currentTimeMillis() + wait;
                cleanupAccessTimeTable(untilTime);
                if (System.currentTimeMillis() < untilTime) {
                    long frcdslp = untilTime - System.currentTimeMillis();
                    this.log.info("Forcing sleep of " + frcdslp + " ms for host " + host);
                    try { Thread.sleep(frcdslp); } catch (final InterruptedException ee) {}
                }
            }
        }

        // now it's for sure that we will access the target. Remember the access time
        if (host != null) {
            if (accessTime.size() > accessTimeMaxsize) accessTime.clear(); // prevent a memory leak here
            accessTime.put(host, System.currentTimeMillis());
        }

        // load resource from the internet
        Response response = null;
        if (protocol.equals("http") || protocol.equals("https")) {
            response = this.httpLoader.load(request, crawlProfile, maxFileSize, blacklistType, agent);
        } else if (protocol.equals("ftp")) {
            response = this.ftpLoader.load(request, true);
        } else if (protocol.equals("smb")) {
            response = this.smbLoader.load(request, true);
        } else if (protocol.equals("file")) {
            response = this.fileLoader.load(request, true);
        } else {
            throw new IOException("Unsupported protocol '" + protocol + "' in url " + url);
        }
        if (response == null) {
            throw new IOException("no response (NULL) for url " + url);
        }
        if (response.getContent() == null) {
            throw new IOException("empty response (code " + response.getStatus() + ") for url " + url);
        }

        // we got something. Now check if we want to store that to the cache
        // first check looks if we want to store the content to the cache
        if (crawlProfile == null || !crawlProfile.storeHTCache()) {
            // no caching wanted. That's ok, do not write any message
            return response;
        }
        // second check tells us if the protocol tells us something about caching
        final String storeError = response.shallStoreCacheForCrawler();
        if (storeError == null) {
            try {
                Cache.store(url, response.getResponseHeader(), response.getContent());
            } catch (final IOException e) {
                this.log.warn("cannot write " + response.url() + " to Cache (3): " + e.getMessage(), e);
            }
        } else {
            this.log.warn("cannot write " + response.url() + " to Cache (4): " + storeError);
        }
        return response;
    }

    private int protocolMaxFileSize(final DigestURI url) {
        if (url.isHTTP() || url.isHTTPS())
            return this.sb.getConfigInt("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
        if (url.isFTP())
            return this.sb.getConfigInt("crawler.ftp.maxFileSize", (int) FTPLoader.DEFAULT_MAXFILESIZE);
        if (url.isSMB())
            return this.sb.getConfigInt("crawler.smb.maxFileSize", (int) SMBLoader.DEFAULT_MAXFILESIZE);
        return Integer.MAX_VALUE;
    }

    /**
     * load the url as byte[] content from the web or the cache
     * @param request the request essentials
     * @param cacheStrategy strategy according to NOCACHE, IFFRESH, IFEXIST, CACHEONLY
     * @param blacklistType the blacklist to check the url against, or null
     * @param agent the client identification to be used for the request
     * @return the content as {@link byte[]}
     * @throws IOException
     */
    public byte[] loadContent(final Request request, final CacheStrategy cacheStrategy, BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
        // try to download the resource using the loader
        final Response entry = load(request, cacheStrategy, blacklistType, agent);
        if (entry == null) return null; // not found in web

        // read resource body (if it is there)
        return entry.getContent();
    }

    public Document[] loadDocuments(final Request request, final CacheStrategy cacheStrategy, final int maxFileSize, BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException, Parser.Failure {
        // load resource
        final Response response = load(request, cacheStrategy, maxFileSize, blacklistType, agent);
        final DigestURI url = request.url();
        if (response == null) throw new IOException("no Response for url " + url);

        // if it is still not available, report an error
        if (response.getContent() == null || response.getResponseHeader() == null) throw new IOException("no Content available for url " + url);

        // parse resource
        return response.parse();
    }

    public Document loadDocument(final DigestURI location, final CacheStrategy cachePolicy, BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
        // load resource
        Request request = request(location, true, false);
        final Response response = this.load(request, cachePolicy, blacklistType, agent);
        final DigestURI url = request.url();
        if (response == null) throw new IOException("no Response for url " + url);
        // if it is still not available, report an error
        if (response.getContent() == null || response.getResponseHeader() == null) throw new IOException("no Content available for url " + url);

        // parse resource
        try {
            Document[] documents = response.parse();
            return Document.mergeDocuments(location, response.getMimeType(), documents);
        } catch (final Parser.Failure e) {
            throw new IOException(e.getMessage());
        }
    }

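    // A minimal usage sketch for loadDocument; loader, location, blacklistType
    // and agent are assumed to be provided by the caller:
    //
    //   Document document = loader.loadDocument(location, CacheStrategy.IFEXIST, blacklistType, agent);
    //   String title = document.dc_title();
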
    /**
     * load all links from a resource
     * @param url the url that shall be loaded
     * @param cacheStrategy the cache strategy
     * @param blacklistType the blacklist to check the url against, or null
     * @param agent the client identification to be used for the request
     * @return a map from URLs to the anchor texts of the urls
     * @throws IOException
     */
    public final Map<DigestURI, String> loadLinks(final DigestURI url, final CacheStrategy cacheStrategy, BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
        final Response response = load(request(url, true, false), cacheStrategy, Integer.MAX_VALUE, blacklistType, agent);
        if (response == null) throw new IOException("response == null");
        final ResponseHeader responseHeader = response.getResponseHeader();
        if (response.getContent() == null) throw new IOException("resource == null");
        if (responseHeader == null) throw new IOException("responseHeader == null");

        Document[] documents = null;
        final String supportError = TextParser.supports(url, responseHeader.mime());
        if (supportError != null) throw new IOException("no parser support: " + supportError);
        try {
            documents = TextParser.parseSource(url, responseHeader.mime(), responseHeader.getCharacterEncoding(), response.getContent());
            if (documents == null) throw new IOException("document == null");
        } catch (final Exception e) {
            throw new IOException("parser error: " + e.getMessage());
        }
        return Document.getHyperlinks(documents);
    }

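    // A minimal usage sketch for loadLinks; loader, url, blacklistType and
    // agent are assumed to be provided by the caller:
    //
    //   Map<DigestURI, String> anchors = loader.loadLinks(url, CacheStrategy.IFEXIST, blacklistType, agent);
    //   for (final Map.Entry<DigestURI, String> anchor : anchors.entrySet()) {
    //       System.out.println(anchor.getKey().toNormalform(true) + " -> " + anchor.getValue());
    //   }
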
    public synchronized static void cleanupAccessTimeTable(final long timeout) {
        final Iterator<Map.Entry<String, Long>> i = accessTime.entrySet().iterator();
        Map.Entry<String, Long> e;
        while (i.hasNext()) {
            e = i.next();
            if (System.currentTimeMillis() > timeout) break;
            if (System.currentTimeMillis() - e.getValue().longValue() > 1000) i.remove();
        }
    }

    public void loadIfNotExistBackground(final DigestURI url, final File cache, final int maxFileSize, BlacklistType blacklistType, final ClientIdentification.Agent agent) {
        new Loader(url, cache, maxFileSize, CacheStrategy.IFEXIST, blacklistType, agent).start();
    }

    public void loadIfNotExistBackground(final DigestURI url, final int maxFileSize, BlacklistType blacklistType, final ClientIdentification.Agent agent) {
        new Loader(url, null, maxFileSize, CacheStrategy.IFEXIST, blacklistType, agent).start();
    }

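    /**
     * Background worker that loads a URL with the IFEXIST cache strategy and
     * optionally copies the fetched content to a cache file; load errors are
     * silently ignored since this is a best-effort background fetch.
     */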
    private class Loader extends Thread {

        private final DigestURI url;
        private final File cache;
        private final int maxFileSize;
        private final CacheStrategy cacheStrategy;
        private final BlacklistType blacklistType;
        private final ClientIdentification.Agent agent;

        public Loader(final DigestURI url, final File cache, final int maxFileSize, final CacheStrategy cacheStrategy, BlacklistType blacklistType, final ClientIdentification.Agent agent) {
            this.url = url;
            this.cache = cache;
            this.maxFileSize = maxFileSize;
            this.cacheStrategy = cacheStrategy;
            this.blacklistType = blacklistType;
            this.agent = agent;
        }

        @Override
        public void run() {
            if (this.cache != null && this.cache.exists()) return;
            try {
                // load from the net
                final Response response = load(request(this.url, false, true), this.cacheStrategy, this.maxFileSize, this.blacklistType, this.agent);
                final byte[] b = response.getContent();
                if (this.cache != null) FileUtils.copy(b, this.cache);
            } catch (final MalformedURLException e) {} catch (final IOException e) {}
        }
    }
}