2009-09-23 23:26:14 +02:00
// Response.java
2008-08-19 16:10:40 +02:00
// (C) 2008 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 19.08.2008 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
2009-09-23 23:26:14 +02:00
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
2008-08-19 16:10:40 +02:00
//
// LICENSE
2011-09-07 12:08:57 +02:00
//
2008-08-19 16:10:40 +02:00
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
2012-09-21 15:48:16 +02:00
package net.yacy.crawler.retrieval ;
2008-08-19 16:10:40 +02:00
2016-01-05 23:37:05 +01:00
import java.nio.charset.StandardCharsets ;
2008-08-19 16:10:40 +02:00
import java.util.Date ;
2017-06-27 06:42:33 +02:00
import java.util.Locale ;
2008-08-19 16:10:40 +02:00
2012-11-21 18:46:49 +01:00
import net.yacy.cora.document.analysis.Classification ;
2013-09-15 00:30:23 +02:00
import net.yacy.cora.document.encoding.ASCII ;
import net.yacy.cora.document.encoding.UTF8 ;
import net.yacy.cora.document.id.DigestURL ;
import net.yacy.cora.document.id.MultiProtocolURL ;
2010-08-23 14:32:02 +02:00
import net.yacy.cora.protocol.HeaderFramework ;
import net.yacy.cora.protocol.RequestHeader ;
import net.yacy.cora.protocol.ResponseHeader ;
2012-05-21 17:52:30 +02:00
import net.yacy.cora.util.NumberTools ;
2012-09-21 15:48:16 +02:00
import net.yacy.crawler.data.CrawlProfile ;
import net.yacy.crawler.data.ResultURLs.EventOrigin ;
2010-06-22 14:28:53 +02:00
import net.yacy.document.Document ;
2010-06-29 21:20:45 +02:00
import net.yacy.document.Parser ;
2009-10-20 00:34:44 +02:00
import net.yacy.document.TextParser ;
2015-01-30 13:20:56 +01:00
import net.yacy.document.VocabularyScraper ;
2014-11-24 20:28:52 +01:00
import net.yacy.search.Switchboard ;
2008-08-19 16:10:40 +02:00
2009-07-15 23:07:46 +02:00
public class Response {
2011-09-07 12:08:57 +02:00
2008-08-19 16:10:40 +02:00
// doctypes:
public static final char DT_PDFPS = 'p' ;
public static final char DT_TEXT = 't' ;
public static final char DT_HTML = 'h' ;
public static final char DT_DOC = 'd' ;
public static final char DT_IMAGE = 'i' ;
public static final char DT_MOVIE = 'm' ;
public static final char DT_FLASH = 'f' ;
public static final char DT_SHARE = 's' ;
public static final char DT_AUDIO = 'a' ;
public static final char DT_BINARY = 'b' ;
public static final char DT_UNKNOWN = 'u' ;
// the class objects
2009-07-16 01:08:35 +02:00
private final Request request ;
2010-05-09 00:09:36 +02:00
private final RequestHeader requestHeader ;
private final ResponseHeader responseHeader ;
2011-08-02 01:31:08 +02:00
private final CrawlProfile profile ;
2009-07-17 15:59:21 +02:00
private byte [ ] content ;
private int status ; // tracker indexing status, see status defs below
2012-05-21 03:03:47 +02:00
private final boolean fromCache ;
2015-11-20 19:35:39 +01:00
/** Maximum file size to put in cache for crawler */
public static final long CRAWLER_MAX_SIZE_TO_CACHE = 10 * 1024L * 1024L ;
2011-09-07 12:08:57 +02:00
2014-06-26 12:56:33 +02:00
/ * *
* doctype calculation by file extension
* TODO : this must be enhanced with a more generic way of configuration
* @param ext
* @return a character denoting the file type
* /
public static char docTypeExt ( final String ext ) {
if ( ext = = null ) return DT_UNKNOWN ;
if ( ext . equals ( " gif " ) ) return DT_IMAGE ;
if ( ext . equals ( " ico " ) ) return DT_IMAGE ;
if ( ext . equals ( " bmp " ) ) return DT_IMAGE ;
if ( ext . equals ( " jpg " ) ) return DT_IMAGE ;
if ( ext . equals ( " jpeg " ) ) return DT_IMAGE ;
if ( ext . equals ( " png " ) ) return DT_IMAGE ;
if ( ext . equals ( " tif " ) ) return DT_IMAGE ;
if ( ext . equals ( " tiff " ) ) return DT_IMAGE ;
if ( ext . equals ( " htm " ) ) return DT_HTML ;
if ( ext . equals ( " html " ) ) return DT_HTML ;
if ( ext . equals ( " txt " ) ) return DT_TEXT ;
if ( ext . equals ( " doc " ) ) return DT_DOC ;
if ( ext . equals ( " rtf " ) ) return DT_DOC ;
if ( ext . equals ( " pdf " ) ) return DT_PDFPS ;
if ( ext . equals ( " ps " ) ) return DT_PDFPS ;
if ( ext . equals ( " mp3 " ) ) return DT_AUDIO ;
if ( ext . equals ( " aac " ) ) return DT_AUDIO ;
if ( ext . equals ( " m4a " ) ) return DT_AUDIO ;
if ( ext . equals ( " ogg " ) ) return DT_AUDIO ;
if ( ext . equals ( " wav " ) ) return DT_AUDIO ;
if ( ext . equals ( " wma " ) ) return DT_AUDIO ;
if ( ext . equals ( " avi " ) ) return DT_MOVIE ;
if ( ext . equals ( " mov " ) ) return DT_MOVIE ;
if ( ext . equals ( " qt " ) ) return DT_MOVIE ;
if ( ext . equals ( " mpg " ) ) return DT_MOVIE ;
if ( ext . equals ( " mp4 " ) ) return DT_MOVIE ;
if ( ext . equals ( " m4v " ) ) return DT_MOVIE ;
if ( ext . equals ( " mkv " ) ) return DT_MOVIE ;
if ( ext . equals ( " md5 " ) ) return DT_SHARE ;
if ( ext . equals ( " mpeg " ) ) return DT_MOVIE ;
if ( ext . equals ( " asf " ) ) return DT_FLASH ;
return DT_UNKNOWN ;
}
/ * *
* doctype calculation based on file extensions ; this is the url wrapper
* @param url
* @return a character denoting the file type
* /
2013-09-15 00:30:23 +02:00
public static char docType ( final MultiProtocolURL url ) {
String ext = MultiProtocolURL . getFileExtension ( url . getFileName ( ) ) ;
2012-08-05 15:49:27 +02:00
if ( ext = = null ) return DT_UNKNOWN ;
2014-06-26 12:56:33 +02:00
return docTypeExt ( ext ) ;
2008-08-19 16:10:40 +02:00
}
2014-06-26 12:56:33 +02:00
/ * *
* doctype calculation based on the mime type
* @param mime
* @return a character denoting the file type
* /
2008-08-19 16:10:40 +02:00
public static char docType ( final String mime ) {
// serverLog.logFinest("PLASMA", "docType mime=" + mime);
char doctype = DT_UNKNOWN ;
if ( mime = = null ) doctype = DT_UNKNOWN ;
else if ( mime . startsWith ( " image/ " ) ) doctype = DT_IMAGE ;
else if ( mime . endsWith ( " /gif " ) ) doctype = DT_IMAGE ;
else if ( mime . endsWith ( " /jpeg " ) ) doctype = DT_IMAGE ;
else if ( mime . endsWith ( " /png " ) ) doctype = DT_IMAGE ;
else if ( mime . endsWith ( " /html " ) ) doctype = DT_HTML ;
else if ( mime . endsWith ( " /rtf " ) ) doctype = DT_DOC ;
else if ( mime . endsWith ( " /pdf " ) ) doctype = DT_PDFPS ;
else if ( mime . endsWith ( " /octet-stream " ) ) doctype = DT_BINARY ;
else if ( mime . endsWith ( " /x-shockwave-flash " ) ) doctype = DT_FLASH ;
else if ( mime . endsWith ( " /msword " ) ) doctype = DT_DOC ;
else if ( mime . endsWith ( " /mspowerpoint " ) ) doctype = DT_DOC ;
else if ( mime . endsWith ( " /postscript " ) ) doctype = DT_PDFPS ;
else if ( mime . startsWith ( " text/ " ) ) doctype = DT_TEXT ;
else if ( mime . startsWith ( " audio/ " ) ) doctype = DT_AUDIO ;
else if ( mime . startsWith ( " video/ " ) ) doctype = DT_MOVIE ;
return doctype ;
}
2011-09-07 12:08:57 +02:00
2014-06-26 12:56:33 +02:00
/ * *
* reverse mime type calculation ; this is just a heuristic
* @param ext
* @param doctype
* @return a mime type string
* /
2012-08-31 10:30:43 +02:00
public static String [ ] doctype2mime ( String ext , char doctype ) {
if ( doctype = = DT_PDFPS ) return new String [ ] { " application/pdf " } ;
if ( doctype = = DT_HTML ) return new String [ ] { " text/html " } ;
if ( doctype = = DT_DOC ) return new String [ ] { " application/msword " } ;
if ( doctype = = DT_FLASH ) return new String [ ] { " application/x-shockwave-flash " } ;
if ( doctype = = DT_SHARE ) return new String [ ] { " text/plain " } ;
if ( doctype = = DT_BINARY ) return new String [ ] { " application/octet-stream " } ;
2012-08-16 17:49:35 +02:00
String mime = Classification . ext2mime ( ext ) ;
int p = mime . indexOf ( '/' ) ;
2012-08-31 10:30:43 +02:00
if ( p < 0 ) return new String [ ] { mime } ;
if ( doctype = = DT_TEXT ) return new String [ ] { " text " + mime . substring ( p ) } ;
if ( doctype = = DT_IMAGE ) return new String [ ] { " image " + mime . substring ( p ) } ;
if ( doctype = = DT_AUDIO ) return new String [ ] { " audio " + mime . substring ( p ) } ;
if ( doctype = = DT_MOVIE ) return new String [ ] { " video " + mime . substring ( p ) } ;
return new String [ ] { mime } ;
2012-08-05 15:49:27 +02:00
}
2012-08-16 17:49:35 +02:00
2009-07-17 15:59:21 +02:00
// indexing queue processing states, stored in the status field (see updateStatus/getStatus)
public static final int QUEUE_STATE_FRESH             = 0;
public static final int QUEUE_STATE_PARSING           = 1;
public static final int QUEUE_STATE_CONDENSING        = 2;
public static final int QUEUE_STATE_STRUCTUREANALYSIS = 3;
public static final int QUEUE_STATE_INDEXSTORAGE      = 4;
public static final int QUEUE_STATE_FINISHED          = 5;
2011-09-07 12:08:57 +02:00
2009-07-15 23:07:46 +02:00
/**
 * Create a response object from a loaded resource.
 * @param request the crawl request that produced this response
 * @param requestHeader the header sent with the request; may be null for surrogates
 * @param responseHeader the header received with the response; may be null for surrogates
 * @param profile the crawl profile under which the resource was fetched
 * @param fromCache true if the content was taken from the local cache
 * @param content the raw response body; may be null
 */
public Response(
        final Request request,
        final RequestHeader requestHeader,
        final ResponseHeader responseHeader,
        final CrawlProfile profile,
        final boolean fromCache,
        final byte[] content) {
    this.request = request;
    // request and response headers may be zero in case that we process surrogates
    this.requestHeader = requestHeader;
    this.responseHeader = responseHeader;
    this.profile = profile;
    this.status = QUEUE_STATE_FRESH;
    this.content = content;
    this.fromCache = fromCache;
    // repair a Content-Length header that is missing or smaller than the actual content.
    // getContentLengthLong() is used instead of Integer.parseInt to avoid a
    // NumberFormatException on malformed values or lengths beyond Integer.MAX_VALUE
    // (> 2 GiB resources); a missing header yields -1 which also triggers the repair.
    if (this.responseHeader != null && content != null
            && this.responseHeader.getContentLengthLong() <= content.length) {
        this.responseHeader.put(HeaderFramework.CONTENT_LENGTH, Integer.toString(content.length)); // repair length
    }
}
2011-09-07 12:08:57 +02:00
2012-04-24 16:07:03 +02:00
/**
 * create a 'virtual' response that is composed using crawl details from the request object
 * this is used when the NOLOAD queue is processed
 * @param request the crawl request
 * @param profile the crawl profile
 */
public Response(final Request request, final CrawlProfile profile) {
    this.request = request;
    // the request header may be zero in case that we process surrogates
    this.requestHeader = null;
    this.responseHeader = new ResponseHeader(200);
    // tell the parser how to handle the content
    this.responseHeader.put(HeaderFramework.CONTENT_TYPE,
            Classification.ext2mime(MultiProtocolURL.getFileExtension(request.url().getFileName()), "text/plain"));
    this.profile = profile;
    this.status = QUEUE_STATE_FRESH;
    // use the anchor name or, if missing, the url tokens as pseudo content
    this.content = request.name().length() > 0 ? UTF8.getBytes(request.name()) : UTF8.getBytes(request.url().toTokens());
    this.fromCache = true;
    // 'virtual' length, shows that the resource was not loaded
    // (the former null check on responseHeader was dead code: the field is assigned above)
    this.responseHeader.put(HeaderFramework.CONTENT_LENGTH, "0");
}
2014-06-12 05:23:26 +02:00
2009-07-17 15:59:21 +02:00
public void updateStatus ( final int newStatus ) {
this . status = newStatus ;
}
2017-06-02 01:46:06 +02:00
/ * *
* @return the original request that produced this response
* /
public Request getRequest ( ) {
return request ;
}
2011-09-07 12:08:57 +02:00
2009-07-23 23:31:51 +02:00
public ResponseHeader getResponseHeader ( ) {
return this . responseHeader ;
}
2017-06-02 01:46:06 +02:00
public RequestHeader getRequestHeader ( ) {
return this . requestHeader ;
}
2011-09-07 12:08:57 +02:00
2012-05-21 03:03:47 +02:00
public boolean fromCache ( ) {
return this . fromCache ;
}
2009-07-17 15:59:21 +02:00
public int getStatus ( ) {
return this . status ;
}
2011-09-07 12:08:57 +02:00
2008-08-19 16:10:40 +02:00
public String name ( ) {
// the anchor name; can be either the text inside the anchor tag or the
// page description after loading of the page
2009-07-16 01:08:35 +02:00
return this . request . name ( ) ;
2008-08-19 16:10:40 +02:00
}
2013-09-15 00:30:23 +02:00
public DigestURL url ( ) {
2009-07-16 01:08:35 +02:00
return this . request . url ( ) ;
2008-08-19 16:10:40 +02:00
}
2011-09-07 12:08:57 +02:00
2008-08-25 20:11:47 +02:00
public char docType ( ) {
char doctype = docType ( getMimeType ( ) ) ;
2009-07-16 01:08:35 +02:00
if ( doctype = = DT_UNKNOWN ) doctype = docType ( url ( ) ) ;
2008-08-25 20:11:47 +02:00
return doctype ;
}
2008-08-19 16:10:40 +02:00
2015-12-20 15:49:24 +01:00
/ * *
* Get respons header last modified date
* if missing the first seen date or current date
* @return valid date always ! = null
* /
2008-08-19 16:10:40 +02:00
public Date lastModified ( ) {
2008-08-25 20:11:47 +02:00
Date docDate = null ;
2011-09-07 12:08:57 +02:00
if ( this . responseHeader ! = null ) {
2015-12-20 15:49:24 +01:00
docDate = this . responseHeader . lastModified ( ) ; // is always != null
2008-08-25 20:11:47 +02:00
}
2011-09-07 12:08:57 +02:00
if ( docDate = = null & & this . request ! = null ) docDate = this . request . appdate ( ) ;
2015-04-15 13:17:23 +02:00
if ( docDate = = null ) docDate = new Date ( ) ;
2011-09-07 12:08:57 +02:00
2008-08-25 20:11:47 +02:00
return docDate ;
2008-08-19 16:10:40 +02:00
}
2011-09-07 12:08:57 +02:00
2010-08-31 17:47:47 +02:00
public CrawlProfile profile ( ) {
2008-08-19 16:10:40 +02:00
return this . profile ;
}
2010-04-08 02:11:32 +02:00
public byte [ ] initiator ( ) {
2009-07-16 01:08:35 +02:00
return this . request . initiator ( ) ;
2008-08-19 16:10:40 +02:00
}
public boolean proxy ( ) {
return initiator ( ) = = null ;
}
public long size ( ) {
2011-09-05 14:21:25 +02:00
if ( this . responseHeader ! = null & & this . responseHeader . getContentLengthLong ( ) ! = - 1 ) {
2008-10-16 23:24:09 +02:00
// take the size from the response header
2010-12-28 18:25:39 +01:00
return this . responseHeader . getContentLengthLong ( ) ;
2008-10-16 23:24:09 +02:00
}
2010-12-28 03:15:22 +01:00
if ( this . content ! = null ) return this . content . length ;
2008-10-16 23:24:09 +02:00
// the size is unknown
return - 1 ;
2008-08-19 16:10:40 +02:00
}
public int depth ( ) {
2009-07-16 01:08:35 +02:00
return this . request . depth ( ) ;
2008-08-19 16:10:40 +02:00
}
2009-07-16 01:08:35 +02:00
public void setContent ( final byte [ ] data ) {
this . content = data ;
2014-03-11 09:51:04 +01:00
if ( this . responseHeader ! = null & & this . content ! = null & & Integer . parseInt ( this . responseHeader . get ( HeaderFramework . CONTENT_LENGTH , " 0 " ) ) < = content . length ) {
this . responseHeader . put ( HeaderFramework . CONTENT_LENGTH , Integer . toString ( content . length ) ) ; // repair length
}
2008-08-19 16:10:40 +02:00
}
2009-07-16 01:08:35 +02:00
public byte [ ] getContent ( ) {
return this . content ;
2008-08-19 16:10:40 +02:00
}
// the following three methods for cache read/write granting shall be as loose
// as possible but also as strict as necessary to enable caching of most items
/ * *
* @return NULL if the answer is TRUE , in case of FALSE , the reason as
* String is returned
* /
2009-10-01 15:08:19 +02:00
public String shallStoreCacheForProxy ( ) {
2008-08-19 16:10:40 +02:00
2011-09-07 12:08:57 +02:00
final String crawlerReason = shallStoreCacheForCrawler ( ) ;
2009-10-01 15:08:19 +02:00
if ( crawlerReason ! = null ) return crawlerReason ;
2011-09-07 12:08:57 +02:00
2008-08-19 16:10:40 +02:00
// check profile (disabled: we will check this in the plasmaSwitchboard)
// if (!this.profile.storeHTCache()) { return "storage_not_wanted"; }
// decide upon header information if a specific file should be stored to
// the cache or not
// if the storage was requested by prefetching, the request map is null
// -CGI access in request
// CGI access makes the page very individual, and therefore not usable
// in caches
2011-09-07 12:08:57 +02:00
if ( url ( ) . isPOST ( ) & & this . profile ! = null & & ! this . profile . crawlingQ ( ) ) {
2008-08-19 16:10:40 +02:00
return " dynamic_post " ;
}
2011-09-07 12:08:57 +02:00
2013-09-15 00:30:23 +02:00
if ( MultiProtocolURL . isCGI ( MultiProtocolURL . getFileExtension ( url ( ) . getFileName ( ) ) ) ) {
2008-08-19 16:10:40 +02:00
return " dynamic_cgi " ;
}
2011-09-07 12:08:57 +02:00
if ( url ( ) . isLocal ( ) ) {
2009-07-23 23:31:51 +02:00
return " local_URL_no_cache_needed " ;
}
2011-09-07 12:08:57 +02:00
if ( this . responseHeader ! = null ) {
2008-08-25 20:11:47 +02:00
// -if-modified-since in request
// we do not care about if-modified-since, because this case only occurres if the
// cache file does not exist, and we need as much info as possible for the indexing
// -cookies in request
// we do not care about cookies, because that would prevent loading more pages
// from one domain once a request resulted in a client-side stored cookie
// -set-cookie in response
// we do not care about cookies in responses, because that info comes along
// any/many pages from a server and does not express the validity of the page
// in modes of life-time/expiration or individuality
// -pragma in response
// if we have a pragma non-cache, we don't cache. usually if this is wanted from
// the server, it makes sense
2011-09-07 12:08:57 +02:00
String cacheControl = this . responseHeader . get ( HeaderFramework . PRAGMA ) ;
2014-12-19 17:37:58 +01:00
if ( cacheControl ! = null & & cacheControl . trim ( ) . toUpperCase ( ) . contains ( " NO-CACHE " ) ) { return " controlled_no_cache " ; }
2008-08-25 20:11:47 +02:00
// -expires in response
// we do not care about expires, because at the time this is called the data is
// obvious valid and that header info is used in the indexing later on
// -cache-control in response
// the cache-control has many value options.
2011-09-07 12:08:57 +02:00
cacheControl = this . responseHeader . get ( HeaderFramework . CACHE_CONTROL ) ;
2008-08-25 20:11:47 +02:00
if ( cacheControl ! = null ) {
cacheControl = cacheControl . trim ( ) . toUpperCase ( ) ;
if ( cacheControl . startsWith ( " MAX-AGE= " ) ) {
// we need also the load date
2011-09-07 12:08:57 +02:00
final Date date = this . responseHeader . date ( ) ;
2008-08-25 20:11:47 +02:00
if ( date = = null ) return " stale_no_date_given_in_response " ;
try {
2012-05-21 13:40:46 +02:00
final long ttl = 1000 * NumberTools . parseLongDecSubstring ( cacheControl , 8 ) ; // milliseconds to live
2015-04-15 13:17:23 +02:00
if ( System . currentTimeMillis ( ) - date . getTime ( ) > ttl ) {
2008-08-25 20:11:47 +02:00
//System.out.println("***not indexed because cache-control");
return " stale_expired " ;
}
} catch ( final Exception e ) {
return " stale_error_ " + e . getMessage ( ) + " ) " ;
}
}
}
2008-08-19 16:10:40 +02:00
}
return null ;
}
2009-10-01 15:08:19 +02:00
public String shallStoreCacheForCrawler ( ) {
// check storage size: all files will be handled in RAM before storage, so they must not exceed
// a given size, which we consider as 1MB
2015-11-20 19:35:39 +01:00
if ( size ( ) > CRAWLER_MAX_SIZE_TO_CACHE ) return " too_large_for_caching_ " + size ( ) ;
2011-09-07 12:08:57 +02:00
2009-10-01 15:08:19 +02:00
// check status code
if ( ! validResponseStatus ( ) ) {
2012-06-25 18:17:31 +02:00
return " bad_status_ " + this . responseHeader . getStatusCode ( ) ;
2009-10-01 15:08:19 +02:00
}
2011-09-07 12:08:57 +02:00
if ( this . requestHeader ! = null ) {
2009-10-01 15:08:19 +02:00
// -authorization cases in request
// authorization makes pages very individual, and therefore we cannot use the
// content in the cache
2011-09-07 12:08:57 +02:00
if ( this . requestHeader . containsKey ( RequestHeader . AUTHORIZATION ) ) { return " personalized " ; }
2009-10-01 15:08:19 +02:00
// -ranges in request and response
// we do not cache partial content
2011-09-07 12:08:57 +02:00
if ( this . requestHeader . containsKey ( HeaderFramework . RANGE ) ) { return " partial_request " ; }
2009-10-01 15:08:19 +02:00
}
2011-09-07 12:08:57 +02:00
if ( this . responseHeader ! = null ) {
2009-10-01 15:08:19 +02:00
// -ranges in request and response
2011-09-07 12:08:57 +02:00
// we do not cache partial content
if ( this . responseHeader . containsKey ( HeaderFramework . CONTENT_RANGE ) ) { return " partial_response " ; }
2009-10-01 15:08:19 +02:00
}
return null ;
}
2011-09-07 12:08:57 +02:00
2008-08-19 16:10:40 +02:00
/ * *
* decide upon header information if a specific file should be taken from
* the cache or not
2011-09-07 12:08:57 +02:00
*
2008-08-19 16:10:40 +02:00
* @return whether the file should be taken from the cache
* /
2009-07-23 23:31:51 +02:00
public boolean isFreshForProxy ( ) {
2008-08-19 16:10:40 +02:00
2014-11-24 20:28:52 +01:00
if ( Switchboard . getSwitchboard ( ) . getConfigBool ( " proxyAlwaysFresh " , false ) ) return true ;
2008-08-19 16:10:40 +02:00
// -CGI access in request
// CGI access makes the page very individual, and therefore not usable
// in caches
2011-09-07 12:08:57 +02:00
if ( url ( ) . isPOST ( ) ) {
2008-08-19 16:10:40 +02:00
return false ;
}
2013-09-15 00:30:23 +02:00
if ( MultiProtocolURL . isCGI ( MultiProtocolURL . getFileExtension ( url ( ) . getFileName ( ) ) ) ) {
2008-08-19 16:10:40 +02:00
return false ;
}
2008-08-25 20:11:47 +02:00
String cacheControl ;
2011-09-07 12:08:57 +02:00
if ( this . requestHeader ! = null ) {
2008-08-25 20:11:47 +02:00
// -authorization cases in request
2011-09-07 12:08:57 +02:00
if ( this . requestHeader . containsKey ( RequestHeader . AUTHORIZATION ) ) { return false ; }
2008-08-25 20:11:47 +02:00
// -ranges in request
// we do not cache partial content
2011-09-07 12:08:57 +02:00
if ( this . requestHeader . containsKey ( HeaderFramework . RANGE ) ) { return false ; }
2008-08-25 20:11:47 +02:00
// if the client requests a un-cached copy of the resource ...
2011-09-07 12:08:57 +02:00
cacheControl = this . requestHeader . get ( HeaderFramework . PRAGMA ) ;
2014-12-19 17:37:58 +01:00
if ( cacheControl ! = null & & cacheControl . trim ( ) . toUpperCase ( ) . contains ( " NO-CACHE " ) ) { return false ; }
2008-08-25 20:11:47 +02:00
2011-09-07 12:08:57 +02:00
cacheControl = this . requestHeader . get ( HeaderFramework . CACHE_CONTROL ) ;
2008-08-25 20:11:47 +02:00
if ( cacheControl ! = null ) {
cacheControl = cacheControl . trim ( ) . toUpperCase ( ) ;
2014-12-19 17:37:58 +01:00
if ( cacheControl . contains ( " NO-CACHE " ) | | cacheControl . startsWith ( " MAX-AGE=0 " ) ) { return false ; }
2008-08-25 20:11:47 +02:00
}
// -if-modified-since in request
// The entity has to be transferred only if it has
// been modified since the date given by the If-Modified-Since header.
2011-09-07 12:08:57 +02:00
if ( this . requestHeader . containsKey ( RequestHeader . IF_MODIFIED_SINCE ) ) {
2008-08-25 20:11:47 +02:00
// checking this makes only sense if the cached response contains
// a Last-Modified field. If the field does not exist, we go the safe way
2011-09-07 12:08:57 +02:00
if ( ! this . responseHeader . containsKey ( HeaderFramework . LAST_MODIFIED ) ) { return false ; }
2008-08-25 20:11:47 +02:00
// parse date
Date d1 , d2 ;
2015-04-15 13:17:23 +02:00
d2 = this . responseHeader . lastModified ( ) ; if ( d2 = = null ) { d2 = new Date ( ) ; }
d1 = this . requestHeader . ifModifiedSince ( ) ; if ( d1 = = null ) { d1 = new Date ( ) ; }
2008-08-25 20:11:47 +02:00
// finally, we shall treat the cache as stale if the modification time is after the if-.. time
if ( d2 . after ( d1 ) ) { return false ; }
}
final String mimeType = getMimeType ( ) ;
2009-07-19 23:59:29 +02:00
if ( ! Classification . isPictureMime ( mimeType ) ) {
2008-08-25 20:11:47 +02:00
// -cookies in request
// unfortunately, we should reload in case of a cookie
// but we think that pictures can still be considered as fresh
// -set-cookie in cached response
// this is a similar case as for COOKIE.
2011-09-07 12:08:57 +02:00
if ( this . requestHeader . containsKey ( RequestHeader . COOKIE ) | |
this . responseHeader . containsKey ( HeaderFramework . SET_COOKIE ) | |
this . responseHeader . containsKey ( HeaderFramework . SET_COOKIE2 ) ) {
2008-08-25 20:11:47 +02:00
return false ; // too strong
}
}
2008-08-19 16:10:40 +02:00
}
2011-09-07 12:08:57 +02:00
if ( this . responseHeader ! = null ) {
2008-08-25 20:11:47 +02:00
// -pragma in cached response
// logically, we would not need to care about no-cache pragmas in cached response headers,
// because they cannot exist since they are not written to the cache.
// So this IF should always fail..
2011-09-07 12:08:57 +02:00
cacheControl = this . responseHeader . get ( HeaderFramework . PRAGMA ) ;
2014-12-19 17:37:58 +01:00
if ( cacheControl ! = null & & cacheControl . trim ( ) . toUpperCase ( ) . contains ( " NO-CACHE " ) ) { return false ; }
2011-09-07 12:08:57 +02:00
2008-08-25 20:11:47 +02:00
// see for documentation also:
// http://www.web-caching.com/cacheability.html
// http://vancouver-webpages.com/CacheNow/
2011-09-07 12:08:57 +02:00
2008-08-25 20:11:47 +02:00
// look for freshnes information
// if we don't have any freshnes indication, we treat the file as stale.
// no handle for freshness control:
2011-09-07 12:08:57 +02:00
2008-08-25 20:11:47 +02:00
// -expires in cached response
// the expires value gives us a very easy hint when the cache is stale
2011-09-07 12:08:57 +02:00
final Date expires = this . responseHeader . expires ( ) ;
2015-04-15 13:17:23 +02:00
final Date now = new Date ( ) ;
2008-08-25 20:11:47 +02:00
if ( expires ! = null ) {
// System.out.println("EXPIRES-TEST: expires=" + expires + ", NOW=" + serverDate.correctedGMTDate() + ", url=" + url);
2015-04-15 13:17:23 +02:00
if ( expires . before ( now ) ) { return false ; }
2008-08-25 20:11:47 +02:00
}
2011-09-07 12:08:57 +02:00
final Date lastModified = this . responseHeader . lastModified ( ) ;
cacheControl = this . responseHeader . get ( HeaderFramework . CACHE_CONTROL ) ;
2008-08-25 20:11:47 +02:00
if ( cacheControl = = null & & lastModified = = null & & expires = = null ) { return false ; }
2011-09-07 12:08:57 +02:00
2008-08-25 20:11:47 +02:00
// -lastModified in cached response
// we can apply a TTL (Time To Live) heuristic here. We call the time delta between the last read
// of the file and the last modified date as the age of the file. If we consider the file as
// middel-aged then, the maximum TTL would be cache-creation plus age.
// This would be a TTL factor of 100% we want no more than 10% TTL, so that a 10 month old cache
// file may only be treated as fresh for one more month, not more.
2011-09-07 12:08:57 +02:00
Date date = this . responseHeader . date ( ) ;
2008-08-25 20:11:47 +02:00
if ( lastModified ! = null ) {
2015-04-15 13:17:23 +02:00
if ( date = = null ) { date = now ; }
2008-08-25 20:11:47 +02:00
final long age = date . getTime ( ) - lastModified . getTime ( ) ;
if ( age < 0 ) { return false ; }
// TTL (Time-To-Live) is age/10 = (d2.getTime() - d1.getTime()) / 10
// the actual living-time is serverDate.correctedGMTDate().getTime() - d2.getTime()
// therefore the cache is stale, if serverDate.correctedGMTDate().getTime() - d2.getTime() > age/10
2015-04-15 13:17:23 +02:00
if ( now . getTime ( ) - date . getTime ( ) > age / 10 ) { return false ; }
2008-08-25 20:11:47 +02:00
}
2011-09-07 12:08:57 +02:00
2008-08-25 20:11:47 +02:00
// -cache-control in cached response
// the cache-control has many value options.
if ( cacheControl ! = null ) {
cacheControl = cacheControl . trim ( ) . toUpperCase ( ) ;
2014-12-19 17:37:58 +01:00
if ( cacheControl . contains ( " PRIVATE " ) | |
cacheControl . contains ( " NO-CACHE " ) | |
cacheControl . contains ( " NO-STORE " ) ) {
2008-08-25 20:11:47 +02:00
// easy case
return false ;
// } else if (cacheControl.startsWith("PUBLIC")) {
// // ok, do nothing
} else if ( cacheControl . startsWith ( " MAX-AGE= " ) ) {
// we need also the load date
if ( date = = null ) { return false ; }
try {
2012-05-21 13:40:46 +02:00
final long ttl = 1000 * NumberTools . parseLongDecSubstring ( cacheControl , 8 ) ; // milliseconds to live
2015-04-15 13:17:23 +02:00
if ( now . getTime ( ) - date . getTime ( ) > ttl ) {
2008-08-25 20:11:47 +02:00
return false ;
}
} catch ( final Exception e ) {
return false ;
}
}
}
}
2011-09-07 12:08:57 +02:00
2008-08-19 16:10:40 +02:00
return true ;
}
2011-09-07 12:08:57 +02:00
2009-07-17 15:59:21 +02:00
/ * *
* decide upon header information if a specific file should be indexed
* this method returns null if the answer is ' YES ' !
* if the answer is ' NO ' ( do not index ) , it returns a string with the reason
* to reject the crawling demand in clear text
2011-09-07 12:08:57 +02:00
*
2009-07-17 15:59:21 +02:00
* This function is used by plasmaSwitchboard # processResourceStack
* /
public final String shallIndexCacheForProxy ( ) {
if ( profile ( ) = = null ) {
return " shallIndexCacheForProxy: profile() is null ! " ;
}
// check profile
if ( ! profile ( ) . indexText ( ) & & ! profile ( ) . indexMedia ( ) ) {
2012-10-31 14:08:33 +01:00
return " indexing not allowed - indexText and indexMedia not set (for proxy = " + this . profile . collectionName ( ) + " ) " ;
2009-07-17 15:59:21 +02:00
}
// -CGI access in request
// CGI access makes the page very individual, and therefore not usable in caches
if ( ! profile ( ) . crawlingQ ( ) ) {
if ( url ( ) . isPOST ( ) ) {
return " Dynamic_(POST) " ;
}
2013-09-15 00:30:23 +02:00
if ( MultiProtocolURL . isCGI ( MultiProtocolURL . getFileExtension ( url ( ) . getFileName ( ) ) ) ) {
2009-07-17 15:59:21 +02:00
return " Dynamic_(CGI) " ;
}
}
// -authorization cases in request
// we checked that in shallStoreCache
// -ranges in request
// we checked that in shallStoreCache
// -cookies in request
// unfortunately, we cannot index pages which have been requested with a cookie
// because the returned content may be special for the client
if ( requestWithCookie ( ) ) {
// System.out.println("***not indexed because cookie");
return " Dynamic_(Requested_With_Cookie) " ;
}
2011-09-07 12:08:57 +02:00
if ( this . responseHeader ! = null ) {
2009-07-17 15:59:21 +02:00
// -set-cookie in response
// the set-cookie from the server does not indicate that the content is special
2011-09-07 12:08:57 +02:00
// thus we do not care about it here for indexing
2015-12-20 15:49:24 +01:00
final String parserError = TextParser . supportsMime ( this . responseHeader . getContentType ( ) ) ;
2009-07-17 15:59:21 +02:00
if ( parserError ! = null ) {
return " Media_Content, no parser: " + parserError ;
}
// -if-modified-since in request
// if the page is fresh at the very moment we can index it
2013-09-30 02:50:53 +02:00
final Date ifModifiedSince = this . ifModifiedSince ( ) ;
2015-04-15 13:17:23 +02:00
final Date now = new Date ( ) ;
2011-09-07 12:08:57 +02:00
if ( ( ifModifiedSince ! = null ) & & ( this . responseHeader . containsKey ( HeaderFramework . LAST_MODIFIED ) ) ) {
2009-07-17 15:59:21 +02:00
// parse date
2011-09-07 12:08:57 +02:00
Date d = this . responseHeader . lastModified ( ) ;
2015-04-15 13:17:23 +02:00
if ( d = = null ) d = now ;
2009-07-17 15:59:21 +02:00
// finally, we shall treat the cache as stale if the modification time is after the if-.. time
if ( d . after ( ifModifiedSince ) ) {
//System.out.println("***not indexed because if-modified-since");
return " Stale_(Last-Modified>Modified-Since) " ;
}
}
// -pragma in cached response
2011-09-07 12:08:57 +02:00
if ( this . responseHeader . containsKey ( HeaderFramework . PRAGMA ) & &
2014-12-19 17:37:58 +01:00
( this . responseHeader . get ( HeaderFramework . PRAGMA ) ) . toUpperCase ( ) . contains ( " NO-CACHE " ) ) {
2009-07-17 15:59:21 +02:00
return " Denied_(pragma_no_cache) " ;
}
// see for documentation also:
// http://www.web-caching.com/cacheability.html
// look for freshnes information
// -expires in cached response
// the expires value gives us a very easy hint when the cache is stale
// sometimes, the expires date is set to the past to prevent that a page is cached
// we use that information to see if we should index it
2011-09-07 12:08:57 +02:00
final Date expires = this . responseHeader . expires ( ) ;
2015-04-15 13:17:23 +02:00
if ( expires ! = null & & expires . before ( now ) ) {
2009-07-17 15:59:21 +02:00
return " Stale_(Expired) " ;
}
// -lastModified in cached response
// this information is too weak to use it to prevent indexing
// even if we can apply a TTL heuristic for cache usage
// -cache-control in cached response
// the cache-control has many value options.
2011-09-07 12:08:57 +02:00
String cacheControl = this . responseHeader . get ( HeaderFramework . CACHE_CONTROL ) ;
2009-07-17 15:59:21 +02:00
if ( cacheControl ! = null ) {
cacheControl = cacheControl . trim ( ) . toUpperCase ( ) ;
/ * we have the following cases for cache - control :
" public " - - can be indexed
" private " , " no-cache " , " no-store " - - cannot be indexed
" max-age=<delta-seconds> " - - stale / fresh dependent on date
* /
2014-12-19 17:37:58 +01:00
if ( cacheControl . contains ( " PRIVATE " ) | |
cacheControl . contains ( " NO-CACHE " ) | |
cacheControl . contains ( " NO-STORE " ) ) {
2009-07-17 15:59:21 +02:00
// easy case
return " Stale_(denied_by_cache-control= " + cacheControl + " ) " ;
// } else if (cacheControl.startsWith("PUBLIC")) {
// // ok, do nothing
} else if ( cacheControl . startsWith ( " MAX-AGE= " ) ) {
// we need also the load date
2011-09-07 12:08:57 +02:00
final Date date = this . responseHeader . date ( ) ;
2009-07-17 15:59:21 +02:00
if ( date = = null ) {
return " Stale_(no_date_given_in_response) " ;
}
try {
2012-05-21 13:40:46 +02:00
final long ttl = 1000 * NumberTools . parseLongDecSubstring ( cacheControl , 8 ) ; // milliseconds to live
2015-04-15 13:17:23 +02:00
if ( now . getTime ( ) - date . getTime ( ) > ttl ) {
2009-07-17 15:59:21 +02:00
//System.out.println("***not indexed because cache-control");
return " Stale_(expired_by_cache-control) " ;
}
} catch ( final Exception e ) {
return " Error_( " + e . getMessage ( ) + " ) " ;
}
}
}
}
return null ;
}
/ * *
* decide upon header information if a specific file should be indexed
* this method returns null if the answer is ' YES ' !
* if the answer is ' NO ' ( do not index ) , it returns a string with the reason
* to reject the crawling demand in clear text
*
* This function is used by plasmaSwitchboard # processResourceStack
* /
public final String shallIndexCacheForCrawler ( ) {
if ( profile ( ) = = null ) {
return " shallIndexCacheForCrawler: profile() is null ! " ;
}
// check profile
if ( ! profile ( ) . indexText ( ) & & ! profile ( ) . indexMedia ( ) ) {
2012-10-31 14:08:33 +01:00
return " indexing not allowed - indexText and indexMedia not set (for crawler = " + this . profile . collectionName ( ) + " ) " ;
2009-07-17 15:59:21 +02:00
}
// -CGI access in request
// CGI access makes the page very individual, and therefore not usable in caches
if ( ! profile ( ) . crawlingQ ( ) ) {
if ( url ( ) . isPOST ( ) ) { return " Dynamic_(POST) " ; }
2013-09-15 00:30:23 +02:00
if ( MultiProtocolURL . isCGI ( MultiProtocolURL . getFileExtension ( url ( ) . getFileName ( ) ) ) ) { return " Dynamic_(CGI) " ; }
2009-07-17 15:59:21 +02:00
}
// -authorization cases in request
// we checked that in shallStoreCache
// -ranges in request
// we checked that in shallStoreCache
2010-01-07 01:42:12 +01:00
// check if document can be indexed
2011-09-07 12:08:57 +02:00
if ( this . responseHeader ! = null ) {
2015-12-20 15:49:24 +01:00
final String mimeType = this . responseHeader . getContentType ( ) ;
2011-09-07 12:08:57 +02:00
final String parserError = TextParser . supportsMime ( mimeType ) ;
2010-01-07 01:42:12 +01:00
if ( parserError ! = null & & TextParser . supportsExtension ( url ( ) ) ! = null ) return " no parser available: " + parserError ;
2009-07-17 15:59:21 +02:00
}
2011-09-07 12:08:57 +02:00
2009-07-17 15:59:21 +02:00
// -if-modified-since in request
// if the page is fresh at the very moment we can index it
// -> this does not apply for the crawler
// -cookies in request
// unfortunately, we cannot index pages which have been requested with a cookie
// because the returned content may be special for the client
// -> this does not apply for a crawler
// -set-cookie in response
// the set-cookie from the server does not indicate that the content is special
// thus we do not care about it here for indexing
// -> this does not apply for a crawler
// -pragma in cached response
// -> in the crawler we ignore this
// look for freshnes information
// -expires in cached response
// the expires value gives us a very easy hint when the cache is stale
// sometimes, the expires date is set to the past to prevent that a page is cached
// we use that information to see if we should index it
// -> this does not apply for a crawler
// -lastModified in cached response
// this information is too weak to use it to prevent indexing
// even if we can apply a TTL heuristic for cache usage
// -cache-control in cached response
// the cache-control has many value options.
// -> in the crawler we ignore this
return null ;
}
2011-09-07 12:08:57 +02:00
2015-12-20 15:49:24 +01:00
/ * *
* Get Mime type from http header or null if unknown ( not included in response header )
* @return mime ( trimmed and lowercase ) or null
* /
2008-08-25 20:11:47 +02:00
public String getMimeType ( ) {
2011-09-07 12:08:57 +02:00
if ( this . responseHeader = = null ) return null ;
2015-12-20 15:49:24 +01:00
String mimeType = this . responseHeader . getContentType ( ) ;
if ( mimeType ! = null ) {
2017-06-27 06:42:33 +02:00
mimeType = mimeType . trim ( ) . toLowerCase ( Locale . ROOT ) ;
2011-09-07 12:08:57 +02:00
2015-12-20 15:49:24 +01:00
final int pos = mimeType . indexOf ( ';' ) ;
return ( ( pos < 0 ) ? mimeType : mimeType . substring ( 0 , pos ) ) ;
}
return null ;
2008-08-25 20:11:47 +02:00
}
2011-09-07 12:08:57 +02:00
2008-08-25 20:11:47 +02:00
public String getCharacterEncoding ( ) {
2011-09-07 12:08:57 +02:00
if ( this . responseHeader = = null ) return null ;
return this . responseHeader . getCharacterEncoding ( ) ;
2008-08-25 20:11:47 +02:00
}
2011-09-07 12:08:57 +02:00
2013-09-15 00:30:23 +02:00
public DigestURL referrerURL ( ) {
2011-09-07 12:08:57 +02:00
if ( this . requestHeader = = null ) return null ;
2017-03-04 22:45:17 +01:00
return this . requestHeader . referer ( ) ;
2009-07-17 15:59:21 +02:00
}
2011-09-07 12:08:57 +02:00
2010-04-08 02:11:32 +02:00
public byte [ ] referrerHash ( ) {
2011-09-07 12:08:57 +02:00
if ( this . requestHeader = = null ) return null ;
2017-03-04 22:45:17 +01:00
final DigestURL url = this . requestHeader . referer ( ) ;
if ( url = = null ) return null ;
return url . hash ( ) ;
2008-08-25 20:11:47 +02:00
}
2011-09-07 12:08:57 +02:00
2008-08-25 20:11:47 +02:00
public boolean validResponseStatus ( ) {
2012-06-25 18:17:31 +02:00
int status = this . responseHeader . getStatusCode ( ) ;
return status = = 200 | | status = = 203 ;
2008-08-25 20:11:47 +02:00
}
public Date ifModifiedSince ( ) {
2011-09-07 12:08:57 +02:00
return ( this . requestHeader = = null ) ? null : this . requestHeader . ifModifiedSince ( ) ;
2008-08-25 20:11:47 +02:00
}
public boolean requestWithCookie ( ) {
2011-09-07 12:08:57 +02:00
return ( this . requestHeader = = null ) ? false : this . requestHeader . containsKey ( RequestHeader . COOKIE ) ;
2008-08-25 20:11:47 +02:00
}
public boolean requestProhibitsIndexing ( ) {
2011-09-07 12:08:57 +02:00
return ( this . requestHeader = = null )
? false
: this . requestHeader . containsKey ( HeaderFramework . X_YACY_INDEX_CONTROL ) & &
( this . requestHeader . get ( HeaderFramework . X_YACY_INDEX_CONTROL ) ) . toUpperCase ( ) . equals ( " NO-INDEX " ) ;
2008-08-25 20:11:47 +02:00
}
2011-09-07 12:08:57 +02:00
public EventOrigin processCase ( final String mySeedHash ) {
2009-07-17 15:59:21 +02:00
// we must distinguish the following cases: resource-load was initiated by
// 1) global crawling: the index is extern, not here (not possible here)
// 2) result of search queries, some indexes are here (not possible here)
// 3) result of index transfer, some of them are here (not possible here)
// 4) proxy-load (initiator is "------------")
// 5) local prefetch/crawling (initiator is own seedHash)
2017-04-24 18:24:26 +02:00
// 6) local fetching for global crawling (other known or unknown initiator)
// 7) local surrogates processing (can not be known here : crawl profile is required)
2009-10-21 22:14:30 +02:00
EventOrigin processCase = EventOrigin . UNKNOWN ;
2009-07-17 15:59:21 +02:00
// FIXME the equals seems to be incorrect: String.equals(boolean)
2011-05-27 10:24:54 +02:00
if ( initiator ( ) = = null | | initiator ( ) . length = = 0 | | ASCII . String ( initiator ( ) ) . equals ( " ------------ " ) ) {
2009-07-17 15:59:21 +02:00
// proxy-load
2009-10-21 22:14:30 +02:00
processCase = EventOrigin . PROXY_LOAD ;
2011-03-07 21:36:40 +01:00
} else if ( UTF8 . String ( initiator ( ) ) . equals ( mySeedHash ) ) {
2009-07-17 15:59:21 +02:00
// normal crawling
2009-10-21 22:14:30 +02:00
processCase = EventOrigin . LOCAL_CRAWLING ;
2009-07-17 15:59:21 +02:00
} else {
// this was done for remote peer (a global crawl)
2009-10-21 22:14:30 +02:00
processCase = EventOrigin . GLOBAL_CRAWLING ;
2009-07-17 15:59:21 +02:00
}
return processCase ;
}
2011-09-07 12:08:57 +02:00
2010-06-29 21:20:45 +02:00
public Document [ ] parse ( ) throws Parser . Failure {
2015-12-20 15:49:24 +01:00
final String supportError = TextParser . supports ( url ( ) , this . responseHeader = = null ? null : this . responseHeader . getContentType ( ) ) ;
2010-06-29 21:20:45 +02:00
if ( supportError ! = null ) throw new Parser . Failure ( " no parser support: " + supportError , url ( ) ) ;
2010-06-22 14:28:53 +02:00
try {
2016-02-16 02:05:58 +01:00
return TextParser . parseSource ( url ( ) , this . responseHeader = = null ? null : this . responseHeader . getContentType ( ) , this . responseHeader = = null ? StandardCharsets . UTF_8 . name ( ) : this . responseHeader . getCharacterEncoding ( ) , new VocabularyScraper ( ) , this . request . timezoneOffset ( ) , this . request . depth ( ) , this . content ) ;
2017-07-16 14:39:53 +02:00
} catch ( Parser . Failure e ) {
throw e ;
2011-09-07 12:08:57 +02:00
} catch ( final Exception e ) {
2010-06-22 14:28:53 +02:00
return null ;
}
}
2008-08-19 16:10:40 +02:00
}