2005-06-08 02:52:24 +02:00
// plasmaSnippetCache.java
// -----------------------
// part of YaCy
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2005
2006-10-09 15:37:38 +02:00
// last major change: 09.10.2006
2006-09-30 02:27:42 +02:00
//
// contributions by Marc Nause [MN]
2005-06-08 02:52:24 +02:00
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this software or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follows this copyright notice here, but changes must not be
// done inside the copyright notice above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
package de.anomic.plasma ;
2006-10-03 13:05:48 +02:00
import java.io.ByteArrayInputStream ;
2005-06-08 02:52:24 +02:00
import java.io.IOException ;
2006-10-03 13:05:48 +02:00
import java.io.InputStream ;
2006-12-11 02:31:23 +01:00
import java.util.ArrayList ;
2005-07-06 16:48:41 +02:00
import java.util.Enumeration ;
import java.util.HashMap ;
import java.util.HashSet ;
import java.util.Iterator ;
2006-12-08 03:14:56 +01:00
import java.util.Map ;
2005-07-06 16:48:41 +02:00
import java.util.Set ;
2006-12-09 03:13:43 +01:00
import java.util.TreeMap ;
2006-12-11 02:31:23 +01:00
import java.util.TreeSet ;
2005-07-06 16:48:41 +02:00
2006-12-11 02:31:23 +01:00
import de.anomic.htmlFilter.htmlFilterImageEntry ;
2006-09-22 13:55:28 +02:00
import de.anomic.http.httpHeader ;
import de.anomic.http.httpc ;
2006-11-10 02:13:33 +01:00
import de.anomic.plasma.plasmaURL ;
2006-09-30 00:27:20 +02:00
import de.anomic.kelondro.kelondroMScoreCluster ;
2007-03-13 23:18:36 +01:00
import de.anomic.kelondro.kelondroMSetTools ;
2006-09-30 00:27:20 +02:00
import de.anomic.net.URL ;
import de.anomic.plasma.cache.IResourceInfo ;
import de.anomic.plasma.crawler.plasmaCrawlerException ;
import de.anomic.plasma.parser.ParserException ;
import de.anomic.server.logging.serverLog ;
import de.anomic.yacy.yacySearch ;
2006-10-10 01:07:10 +02:00
import de.anomic.yacy.yacyCore ;
2005-06-08 02:52:24 +02:00
public class plasmaSnippetCache {
private static final int maxCache = 500 ;
2005-06-30 02:01:53 +02:00
2005-06-23 14:12:12 +02:00
public static final int SOURCE_CACHE = 0 ;
2005-06-30 02:01:53 +02:00
public static final int SOURCE_FILE = 1 ;
public static final int SOURCE_WEB = 2 ;
2005-06-23 14:12:12 +02:00
2005-06-30 02:01:53 +02:00
public static final int ERROR_NO_HASH_GIVEN = 11 ;
public static final int ERROR_SOURCE_LOADING = 12 ;
public static final int ERROR_RESOURCE_LOADING = 13 ;
public static final int ERROR_PARSER_FAILED = 14 ;
public static final int ERROR_PARSER_NO_LINES = 15 ;
public static final int ERROR_NO_MATCH = 16 ;
2005-06-08 02:52:24 +02:00
2007-08-15 13:36:59 +02:00
private static int snippetsScoreCounter ;
private static kelondroMScoreCluster snippetsScore ;
private static HashMap snippetsCache ;
2007-06-09 17:22:37 +02:00
/ * *
* a cache holding URLs to favicons specified by the page content , e . g . by using the html link - tag . e . g .
* < pre >
* & lt ; link rel = " shortcut icon " type = " image/x-icon " href = " ../src/favicon.ico " & gt ;
* < / pre >
* /
2007-08-15 13:36:59 +02:00
private static HashMap faviconCache ;
private static plasmaHTCache cacheManager ;
private static plasmaParser parser ;
private static serverLog log ;
2005-06-08 02:52:24 +02:00
2007-08-15 13:36:59 +02:00
public static void init (
plasmaHTCache cacheManagerx ,
plasmaParser parserx ,
serverLog logx
2005-10-22 15:28:04 +02:00
) {
2007-08-15 13:36:59 +02:00
cacheManager = cacheManagerx ;
parser = parserx ;
log = logx ;
snippetsScoreCounter = 0 ;
snippetsScore = new kelondroMScoreCluster ( ) ;
snippetsCache = new HashMap ( ) ;
faviconCache = new HashMap ( ) ;
2005-06-08 02:52:24 +02:00
}
2007-08-15 13:36:59 +02:00
public static class TextSnippet {
2007-03-13 23:18:36 +01:00
private URL url ;
2006-03-05 02:11:06 +01:00
private String line ;
private String error ;
2007-03-13 23:18:36 +01:00
private int errorCode ;
private Set remaingHashes ;
2007-06-09 17:22:37 +02:00
private URL favicon ;
2007-03-13 23:18:36 +01:00
public TextSnippet ( URL url , String line , int errorCode , Set remaingHashes , String errortext ) {
2007-06-09 17:22:37 +02:00
this ( url , line , errorCode , remaingHashes , errortext , null ) ;
}
public TextSnippet ( URL url , String line , int errorCode , Set remaingHashes , String errortext , URL favicon ) {
2007-03-13 23:18:36 +01:00
this . url = url ;
2005-06-23 14:12:12 +02:00
this . line = line ;
2007-03-13 23:18:36 +01:00
this . errorCode = errorCode ;
2005-06-30 02:01:53 +02:00
this . error = errortext ;
2007-03-13 23:18:36 +01:00
this . remaingHashes = remaingHashes ;
2007-06-09 17:22:37 +02:00
this . favicon = favicon ;
2007-03-13 23:18:36 +01:00
}
public URL getUrl ( ) {
return this . url ;
2005-06-23 14:12:12 +02:00
}
2006-03-05 02:11:06 +01:00
public boolean exists ( ) {
return line ! = null ;
}
2005-06-23 14:12:12 +02:00
public String toString ( ) {
2006-03-05 02:11:06 +01:00
return ( line = = null ) ? " " : line ;
}
public String getLineRaw ( ) {
return ( line = = null ) ? " " : line ;
}
public String getError ( ) {
return ( error = = null ) ? " " : error . trim ( ) ;
}
2007-03-13 23:18:36 +01:00
public int getErrorCode ( ) {
return errorCode ;
}
public Set getRemainingHashes ( ) {
return this . remaingHashes ;
}
2006-03-05 02:11:06 +01:00
public String getLineMarked ( Set queryHashes ) {
if ( line = = null ) return " " ;
if ( ( queryHashes = = null ) | | ( queryHashes . size ( ) = = 0 ) ) return line . trim ( ) ;
if ( line . endsWith ( " . " ) ) line = line . substring ( 0 , line . length ( ) - 1 ) ;
Iterator i = queryHashes . iterator ( ) ;
String h ;
String [ ] w = line . split ( " " ) ;
2006-10-09 15:37:38 +02:00
String prefix = " " ;
String postfix = " " ;
int len = 0 ;
2006-03-05 02:11:06 +01:00
while ( i . hasNext ( ) ) {
h = ( String ) i . next ( ) ;
for ( int j = 0 ; j < w . length ; j + + ) {
2006-09-30 02:27:42 +02:00
//ignore punctuation marks (contrib [MN])
2006-10-09 15:37:38 +02:00
//note to myself:
//For details on regex see "Mastering regular expressions" by J.E.F. Friedl
//especially p. 123 and p. 390/391 (in the German version of the 2nd edition)
prefix = " " ;
postfix = " " ;
2006-12-11 12:07:36 +01:00
// cut off prefix if it contains of non-characters or non-numbers
while ( w [ j ] . matches ( " \\ A[^ \\ p{L} \\ p{N}].+ " ) ) {
2007-06-17 18:29:04 +02:00
prefix = prefix + w [ j ] . substring ( 0 , 1 ) ;
2006-10-09 15:37:38 +02:00
w [ j ] = w [ j ] . substring ( 1 ) ;
2006-09-30 02:27:42 +02:00
}
2006-10-09 15:37:38 +02:00
2006-12-11 12:07:36 +01:00
// cut off postfix if it contains of non-characters or non-numbers
while ( w [ j ] . matches ( " .+[^ \\ p{L} \\ p{N}] \\ Z " ) ) {
2006-10-09 15:37:38 +02:00
len = w [ j ] . length ( ) ;
postfix = w [ j ] . substring ( len - 1 , len ) + postfix ;
w [ j ] = w [ j ] . substring ( 0 , len - 1 ) ;
2006-09-30 02:27:42 +02:00
}
2006-10-09 15:37:38 +02:00
2006-12-11 16:19:35 +01:00
//special treatment if there is a special character in the word
if ( w [ j ] . matches ( " \\ A[ \\ p{L} \\ p{N}]+[^ \\ p{L} \\ p{N}].+ \\ Z " ) ) {
String out = " " ;
String temp = " " ;
for ( int k = 0 ; k < w [ j ] . length ( ) ; k + + ) {
//is character a special character?
if ( w [ j ] . substring ( k , k + 1 ) . matches ( " [^ \\ p{L} \\ p{N}] " ) ) {
if ( plasmaCondenser . word2hash ( temp ) . equals ( h ) ) temp = " <b> " + temp + " </b> " ;
out = out + temp + w [ j ] . substring ( k , k + 1 ) ;
temp = " " ;
}
//last character
else if ( k = = ( w [ j ] . length ( ) - 1 ) ) {
temp = temp + w [ j ] . substring ( k , k + 1 ) ;
if ( plasmaCondenser . word2hash ( temp ) . equals ( h ) ) temp = " <b> " + temp + " </b> " ;
out = out + temp ;
temp = " " ;
}
else temp = temp + w [ j ] . substring ( k , k + 1 ) ;
}
w [ j ] = out ;
2006-12-11 12:07:36 +01:00
}
2006-12-11 16:19:35 +01:00
//end contrib [MN]
else if ( plasmaCondenser . word2hash ( w [ j ] ) . equals ( h ) ) w [ j ] = " <b> " + w [ j ] + " </b> " ;
2006-12-11 12:07:36 +01:00
2006-10-09 15:37:38 +02:00
w [ j ] = prefix + w [ j ] + postfix ;
2006-03-05 02:11:06 +01:00
}
}
StringBuffer l = new StringBuffer ( line . length ( ) + queryHashes . size ( ) * 8 ) ;
for ( int j = 0 ; j < w . length ; j + + ) {
l . append ( w [ j ] ) ;
l . append ( ' ' ) ;
}
return l . toString ( ) . trim ( ) ;
}
2007-06-09 17:22:37 +02:00
public URL getFavicon ( ) {
return this . favicon ;
}
2005-06-23 14:12:12 +02:00
}
2005-06-08 02:52:24 +02:00
2007-08-15 13:36:59 +02:00
public static class MediaSnippet {
2006-12-11 02:31:23 +01:00
public String type , href , name , attr ;
public MediaSnippet ( String type , String href , String name , String attr ) {
this . type = type ;
this . href = href ;
this . name = name ;
this . attr = attr ;
2006-12-12 03:09:25 +01:00
if ( ( this . name = = null ) | | ( this . name . length ( ) = = 0 ) ) this . name = " _ " ;
if ( ( this . attr = = null ) | | ( this . attr . length ( ) = = 0 ) ) this . attr = " _ " ;
2006-12-11 02:31:23 +01:00
}
}
2007-08-15 13:36:59 +02:00
public static boolean existsInCache ( URL url , Set queryhashes ) {
2005-07-02 01:35:36 +02:00
String hashes = yacySearch . set2string ( queryhashes ) ;
2006-11-10 02:13:33 +01:00
return retrieveFromCache ( hashes , plasmaURL . urlHash ( url ) ) ! = null ;
2005-06-24 09:41:07 +02:00
}
2007-08-15 13:36:59 +02:00
public static TextSnippet retrieveTextSnippet ( URL url , Set queryhashes , boolean fetchOnline , boolean pre , int snippetMaxLength , int timeout ) {
2005-06-30 00:55:37 +02:00
// heise = "0OQUNU3JSs05"
2005-06-23 14:12:12 +02:00
if ( queryhashes . size ( ) = = 0 ) {
2005-08-02 18:09:19 +02:00
//System.out.println("found no queryhashes for URL retrieve " + url);
2007-03-13 23:18:36 +01:00
return new TextSnippet ( url , null , ERROR_NO_HASH_GIVEN , queryhashes , " no query hashes given " ) ;
2005-06-23 14:12:12 +02:00
}
2006-11-10 02:13:33 +01:00
String urlhash = plasmaURL . urlHash ( url ) ;
2005-06-23 14:12:12 +02:00
// try to get snippet from snippetCache
2005-06-30 02:01:53 +02:00
int source = SOURCE_CACHE ;
2005-06-23 14:12:12 +02:00
String wordhashes = yacySearch . set2string ( queryhashes ) ;
String line = retrieveFromCache ( wordhashes , urlhash ) ;
2007-06-09 17:22:37 +02:00
if ( line ! = null ) {
2005-08-02 18:09:19 +02:00
//System.out.println("found snippet for URL " + url + " in cache: " + line);
2007-08-15 13:36:59 +02:00
return new TextSnippet ( url , line , source , null , null , ( URL ) faviconCache . get ( urlhash ) ) ;
2005-06-23 14:12:12 +02:00
}
2006-09-20 14:25:07 +02:00
/ * = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
* LOADING RESOURCE DATA
* = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = * /
2005-06-23 14:12:12 +02:00
// if the snippet is not in the cache, we can try to get it from the htcache
2006-10-03 13:05:48 +02:00
long resContentLength = 0 ;
InputStream resContent = null ;
IResourceInfo resInfo = null ;
2005-06-23 14:12:12 +02:00
try {
2006-09-20 14:25:07 +02:00
// trying to load the resource from the cache
2007-08-15 13:36:59 +02:00
resContent = cacheManager . getResourceContentStream ( url ) ;
2006-10-03 13:05:48 +02:00
if ( resContent ! = null ) {
// if the content was found
2007-08-15 13:36:59 +02:00
resContentLength = cacheManager . getResourceContentLength ( url ) ;
2006-10-03 13:05:48 +02:00
} else if ( fetchOnline ) {
// if not found try to download it
2006-09-20 14:25:07 +02:00
2006-10-03 13:05:48 +02:00
// download resource using the crawler and keep resource in memory if possible
2006-12-19 04:10:46 +01:00
plasmaHTCache . Entry entry = loadResourceFromWeb ( url , timeout , true , true ) ;
2006-09-20 14:25:07 +02:00
2006-10-03 13:05:48 +02:00
// getting resource metadata (e.g. the http headers for http resources)
if ( entry ! = null ) {
resInfo = entry . getDocumentInfo ( ) ;
// read resource body (if it is there)
byte [ ] resourceArray = entry . cacheArray ( ) ;
if ( resourceArray ! = null ) {
resContent = new ByteArrayInputStream ( resourceArray ) ;
resContentLength = resourceArray . length ;
} else {
2007-08-15 13:36:59 +02:00
resContent = cacheManager . getResourceContentStream ( url ) ;
resContentLength = cacheManager . getResourceContentLength ( url ) ;
2006-10-03 13:05:48 +02:00
}
}
2006-10-02 19:18:24 +02:00
2006-10-03 13:05:48 +02:00
// if it is still not available, report an error
2007-03-13 23:18:36 +01:00
if ( resContent = = null ) return new TextSnippet ( url , null , ERROR_RESOURCE_LOADING , queryhashes , " error loading resource, plasmaHTCache.Entry cache is NULL " ) ;
2006-10-02 19:18:24 +02:00
2005-06-23 14:12:12 +02:00
source = SOURCE_WEB ;
2006-10-03 13:05:48 +02:00
} else {
2007-03-13 23:18:36 +01:00
return new TextSnippet ( url , null , ERROR_SOURCE_LOADING , queryhashes , " no resource available " ) ;
2005-06-23 14:12:12 +02:00
}
2006-09-20 14:25:07 +02:00
} catch ( Exception e ) {
if ( ! ( e instanceof plasmaCrawlerException ) ) e . printStackTrace ( ) ;
2007-03-13 23:18:36 +01:00
return new TextSnippet ( url , null , ERROR_SOURCE_LOADING , queryhashes , " error loading resource: " + e . getMessage ( ) ) ;
2006-10-03 13:05:48 +02:00
}
2006-10-02 03:15:02 +02:00
2006-09-20 14:25:07 +02:00
/ * = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
* PARSING RESOURCE
* = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = * /
plasmaParserDocument document = null ;
try {
2007-06-09 17:22:37 +02:00
document = parseDocument ( url , resContentLength , resContent , resInfo ) ;
2006-09-20 14:25:07 +02:00
} catch ( ParserException e ) {
2007-03-13 23:18:36 +01:00
return new TextSnippet ( url , null , ERROR_PARSER_FAILED , queryhashes , e . getMessage ( ) ) ; // cannot be parsed
2006-10-03 13:05:48 +02:00
} finally {
try { resContent . close ( ) ; } catch ( Exception e ) { /* ignore this */ }
2006-09-20 14:25:07 +02:00
}
2007-03-13 23:18:36 +01:00
if ( document = = null ) return new TextSnippet ( url , null , ERROR_PARSER_FAILED , queryhashes , " parser error/failed " ) ; // cannot be parsed
2006-12-08 03:14:56 +01:00
2006-09-20 14:25:07 +02:00
/ * = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
* COMPUTE SNIPPET
2007-06-09 17:22:37 +02:00
* = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = * /
URL resFavicon = document . getFavicon ( ) ;
2007-08-15 13:36:59 +02:00
if ( resFavicon ! = null ) faviconCache . put ( urlhash , resFavicon ) ;
2005-06-23 14:12:12 +02:00
// we have found a parseable non-empty file: use the lines
2006-12-08 03:14:56 +01:00
// compute snippet from text
2006-12-09 03:13:43 +01:00
final Iterator sentences = document . getSentences ( pre ) ;
2007-06-09 17:22:37 +02:00
if ( sentences = = null ) return new TextSnippet ( url , null , ERROR_PARSER_NO_LINES , queryhashes , " parser returned no sentences " , resFavicon ) ;
2007-03-13 23:18:36 +01:00
Object [ ] tsr = computeTextSnippet ( sentences , queryhashes , snippetMaxLength ) ;
String textline = ( tsr = = null ) ? null : ( String ) tsr [ 0 ] ;
Set remainingHashes = ( tsr = = null ) ? queryhashes : ( Set ) tsr [ 1 ] ;
2006-12-08 03:14:56 +01:00
// compute snippet from media
String audioline = computeMediaSnippet ( document . getAudiolinks ( ) , queryhashes ) ;
String videoline = computeMediaSnippet ( document . getVideolinks ( ) , queryhashes ) ;
String appline = computeMediaSnippet ( document . getApplinks ( ) , queryhashes ) ;
//String hrefline = computeMediaSnippet(document.getAnchors(), queryhashes);
//String imageline = computeMediaSnippet(document.getAudiolinks(), queryhashes);
line = " " ;
if ( audioline ! = null ) line + = ( line . length ( ) = = 0 ) ? audioline : " <br /> " + audioline ;
if ( videoline ! = null ) line + = ( line . length ( ) = = 0 ) ? videoline : " <br /> " + videoline ;
if ( appline ! = null ) line + = ( line . length ( ) = = 0 ) ? appline : " <br /> " + appline ;
//if (hrefline != null) line += (line.length() == 0) ? hrefline : "<br />" + hrefline;
if ( textline ! = null ) line + = ( line . length ( ) = = 0 ) ? textline : " <br /> " + textline ;
2007-06-09 17:22:37 +02:00
if ( ( line = = null ) | | ( remainingHashes . size ( ) > 0 ) ) return new TextSnippet ( url , null , ERROR_NO_MATCH , remainingHashes , " no matching snippet found " , resFavicon ) ;
2005-07-01 01:19:08 +02:00
if ( line . length ( ) > snippetMaxLength ) line = line . substring ( 0 , snippetMaxLength ) ;
2005-06-23 14:12:12 +02:00
// finally store this snippet in our own cache
storeToCache ( wordhashes , urlhash , line ) ;
2006-12-08 03:14:56 +01:00
document . close ( ) ;
2007-06-09 17:22:37 +02:00
return new TextSnippet ( url , line , source , null , null , resFavicon ) ;
2006-09-18 02:37:02 +02:00
}
2006-09-20 14:25:07 +02:00
/ * *
* Tries to load and parse a resource specified by it ' s URL .
* If the resource is not stored in cache and if fetchOnline is set the
* this function tries to download the resource from web .
*
* @param url the URL of the resource
* @param fetchOnline specifies if the resource should be loaded from web if it ' as not available in the cache
* @return the parsed document as { @link plasmaParserDocument }
* /
2007-08-15 13:36:59 +02:00
public static plasmaParserDocument retrieveDocument ( URL url , boolean fetchOnline , int timeout , boolean forText ) {
2006-12-11 02:31:23 +01:00
// load resource
long resContentLength = 0 ;
InputStream resContent = null ;
IResourceInfo resInfo = null ;
2006-09-18 02:37:02 +02:00
try {
2006-12-11 02:31:23 +01:00
// trying to load the resource from the cache
2007-08-15 13:36:59 +02:00
resContent = cacheManager . getResourceContentStream ( url ) ;
2006-12-11 02:31:23 +01:00
if ( resContent ! = null ) {
// if the content was found
2007-08-15 13:36:59 +02:00
resContentLength = cacheManager . getResourceContentLength ( url ) ;
2006-12-11 02:31:23 +01:00
} else if ( fetchOnline ) {
// if not found try to download it
2006-09-20 14:25:07 +02:00
2006-12-11 02:31:23 +01:00
// download resource using the crawler and keep resource in memory if possible
2006-12-19 04:10:46 +01:00
plasmaHTCache . Entry entry = loadResourceFromWeb ( url , timeout , true , forText ) ;
2006-12-11 02:31:23 +01:00
// getting resource metadata (e.g. the http headers for http resources)
2006-10-03 13:05:48 +02:00
if ( entry ! = null ) {
2006-12-11 02:31:23 +01:00
resInfo = entry . getDocumentInfo ( ) ;
// read resource body (if it is there)
byte [ ] resourceArray = entry . cacheArray ( ) ;
2006-10-03 13:05:48 +02:00
if ( resourceArray ! = null ) {
2006-12-11 02:31:23 +01:00
resContent = new ByteArrayInputStream ( resourceArray ) ;
resContentLength = resourceArray . length ;
2006-10-03 13:05:48 +02:00
} else {
2007-08-15 13:36:59 +02:00
resContent = cacheManager . getResourceContentStream ( url ) ;
resContentLength = cacheManager . getResourceContentLength ( url ) ;
2006-10-03 13:05:48 +02:00
}
}
2006-12-11 02:31:23 +01:00
// if it is still not available, report an error
if ( resContent = = null ) {
serverLog . logFine ( " snippet fetch " , " plasmaHTCache.Entry cache is NULL for url " + url ) ;
return null ;
}
2006-09-20 14:25:07 +02:00
} else {
2006-12-11 02:31:23 +01:00
serverLog . logFine ( " snippet fetch " , " no resource available for url " + url ) ;
return null ;
2006-09-18 02:37:02 +02:00
}
2006-09-20 14:25:07 +02:00
} catch ( Exception e ) {
2006-12-11 02:31:23 +01:00
serverLog . logFine ( " snippet fetch " , " error loading resource: " + e . getMessage ( ) + " for url " + url ) ;
2006-09-18 02:37:02 +02:00
return null ;
2006-12-11 02:31:23 +01:00
}
2006-09-20 14:25:07 +02:00
2006-12-11 02:31:23 +01:00
// parse resource
plasmaParserDocument document = null ;
try {
document = parseDocument ( url , resContentLength , resContent , resInfo ) ;
} catch ( ParserException e ) {
serverLog . logFine ( " snippet fetch " , " parser error " + e . getMessage ( ) + " for url " + url ) ;
return null ;
} finally {
try { resContent . close ( ) ; } catch ( Exception e ) { }
}
return document ;
2005-06-23 14:12:12 +02:00
}
2006-12-11 02:31:23 +01:00
2005-06-23 14:12:12 +02:00
2007-08-15 13:36:59 +02:00
public static void storeToCache ( String wordhashes , String urlhash , String snippet ) {
2005-06-08 02:52:24 +02:00
// generate key
String key = urlhash + wordhashes ;
// do nothing if snippet is known
if ( snippetsCache . containsKey ( key ) ) return ;
// learn new snippet
snippetsScore . addScore ( key , snippetsScoreCounter + + ) ;
snippetsCache . put ( key , snippet ) ;
// care for counter
if ( snippetsScoreCounter = = java . lang . Integer . MAX_VALUE ) {
snippetsScoreCounter = 0 ;
snippetsScore = new kelondroMScoreCluster ( ) ;
snippetsCache = new HashMap ( ) ;
}
// flush cache if cache is full
while ( snippetsCache . size ( ) > maxCache ) {
key = ( String ) snippetsScore . getMinObject ( ) ;
snippetsScore . deleteScore ( key ) ;
snippetsCache . remove ( key ) ;
}
}
2007-08-15 13:36:59 +02:00
private static String retrieveFromCache ( String wordhashes , String urlhash ) {
2005-06-08 02:52:24 +02:00
// generate key
String key = urlhash + wordhashes ;
return ( String ) snippetsCache . get ( key ) ;
}
2007-08-15 13:36:59 +02:00
private static String computeMediaSnippet ( Map media , Set queryhashes ) {
2006-12-08 03:14:56 +01:00
Iterator i = media . entrySet ( ) . iterator ( ) ;
Map . Entry entry ;
String url , desc ;
Set s ;
String result = " " ;
while ( i . hasNext ( ) ) {
entry = ( Map . Entry ) i . next ( ) ;
url = ( String ) entry . getKey ( ) ;
desc = ( String ) entry . getValue ( ) ;
s = removeAppearanceHashes ( url , queryhashes ) ;
if ( s . size ( ) = = 0 ) {
result + = " <br /><a href= \" " + url + " \" > " + ( ( desc . length ( ) = = 0 ) ? url : desc ) + " </a> " ;
continue ;
}
s = removeAppearanceHashes ( desc , s ) ;
if ( s . size ( ) = = 0 ) {
result + = " <br /><a href= \" " + url + " \" > " + ( ( desc . length ( ) = = 0 ) ? url : desc ) + " </a> " ;
continue ;
}
}
if ( result . length ( ) = = 0 ) return null ;
return result . substring ( 6 ) ;
}
2007-08-15 13:36:59 +02:00
/**
 * Computes a text snippet from a set of sentences: scores every sentence by
 * the number of query hashes it contains (ties broken towards shorter
 * sentences), then tries the best-scored sentence first; if it does not
 * contain all query words, further sentences are appended recursively.
 * Returns {String snippet, Set remainingHashes} or null if nothing matched.
 */
private static Object [ ] /*{String - the snippet, Set - remaining hashes}*/
2007-03-13 23:18:36 +01:00
computeTextSnippet ( Iterator sentences , Set queryhashes , int maxLength ) {
2005-07-20 15:03:41 +02:00
try {
2006-10-07 02:06:09 +02:00
if ( sentences = = null ) return null ;
2005-08-15 01:35:18 +02:00
if ( ( queryhashes = = null ) | | ( queryhashes . size ( ) = = 0 ) ) return null ;
Iterator j ;
HashMap hs ;
2007-01-29 02:11:22 +01:00
StringBuffer sentence ;
2006-12-09 03:13:43 +01:00
// rank sentences into the TreeMap: key = 1000000*score - 10000*length + uniqCounter,
// so more matching hashes dominate, shorter sentences win ties,
// and the decreasing uniqCounter keeps keys unique
TreeMap os = new TreeMap ( ) ;
int uniqCounter = 9999 ;
int score ;
while ( sentences . hasNext ( ) ) {
2007-01-29 02:11:22 +01:00
sentence = ( StringBuffer ) sentences . next ( ) ;
2007-03-13 23:18:36 +01:00
hs = hashSentence ( sentence . toString ( ) ) ;
j = queryhashes . iterator ( ) ;
score = 0 ;
// count how many query hashes appear in this sentence
while ( j . hasNext ( ) ) { if ( hs . containsKey ( ( String ) j . next ( ) ) ) score + + ; }
if ( score > 0 ) {
os . put ( new Integer ( 1000000 * score - sentence . length ( ) * 10000 + uniqCounter - - ) , sentence ) ;
2005-06-23 14:12:12 +02:00
}
2005-06-08 02:52:24 +02:00
}
2006-12-09 03:13:43 +01:00
String result ;
Set remaininghashes ;
// try the sentences with the highest score first
while ( os . size ( ) > 0 ) {
2007-01-29 02:11:22 +01:00
sentence = ( StringBuffer ) os . remove ( ( Integer ) os . lastKey ( ) ) ; // sentence with the biggest score
2007-03-13 23:18:36 +01:00
Object [ ] tsr = computeTextSnippet ( sentence . toString ( ) , queryhashes , maxLength ) ;
if ( tsr = = null ) continue ;
result = ( String ) tsr [ 0 ] ;
2006-12-09 03:13:43 +01:00
if ( ( result ! = null ) & & ( result . length ( ) > 0 ) ) {
2007-03-13 23:18:36 +01:00
remaininghashes = ( Set ) tsr [ 1 ] ;
2006-12-09 03:13:43 +01:00
if ( remaininghashes . size ( ) = = 0 ) {
// we have found the snippet
2007-03-13 23:18:36 +01:00
return new Object [ ] { result , remaininghashes } ;
2006-12-09 03:13:43 +01:00
} else if ( remaininghashes . size ( ) < queryhashes . size ( ) ) {
// the result has not all words in it.
// find another sentence that represents the missing other words
// and find recursively more sentences
maxLength = maxLength - result . length ( ) ;
if ( maxLength < 20 ) maxLength = 20 ;
2007-03-13 23:18:36 +01:00
tsr = computeTextSnippet ( os . values ( ) . iterator ( ) , remaininghashes , maxLength ) ;
2007-03-16 14:25:56 +01:00
if ( tsr = = null ) return null ;
2007-03-13 23:18:36 +01:00
String nextSnippet = ( String ) tsr [ 0 ] ;
if ( nextSnippet = = null ) return tsr ;
return new Object [ ] { result + ( " / " + nextSnippet ) , tsr [ 1 ] } ;
2006-12-09 03:13:43 +01:00
} else {
// error
//assert remaininghashes.size() < queryhashes.size() : "remaininghashes.size() = " + remaininghashes.size() + ", queryhashes.size() = " + queryhashes.size() + ", sentence = '" + sentence + "', result = '" + result + "'";
continue ;
}
2005-08-15 01:35:18 +02:00
}
2005-06-24 09:41:07 +02:00
}
2006-12-09 03:13:43 +01:00
// no sentence contained any query hash
return null ;
2006-12-08 03:14:56 +01:00
} catch ( IndexOutOfBoundsException e ) {
log . logSevere ( " computeSnippet: error with string generation " , e ) ;
2007-03-13 23:18:36 +01:00
return new Object [ ] { null , queryhashes } ;
2006-12-08 03:14:56 +01:00
}
}
2007-08-15 13:36:59 +02:00
/**
 * Computes a snippet from a single sentence: locates the first and last
 * positions of the query hashes within the sentence, collects hashes that
 * do not appear into the remaining-hashes set, and then shortens the
 * sentence in several steps until it fits maxLength.
 * Returns {String snippet, Set remainingHashes}, or null on bad input.
 */
private static Object [ ] /*{String - the snippet, Set - remaining hashes}*/
2007-03-13 23:18:36 +01:00
computeTextSnippet ( String sentence , Set queryhashes , int maxLength ) {
2006-12-08 03:14:56 +01:00
try {
if ( sentence = = null ) return null ;
if ( ( queryhashes = = null ) | | ( queryhashes . size ( ) = = 0 ) ) return null ;
Iterator j ;
HashMap hs ;
String hash ;
// find all hashes that appear in the sentence
hs = hashSentence ( sentence ) ;
2005-08-15 01:35:18 +02:00
j = queryhashes . iterator ( ) ;
Integer pos ;
2006-12-08 03:14:56 +01:00
// minpos/maxpos will span the region that covers all found query words
int p , minpos = sentence . length ( ) , maxpos = - 1 ;
2007-03-13 23:18:36 +01:00
HashSet remainingHashes = new HashSet ( ) ;
2005-08-15 01:35:18 +02:00
while ( j . hasNext ( ) ) {
hash = ( String ) j . next ( ) ;
pos = ( Integer ) hs . get ( hash ) ;
2007-03-13 23:18:36 +01:00
if ( pos = = null ) {
// this query word does not appear in the sentence
remainingHashes . add ( hash ) ;
} else {
2005-08-15 01:35:18 +02:00
p = pos . intValue ( ) ;
if ( p > maxpos ) maxpos = p ;
if ( p < minpos ) minpos = p ;
}
2005-06-30 20:54:00 +02:00
}
2005-08-15 01:35:18 +02:00
// check result size
maxpos = maxpos + 10 ;
2006-12-08 03:14:56 +01:00
if ( maxpos > sentence . length ( ) ) maxpos = sentence . length ( ) ;
2005-08-15 01:35:18 +02:00
if ( minpos < 0 ) minpos = 0 ;
// we have a result, but is it short enough?
if ( maxpos - minpos + 10 > maxLength ) {
// the string is too long, even if we cut at both ends
// so cut here in the middle of the string
2006-12-08 03:14:56 +01:00
int lenb = sentence . length ( ) ;
sentence = sentence . substring ( 0 , ( minpos + 20 > sentence . length ( ) ) ? sentence . length ( ) : minpos + 20 ) . trim ( ) +
2005-08-15 01:35:18 +02:00
" [..] " +
2006-12-08 03:14:56 +01:00
sentence . substring ( ( maxpos + 26 > sentence . length ( ) ) ? sentence . length ( ) : maxpos + 26 ) . trim ( ) ;
// adjust maxpos for the characters removed from the middle
maxpos = maxpos + lenb - sentence . length ( ) + 6 ;
2005-08-15 01:35:18 +02:00
}
if ( maxpos > maxLength ) {
// the string is too long, even if we cut it at the end
// so cut it here at both ends at once
int newlen = maxpos - minpos + 10 ;
int around = ( maxLength - newlen ) / 2 ;
2006-12-08 03:14:56 +01:00
sentence = " [..] " + sentence . substring ( minpos - around , ( ( maxpos + around ) > sentence . length ( ) ) ? sentence . length ( ) : ( maxpos + around ) ) . trim ( ) + " [..] " ;
2005-08-15 01:35:18 +02:00
minpos = around ;
2006-12-08 03:14:56 +01:00
maxpos = sentence . length ( ) - around - 5 ;
2005-08-15 01:35:18 +02:00
}
2006-12-08 03:14:56 +01:00
if ( sentence . length ( ) > maxLength ) {
// trim sentence, 1st step (cut at right side)
sentence = sentence . substring ( 0 , maxpos ) . trim ( ) + " [..] " ;
2005-08-15 01:35:18 +02:00
}
2006-12-08 03:14:56 +01:00
if ( sentence . length ( ) > maxLength ) {
// trim sentence, 2nd step (cut at left side)
sentence = " [..] " + sentence . substring ( minpos ) . trim ( ) ;
2005-08-15 01:35:18 +02:00
}
2006-12-08 03:14:56 +01:00
if ( sentence . length ( ) > maxLength ) {
// trim sentence, 3rd step (cut in the middle)
sentence = sentence . substring ( 6 , 20 ) . trim ( ) + " [..] " + sentence . substring ( sentence . length ( ) - 26 , sentence . length ( ) - 6 ) . trim ( ) ;
2005-08-15 01:35:18 +02:00
}
2007-03-13 23:18:36 +01:00
return new Object [ ] { sentence , remainingHashes } ;
2005-07-20 15:03:41 +02:00
} catch ( IndexOutOfBoundsException e ) {
2005-08-30 23:32:59 +02:00
log . logSevere ( " computeSnippet: error with string generation " , e ) ;
2006-12-08 03:14:56 +01:00
return null ;
}
}
2007-08-15 13:36:59 +02:00
public static ArrayList retrieveMediaSnippets ( URL url , Set queryhashes , String mediatype , boolean fetchOnline , int timeout ) {
2006-12-11 02:31:23 +01:00
if ( queryhashes . size ( ) = = 0 ) {
serverLog . logFine ( " snippet fetch " , " no query hashes given for url " + url ) ;
return new ArrayList ( ) ;
}
2006-12-20 16:44:29 +01:00
if ( mediatype = = null ) mediatype = " " ;
2006-12-19 04:10:46 +01:00
plasmaParserDocument document = retrieveDocument ( url , fetchOnline , timeout , false ) ;
2006-12-11 02:31:23 +01:00
ArrayList a = new ArrayList ( ) ;
if ( document ! = null ) {
2006-12-20 16:44:29 +01:00
if ( ( mediatype . length ( ) = = 0 ) | | ( mediatype . equals ( " audio " ) ) ) a . addAll ( computeMediaSnippets ( document , queryhashes , " audio " ) ) ;
if ( ( mediatype . length ( ) = = 0 ) | | ( mediatype . equals ( " video " ) ) ) a . addAll ( computeMediaSnippets ( document , queryhashes , " video " ) ) ;
if ( ( mediatype . length ( ) = = 0 ) | | ( mediatype . equals ( " app " ) ) ) a . addAll ( computeMediaSnippets ( document , queryhashes , " app " ) ) ;
if ( ( mediatype . length ( ) = = 0 ) | | ( mediatype . equals ( " image " ) ) ) a . addAll ( computeImageSnippets ( document , queryhashes ) ) ;
2006-12-11 02:31:23 +01:00
}
return a ;
}
2007-08-15 13:36:59 +02:00
public static ArrayList computeMediaSnippets ( plasmaParserDocument document , Set queryhashes , String mediatype ) {
2006-12-11 02:31:23 +01:00
if ( document = = null ) return new ArrayList ( ) ;
Map media = null ;
if ( mediatype . equals ( " audio " ) ) media = document . getAudiolinks ( ) ;
else if ( mediatype . equals ( " video " ) ) media = document . getVideolinks ( ) ;
else if ( mediatype . equals ( " app " ) ) media = document . getApplinks ( ) ;
if ( media = = null ) return null ;
Iterator i = media . entrySet ( ) . iterator ( ) ;
Map . Entry entry ;
String url , desc ;
Set s ;
ArrayList result = new ArrayList ( ) ;
while ( i . hasNext ( ) ) {
entry = ( Map . Entry ) i . next ( ) ;
url = ( String ) entry . getKey ( ) ;
desc = ( String ) entry . getValue ( ) ;
//result.add(new MediaSnippet(mediatype, url, (desc.length() == 0) ? url : desc, null));
s = removeAppearanceHashes ( url , queryhashes ) ;
if ( s . size ( ) = = 0 ) {
2006-12-12 03:09:25 +01:00
result . add ( new MediaSnippet ( mediatype , url , desc , null ) ) ;
2006-12-11 02:31:23 +01:00
continue ;
}
s = removeAppearanceHashes ( desc , s ) ;
if ( s . size ( ) = = 0 ) {
2006-12-12 03:09:25 +01:00
result . add ( new MediaSnippet ( mediatype , url , desc , null ) ) ;
2006-12-11 02:31:23 +01:00
continue ;
}
}
return result ;
}
2007-08-15 13:36:59 +02:00
public static ArrayList computeImageSnippets ( plasmaParserDocument document , Set queryhashes ) {
2006-12-11 02:31:23 +01:00
TreeSet images = document . getImages ( ) ;
Iterator i = images . iterator ( ) ;
htmlFilterImageEntry ientry ;
String url , desc ;
Set s ;
ArrayList result = new ArrayList ( ) ;
while ( i . hasNext ( ) ) {
ientry = ( htmlFilterImageEntry ) i . next ( ) ;
2007-07-19 17:32:10 +02:00
url = ( String ) ientry . url ( ) . toNormalform ( true , true ) ;
2006-12-11 02:31:23 +01:00
desc = ( String ) ientry . alt ( ) ;
//result.add(new MediaSnippet("image", url, (desc.length() == 0) ? url : desc, ientry.width() + " x " + ientry.height()));
s = removeAppearanceHashes ( url , queryhashes ) ;
if ( s . size ( ) = = 0 ) {
2006-12-12 03:09:25 +01:00
result . add ( new MediaSnippet ( " image " , url , desc , ientry . width ( ) + " x " + ientry . height ( ) ) ) ;
2006-12-11 02:31:23 +01:00
continue ;
}
s = removeAppearanceHashes ( desc , s ) ;
if ( s . size ( ) = = 0 ) {
2006-12-12 03:09:25 +01:00
result . add ( new MediaSnippet ( " image " , url , desc , ientry . width ( ) + " x " + ientry . height ( ) ) ) ;
2006-12-11 02:31:23 +01:00
continue ;
}
}
return result ;
}
2007-08-15 13:36:59 +02:00
private static Set removeAppearanceHashes ( String sentence , Set queryhashes ) {
2006-12-08 03:14:56 +01:00
// remove all hashes that appear in the sentence
if ( sentence = = null ) return queryhashes ;
HashMap hs = hashSentence ( sentence ) ;
Iterator j = queryhashes . iterator ( ) ;
String hash ;
Integer pos ;
Set remaininghashes = new HashSet ( ) ;
while ( j . hasNext ( ) ) {
hash = ( String ) j . next ( ) ;
pos = ( Integer ) hs . get ( hash ) ;
if ( pos = = null ) {
remaininghashes . add ( new String ( hash ) ) ;
}
2005-07-20 15:03:41 +02:00
}
2006-12-08 03:14:56 +01:00
return remaininghashes ;
2005-06-23 14:12:12 +02:00
}
2005-06-24 09:41:07 +02:00
2007-08-15 13:36:59 +02:00
private static HashMap hashSentence ( String sentence ) {
2005-06-30 20:54:00 +02:00
// generates a word-wordPos mapping
HashMap map = new HashMap ( ) ;
2006-11-28 16:00:15 +01:00
Enumeration words = plasmaCondenser . wordTokenizer ( sentence , " UTF-8 " , 0 ) ;
2005-06-30 20:54:00 +02:00
int pos = 0 ;
2007-01-29 02:11:22 +01:00
StringBuffer word ;
2005-06-30 20:54:00 +02:00
while ( words . hasMoreElements ( ) ) {
2007-01-29 02:11:22 +01:00
word = ( StringBuffer ) words . nextElement ( ) ;
map . put ( plasmaCondenser . word2hash ( new String ( word ) ) , new Integer ( pos ) ) ;
2005-06-30 20:54:00 +02:00
pos + = word . length ( ) + 1 ;
}
return map ;
2005-06-08 02:52:24 +02:00
}
2005-06-23 14:12:12 +02:00
2007-08-15 13:36:59 +02:00
// convenience overload: parse the resource without any pre-loaded metadata;
// the four-argument variant will then try to recover the metadata itself
public static plasmaParserDocument parseDocument(URL url, long contentLength, InputStream resourceStream) throws ParserException {
    return parseDocument(url, contentLength, resourceStream, null);
}
2006-10-03 13:05:48 +02:00
/**
 * Parse the resource
 * @param url the URL of the resource
 * @param contentLength the content-length of the resource
 * @param resourceStream the resource body as stream
 * @param docInfo metadata about the resource
 * @return the extracted data
 * @throws ParserException
 */
2007-08-15 13:36:59 +02:00
public static plasmaParserDocument parseDocument ( URL url , long contentLength , InputStream resourceStream , IResourceInfo docInfo ) throws ParserException {
2006-09-03 16:59:00 +02:00
try {
2006-10-03 13:05:48 +02:00
if ( resourceStream = = null ) return null ;
2006-09-03 16:59:00 +02:00
2006-10-03 13:05:48 +02:00
// STEP 1: if no resource metadata is available, try to load it from cache
2006-09-06 16:31:17 +02:00
if ( docInfo = = null ) {
2006-09-22 13:40:46 +02:00
// try to get the header from the htcache directory
try {
2007-08-15 13:36:59 +02:00
docInfo = cacheManager . loadResourceInfo ( url ) ;
2006-09-22 13:55:28 +02:00
} catch ( Exception e ) {
// ignore this. resource info loading failed
2006-10-02 03:15:02 +02:00
}
}
2006-10-03 13:05:48 +02:00
// STEP 2: if the metadata is still null try to download it from web
if ( ( docInfo = = null ) & & ( url . getProtocol ( ) . startsWith ( " http " ) ) ) {
2006-09-22 13:55:28 +02:00
// TODO: we need a better solution here
2006-10-03 13:05:48 +02:00
// e.g. encapsulate this in the crawlLoader class
// getting URL mimeType
try {
2007-08-15 13:36:59 +02:00
httpHeader header = httpc . whead ( url , url . getHost ( ) , 10000 , null , null , plasmaSwitchboard . getSwitchboard ( ) . remoteProxyConfig ) ;
docInfo = cacheManager . getResourceInfoFactory ( ) . buildResourceInfoObj ( url , header ) ;
2006-10-03 13:05:48 +02:00
} catch ( Exception e ) {
// ingore this. http header download failed
}
}
2005-11-14 11:25:43 +01:00
2006-10-03 13:05:48 +02:00
// STEP 3: if the metadata is still null try to guess the mimeType of the resource
2006-09-06 16:31:17 +02:00
if ( docInfo = = null ) {
2007-08-15 13:36:59 +02:00
String filename = cacheManager . getCachePath ( url ) . getName ( ) ;
2006-09-03 16:59:00 +02:00
int p = filename . lastIndexOf ( '.' ) ;
if ( // if no extension is available
( p < 0 ) | |
// or the extension is supported by one of the parsers
( ( p > = 0 ) & & ( plasmaParser . supportedFileExtContains ( filename . substring ( p + 1 ) ) ) )
) {
String supposedMime = " text/html " ;
// if the mimeType Parser is installed we can set the mimeType to null to force
// a mimetype detection
if ( plasmaParser . supportedMimeTypesContains ( " application/octet-stream " ) ) {
supposedMime = null ;
} else if ( p ! = - 1 ) {
// otherwise we try to determine the mimeType per file Extension
supposedMime = plasmaParser . getMimeTypeByFileExt ( filename . substring ( p + 1 ) ) ;
}
2007-08-15 13:36:59 +02:00
return parser . parseSource ( url , supposedMime , null , contentLength , resourceStream ) ;
2006-09-03 16:59:00 +02:00
}
2005-06-23 14:12:12 +02:00
return null ;
2006-10-03 13:05:48 +02:00
}
2006-09-06 16:31:17 +02:00
if ( plasmaParser . supportedMimeTypesContains ( docInfo . getMimeType ( ) ) ) {
2007-08-15 13:36:59 +02:00
return parser . parseSource ( url , docInfo . getMimeType ( ) , docInfo . getCharacterEncoding ( ) , contentLength , resourceStream ) ;
2005-06-23 14:12:12 +02:00
}
2006-09-03 16:59:00 +02:00
return null ;
} catch ( InterruptedException e ) {
// interruption of thread detected
return null ;
2005-06-08 02:52:24 +02:00
}
}
2006-10-03 13:05:48 +02:00
/**
 * Load the resource body, preferring the local cache over a web fetch.
 * @param url
 * @param fetchOnline whether a download from the web is allowed on a cache miss
 * @param socketTimeout
 * @param forText selects the text-snippet crawl profile for web fetches
 * @return an Object array containing
 * <table>
 * <tr><td>[0]</td><td>the content as {@link InputStream}</td></tr>
 * <tr><td>[1]</td><td>the content-length as {@link Long}</td></tr>
 * </table>
 */
2007-08-15 13:36:59 +02:00
public static Object [ ] getResource ( URL url , boolean fetchOnline , int socketTimeout , boolean forText ) {
2005-06-08 02:52:24 +02:00
// load the url as resource from the web
try {
2006-10-03 13:05:48 +02:00
long contentLength = - 1 ;
2006-09-20 14:25:07 +02:00
2006-10-03 13:05:48 +02:00
// trying to load the resource body from cache
2007-08-15 13:36:59 +02:00
InputStream resource = cacheManager . getResourceContentStream ( url ) ;
2006-10-03 13:05:48 +02:00
if ( resource ! = null ) {
2007-08-15 13:36:59 +02:00
contentLength = cacheManager . getResourceContentLength ( url ) ;
2006-10-03 13:05:48 +02:00
} else if ( fetchOnline ) {
// if the content is not available in cache try to download it from web
2006-09-20 14:25:07 +02:00
// try to download the resource using a crawler
2006-12-19 04:10:46 +01:00
plasmaHTCache . Entry entry = loadResourceFromWeb ( url , ( socketTimeout < 0 ) ? - 1 : socketTimeout , true , forText ) ;
2006-09-20 14:25:07 +02:00
2006-10-03 13:05:48 +02:00
// read resource body (if it is there)
byte [ ] resourceArray = entry . cacheArray ( ) ;
// in case that the reosurce was not in ram, read it from disk
if ( resourceArray = = null ) {
2007-08-15 13:36:59 +02:00
resource = cacheManager . getResourceContentStream ( url ) ;
contentLength = cacheManager . getResourceContentLength ( url ) ;
2006-10-03 13:05:48 +02:00
} else {
resource = new ByteArrayInputStream ( resourceArray ) ;
contentLength = resourceArray . length ;
}
} else {
return null ;
2005-06-08 02:52:24 +02:00
}
2006-10-03 13:05:48 +02:00
return new Object [ ] { resource , new Long ( contentLength ) } ;
2005-06-08 02:52:24 +02:00
} catch ( IOException e ) {
return null ;
}
}
2007-08-15 13:36:59 +02:00
// Download the resource synchronously via the switchboard's crawl loader.
// keepInMemory requests that the body stays in ram (entry.cacheArray());
// forText selects the text-snippet crawl profile, otherwise the media one.
public static plasmaHTCache.Entry loadResourceFromWeb(
        URL url,
        int socketTimeout,
        boolean keepInMemory,
        boolean forText
) throws plasmaCrawlerException {

    plasmaHTCache.Entry result = plasmaSwitchboard.getSwitchboard().cacheLoader.loadSync(
            url,                          // the url
            "",                           // name of the url, from anchor tag <a>name</a>
            null,                         // referer
            yacyCore.seedDB.mySeed.hash,  // initiator
            0,                            // depth
            (forText) ? plasmaSwitchboard.getSwitchboard().defaultTextSnippetProfile : plasmaSwitchboard.getSwitchboard().defaultMediaSnippetProfile, // crawl profile
            socketTimeout,
            keepInMemory
    );
    return result;
}
2007-03-13 23:18:36 +01:00
2007-08-15 13:36:59 +02:00
public static String failConsequences ( TextSnippet snippet , Set queryhashes ) {
2007-03-13 23:18:36 +01:00
// problems with snippet fetch
2007-03-27 10:21:45 +02:00
if ( yacyCore . seedDB . mySeed . isVirgin ( ) ) return snippet . getError ( ) + " (no consequences, no network connection) " ; // no consequences if we do not have a network connection
2007-03-13 23:18:36 +01:00
String urlHash = plasmaURL . urlHash ( snippet . getUrl ( ) ) ;
String querystring = kelondroMSetTools . setToString ( snippet . getRemainingHashes ( ) , ' ' ) ;
if ( ( snippet . getErrorCode ( ) = = ERROR_SOURCE_LOADING ) | |
( snippet . getErrorCode ( ) = = ERROR_RESOURCE_LOADING ) | |
( snippet . getErrorCode ( ) = = ERROR_PARSER_FAILED ) | |
( snippet . getErrorCode ( ) = = ERROR_PARSER_NO_LINES ) ) {
2007-07-19 17:32:10 +02:00
log . logInfo ( " error: ' " + snippet . getError ( ) + " ', remove url = " + snippet . getUrl ( ) . toNormalform ( false , true ) + " , cause: " + snippet . getError ( ) ) ;
2007-08-15 13:36:59 +02:00
plasmaSwitchboard . getSwitchboard ( ) . wordIndex . loadedURL . remove ( urlHash ) ;
plasmaSwitchboard . getSwitchboard ( ) . wordIndex . removeHashReferences ( queryhashes , urlHash ) ;
2005-07-12 17:09:35 +02:00
}
2007-03-13 23:18:36 +01:00
if ( snippet . getErrorCode ( ) = = ERROR_NO_MATCH ) {
2007-07-19 17:32:10 +02:00
log . logInfo ( " error: ' " + snippet . getError ( ) + " ', remove words ' " + querystring + " ' for url = " + snippet . getUrl ( ) . toNormalform ( false , true ) + " , cause: " + snippet . getError ( ) ) ;
2007-08-15 13:36:59 +02:00
plasmaSwitchboard . getSwitchboard ( ) . wordIndex . removeHashReferences ( snippet . remaingHashes , urlHash ) ;
2005-07-12 17:09:35 +02:00
}
2007-03-13 23:18:36 +01:00
return snippet . getError ( ) ;
2005-07-12 17:09:35 +02:00
}
2007-03-13 23:18:36 +01:00
2006-12-11 02:31:23 +01:00
}