2005-06-08 02:52:24 +02:00
// plasmaSnippetCache.java
// -----------------------
// part of YaCy
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2005
2006-10-09 15:37:38 +02:00
// last major change: 09.10.2006
2006-09-30 02:27:42 +02:00
//
// contributions by Marc Nause [MN]
2005-06-08 02:52:24 +02:00
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this software or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follow this copyright notice here, but changes must not be
// done inside the copyright notice above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
package de.anomic.plasma ;
2006-10-03 13:05:48 +02:00
import java.io.ByteArrayInputStream ;
import java.io.InputStream ;
2006-12-11 02:31:23 +01:00
import java.util.ArrayList ;
2005-07-06 16:48:41 +02:00
import java.util.Enumeration ;
import java.util.HashMap ;
import java.util.HashSet ;
import java.util.Iterator ;
2006-12-08 03:14:56 +01:00
import java.util.Map ;
2005-07-06 16:48:41 +02:00
import java.util.Set ;
2006-12-09 03:13:43 +01:00
import java.util.TreeMap ;
2006-12-11 02:31:23 +01:00
import java.util.TreeSet ;
2005-07-06 16:48:41 +02:00
2006-12-11 02:31:23 +01:00
import de.anomic.htmlFilter.htmlFilterImageEntry ;
2006-09-22 13:55:28 +02:00
import de.anomic.http.httpHeader ;
import de.anomic.http.httpc ;
2006-09-30 00:27:20 +02:00
import de.anomic.kelondro.kelondroMScoreCluster ;
2007-03-13 23:18:36 +01:00
import de.anomic.kelondro.kelondroMSetTools ;
2006-09-30 00:27:20 +02:00
import de.anomic.plasma.cache.IResourceInfo ;
import de.anomic.plasma.parser.ParserException ;
import de.anomic.server.logging.serverLog ;
2006-10-10 01:07:10 +02:00
import de.anomic.yacy.yacyCore ;
2008-01-18 18:14:02 +01:00
import de.anomic.yacy.yacySearch ;
2007-09-05 11:01:35 +02:00
import de.anomic.yacy.yacyURL ;
2005-06-08 02:52:24 +02:00
public class plasmaSnippetCache {
private static final int maxCache = 500 ;
2005-06-30 02:01:53 +02:00
2005-06-23 14:12:12 +02:00
public static final int SOURCE_CACHE = 0 ;
2005-06-30 02:01:53 +02:00
public static final int SOURCE_FILE = 1 ;
public static final int SOURCE_WEB = 2 ;
2005-06-23 14:12:12 +02:00
2005-06-30 02:01:53 +02:00
public static final int ERROR_NO_HASH_GIVEN = 11 ;
public static final int ERROR_SOURCE_LOADING = 12 ;
public static final int ERROR_RESOURCE_LOADING = 13 ;
public static final int ERROR_PARSER_FAILED = 14 ;
public static final int ERROR_PARSER_NO_LINES = 15 ;
public static final int ERROR_NO_MATCH = 16 ;
2005-06-08 02:52:24 +02:00
2007-08-15 13:36:59 +02:00
private static int snippetsScoreCounter ;
2007-12-28 19:47:45 +01:00
private static kelondroMScoreCluster < String > snippetsScore ;
2008-01-19 01:40:19 +01:00
private static HashMap < String , String > snippetsCache ;
2007-06-09 17:22:37 +02:00
/ * *
* a cache holding URLs to favicons specified by the page content , e . g . by using the html link - tag . e . g .
* < pre >
* & lt ; link rel = " shortcut icon " type = " image/x-icon " href = " ../src/favicon.ico " & gt ;
* < / pre >
* /
2008-01-19 01:40:19 +01:00
private static HashMap < String , yacyURL > faviconCache ;
2007-08-15 13:36:59 +02:00
private static plasmaParser parser ;
private static serverLog log ;
2005-06-08 02:52:24 +02:00
2007-08-15 13:36:59 +02:00
public static void init (
plasmaParser parserx ,
serverLog logx
2005-10-22 15:28:04 +02:00
) {
2007-08-15 13:36:59 +02:00
parser = parserx ;
log = logx ;
snippetsScoreCounter = 0 ;
2007-12-28 19:47:45 +01:00
snippetsScore = new kelondroMScoreCluster < String > ( ) ;
2008-01-19 01:40:19 +01:00
snippetsCache = new HashMap < String , String > ( ) ;
faviconCache = new HashMap < String , yacyURL > ( ) ;
2005-06-08 02:52:24 +02:00
}
2007-08-15 13:36:59 +02:00
public static class TextSnippet {
2007-09-05 11:01:35 +02:00
private yacyURL url ;
2006-03-05 02:11:06 +01:00
private String line ;
private String error ;
2007-03-13 23:18:36 +01:00
private int errorCode ;
2008-01-19 01:40:19 +01:00
private Set < String > remaingHashes ;
2007-09-05 11:01:35 +02:00
private yacyURL favicon ;
2007-06-09 17:22:37 +02:00
2008-01-19 01:40:19 +01:00
public TextSnippet ( yacyURL url , String line , int errorCode , Set < String > remaingHashes , String errortext ) {
2007-06-09 17:22:37 +02:00
this ( url , line , errorCode , remaingHashes , errortext , null ) ;
}
2008-01-19 01:40:19 +01:00
public TextSnippet ( yacyURL url , String line , int errorCode , Set < String > remaingHashes , String errortext , yacyURL favicon ) {
2007-03-13 23:18:36 +01:00
this . url = url ;
2005-06-23 14:12:12 +02:00
this . line = line ;
2007-03-13 23:18:36 +01:00
this . errorCode = errorCode ;
2005-06-30 02:01:53 +02:00
this . error = errortext ;
2007-03-13 23:18:36 +01:00
this . remaingHashes = remaingHashes ;
2007-06-09 17:22:37 +02:00
this . favicon = favicon ;
2007-03-13 23:18:36 +01:00
}
2007-09-05 11:01:35 +02:00
public yacyURL getUrl ( ) {
2007-03-13 23:18:36 +01:00
return this . url ;
2005-06-23 14:12:12 +02:00
}
2006-03-05 02:11:06 +01:00
public boolean exists ( ) {
return line ! = null ;
}
2005-06-23 14:12:12 +02:00
public String toString ( ) {
2006-03-05 02:11:06 +01:00
return ( line = = null ) ? " " : line ;
}
public String getLineRaw ( ) {
return ( line = = null ) ? " " : line ;
}
public String getError ( ) {
return ( error = = null ) ? " " : error . trim ( ) ;
}
2007-03-13 23:18:36 +01:00
public int getErrorCode ( ) {
return errorCode ;
}
2008-01-19 01:40:19 +01:00
public Set < String > getRemainingHashes ( ) {
2007-03-13 23:18:36 +01:00
return this . remaingHashes ;
}
2008-01-19 01:40:19 +01:00
public String getLineMarked ( Set < String > queryHashes ) {
2006-03-05 02:11:06 +01:00
if ( line = = null ) return " " ;
if ( ( queryHashes = = null ) | | ( queryHashes . size ( ) = = 0 ) ) return line . trim ( ) ;
if ( line . endsWith ( " . " ) ) line = line . substring ( 0 , line . length ( ) - 1 ) ;
2008-01-19 01:40:19 +01:00
Iterator < String > i = queryHashes . iterator ( ) ;
2006-03-05 02:11:06 +01:00
String h ;
String [ ] w = line . split ( " " ) ;
2006-10-09 15:37:38 +02:00
String prefix = " " ;
String postfix = " " ;
int len = 0 ;
2006-03-05 02:11:06 +01:00
while ( i . hasNext ( ) ) {
2008-01-19 01:40:19 +01:00
h = i . next ( ) ;
2006-03-05 02:11:06 +01:00
for ( int j = 0 ; j < w . length ; j + + ) {
2006-09-30 02:27:42 +02:00
//ignore punctuation marks (contrib [MN])
2006-10-09 15:37:38 +02:00
//note to myself:
//For details on regex see "Mastering regular expressions" by J.E.F. Friedl
//especially p. 123 and p. 390/391 (in the German version of the 2nd edition)
prefix = " " ;
postfix = " " ;
2006-12-11 12:07:36 +01:00
// cut off prefix if it contains of non-characters or non-numbers
while ( w [ j ] . matches ( " \\ A[^ \\ p{L} \\ p{N}].+ " ) ) {
2007-06-17 18:29:04 +02:00
prefix = prefix + w [ j ] . substring ( 0 , 1 ) ;
2006-10-09 15:37:38 +02:00
w [ j ] = w [ j ] . substring ( 1 ) ;
2006-09-30 02:27:42 +02:00
}
2006-10-09 15:37:38 +02:00
2006-12-11 12:07:36 +01:00
// cut off postfix if it contains of non-characters or non-numbers
while ( w [ j ] . matches ( " .+[^ \\ p{L} \\ p{N}] \\ Z " ) ) {
2006-10-09 15:37:38 +02:00
len = w [ j ] . length ( ) ;
postfix = w [ j ] . substring ( len - 1 , len ) + postfix ;
w [ j ] = w [ j ] . substring ( 0 , len - 1 ) ;
2006-09-30 02:27:42 +02:00
}
2006-10-09 15:37:38 +02:00
2006-12-11 16:19:35 +01:00
//special treatment if there is a special character in the word
if ( w [ j ] . matches ( " \\ A[ \\ p{L} \\ p{N}]+[^ \\ p{L} \\ p{N}].+ \\ Z " ) ) {
String out = " " ;
String temp = " " ;
for ( int k = 0 ; k < w [ j ] . length ( ) ; k + + ) {
//is character a special character?
if ( w [ j ] . substring ( k , k + 1 ) . matches ( " [^ \\ p{L} \\ p{N}] " ) ) {
if ( plasmaCondenser . word2hash ( temp ) . equals ( h ) ) temp = " <b> " + temp + " </b> " ;
out = out + temp + w [ j ] . substring ( k , k + 1 ) ;
temp = " " ;
}
//last character
else if ( k = = ( w [ j ] . length ( ) - 1 ) ) {
temp = temp + w [ j ] . substring ( k , k + 1 ) ;
if ( plasmaCondenser . word2hash ( temp ) . equals ( h ) ) temp = " <b> " + temp + " </b> " ;
out = out + temp ;
temp = " " ;
}
else temp = temp + w [ j ] . substring ( k , k + 1 ) ;
}
w [ j ] = out ;
2006-12-11 12:07:36 +01:00
}
2006-12-11 16:19:35 +01:00
//end contrib [MN]
else if ( plasmaCondenser . word2hash ( w [ j ] ) . equals ( h ) ) w [ j ] = " <b> " + w [ j ] + " </b> " ;
2006-12-11 12:07:36 +01:00
2006-10-09 15:37:38 +02:00
w [ j ] = prefix + w [ j ] + postfix ;
2006-03-05 02:11:06 +01:00
}
}
StringBuffer l = new StringBuffer ( line . length ( ) + queryHashes . size ( ) * 8 ) ;
for ( int j = 0 ; j < w . length ; j + + ) {
l . append ( w [ j ] ) ;
l . append ( ' ' ) ;
}
return l . toString ( ) . trim ( ) ;
}
2007-06-09 17:22:37 +02:00
2007-09-05 11:01:35 +02:00
public yacyURL getFavicon ( ) {
2007-06-09 17:22:37 +02:00
return this . favicon ;
}
2005-06-23 14:12:12 +02:00
}
2005-06-08 02:52:24 +02:00
2007-08-15 13:36:59 +02:00
public static class MediaSnippet {
2007-09-04 01:43:55 +02:00
public int type ;
2008-01-22 12:51:43 +01:00
public yacyURL href ;
public String name , attr ;
public MediaSnippet ( int type , yacyURL href , String name , String attr ) {
2006-12-11 02:31:23 +01:00
this . type = type ;
this . href = href ;
this . name = name ;
this . attr = attr ;
2006-12-12 03:09:25 +01:00
if ( ( this . name = = null ) | | ( this . name . length ( ) = = 0 ) ) this . name = " _ " ;
if ( ( this . attr = = null ) | | ( this . attr . length ( ) = = 0 ) ) this . attr = " _ " ;
2006-12-11 02:31:23 +01:00
}
}
2008-01-19 01:40:19 +01:00
public static boolean existsInCache ( yacyURL url , Set < String > queryhashes ) {
2005-07-02 01:35:36 +02:00
String hashes = yacySearch . set2string ( queryhashes ) ;
2007-09-05 11:01:35 +02:00
return retrieveFromCache ( hashes , url . hash ( ) ) ! = null ;
2005-06-24 09:41:07 +02:00
}
2008-01-19 01:40:19 +01:00
@SuppressWarnings ( " unchecked " )
public static TextSnippet retrieveTextSnippet ( yacyURL url , Set < String > queryhashes , boolean fetchOnline , boolean pre , int snippetMaxLength , int timeout , int maxDocLen ) {
2005-06-30 00:55:37 +02:00
// heise = "0OQUNU3JSs05"
2007-09-05 11:01:35 +02:00
2005-06-23 14:12:12 +02:00
if ( queryhashes . size ( ) = = 0 ) {
2005-08-02 18:09:19 +02:00
//System.out.println("found no queryhashes for URL retrieve " + url);
2007-03-13 23:18:36 +01:00
return new TextSnippet ( url , null , ERROR_NO_HASH_GIVEN , queryhashes , " no query hashes given " ) ;
2005-06-23 14:12:12 +02:00
}
// try to get snippet from snippetCache
2005-06-30 02:01:53 +02:00
int source = SOURCE_CACHE ;
2005-06-23 14:12:12 +02:00
String wordhashes = yacySearch . set2string ( queryhashes ) ;
2007-09-05 11:01:35 +02:00
String line = retrieveFromCache ( wordhashes , url . hash ( ) ) ;
2007-06-09 17:22:37 +02:00
if ( line ! = null ) {
2005-08-02 18:09:19 +02:00
//System.out.println("found snippet for URL " + url + " in cache: " + line);
2008-01-19 01:40:19 +01:00
return new TextSnippet ( url , line , source , null , null , faviconCache . get ( url . hash ( ) ) ) ;
2005-06-23 14:12:12 +02:00
}
2006-09-20 14:25:07 +02:00
/ * = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
* LOADING RESOURCE DATA
* = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = * /
2005-06-23 14:12:12 +02:00
// if the snippet is not in the cache, we can try to get it from the htcache
2006-10-03 13:05:48 +02:00
long resContentLength = 0 ;
InputStream resContent = null ;
IResourceInfo resInfo = null ;
2005-06-23 14:12:12 +02:00
try {
2006-09-20 14:25:07 +02:00
// trying to load the resource from the cache
2007-08-15 23:31:31 +02:00
resContent = plasmaHTCache . getResourceContentStream ( url ) ;
2006-10-03 13:05:48 +02:00
if ( resContent ! = null ) {
// if the content was found
2007-08-15 23:31:31 +02:00
resContentLength = plasmaHTCache . getResourceContentLength ( url ) ;
2007-09-26 12:11:50 +02:00
if ( ( resContentLength > maxDocLen ) & & ( ! fetchOnline ) ) {
// content may be too large to be parsed here. To be fast, we omit calculation of snippet here
return new TextSnippet ( url , null , ERROR_SOURCE_LOADING , queryhashes , " resource available, but too large: " + resContentLength + " bytes " ) ;
}
} else if ( fetchOnline ) {
2006-10-03 13:05:48 +02:00
// if not found try to download it
2006-09-20 14:25:07 +02:00
2006-10-03 13:05:48 +02:00
// download resource using the crawler and keep resource in memory if possible
2007-10-29 02:43:20 +01:00
plasmaHTCache . Entry entry = plasmaSwitchboard . getSwitchboard ( ) . crawlQueues . loadResourceFromWeb ( url , timeout , true , true ) ;
2006-09-20 14:25:07 +02:00
2007-12-12 01:33:26 +01:00
// place entry on crawl queue
plasmaHTCache . push ( entry ) ;
2006-10-03 13:05:48 +02:00
// getting resource metadata (e.g. the http headers for http resources)
if ( entry ! = null ) {
resInfo = entry . getDocumentInfo ( ) ;
2007-09-05 11:01:35 +02:00
2006-10-03 13:05:48 +02:00
// read resource body (if it is there)
byte [ ] resourceArray = entry . cacheArray ( ) ;
if ( resourceArray ! = null ) {
resContent = new ByteArrayInputStream ( resourceArray ) ;
resContentLength = resourceArray . length ;
} else {
2007-08-15 23:31:31 +02:00
resContent = plasmaHTCache . getResourceContentStream ( url ) ;
resContentLength = plasmaHTCache . getResourceContentLength ( url ) ;
2006-10-03 13:05:48 +02:00
}
}
2006-10-02 19:18:24 +02:00
2006-10-03 13:05:48 +02:00
// if it is still not available, report an error
2007-03-13 23:18:36 +01:00
if ( resContent = = null ) return new TextSnippet ( url , null , ERROR_RESOURCE_LOADING , queryhashes , " error loading resource, plasmaHTCache.Entry cache is NULL " ) ;
2006-10-02 19:18:24 +02:00
2005-06-23 14:12:12 +02:00
source = SOURCE_WEB ;
2006-10-03 13:05:48 +02:00
} else {
2007-03-13 23:18:36 +01:00
return new TextSnippet ( url , null , ERROR_SOURCE_LOADING , queryhashes , " no resource available " ) ;
2005-06-23 14:12:12 +02:00
}
2006-09-20 14:25:07 +02:00
} catch ( Exception e ) {
2007-10-29 02:43:20 +01:00
e . printStackTrace ( ) ;
2007-03-13 23:18:36 +01:00
return new TextSnippet ( url , null , ERROR_SOURCE_LOADING , queryhashes , " error loading resource: " + e . getMessage ( ) ) ;
2006-10-03 13:05:48 +02:00
}
2007-09-05 11:01:35 +02:00
2006-09-20 14:25:07 +02:00
/ * = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
* PARSING RESOURCE
* = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = * /
plasmaParserDocument document = null ;
try {
2007-06-09 17:22:37 +02:00
document = parseDocument ( url , resContentLength , resContent , resInfo ) ;
2006-09-20 14:25:07 +02:00
} catch ( ParserException e ) {
2007-03-13 23:18:36 +01:00
return new TextSnippet ( url , null , ERROR_PARSER_FAILED , queryhashes , e . getMessage ( ) ) ; // cannot be parsed
2006-10-03 13:05:48 +02:00
} finally {
try { resContent . close ( ) ; } catch ( Exception e ) { /* ignore this */ }
2006-09-20 14:25:07 +02:00
}
2007-03-13 23:18:36 +01:00
if ( document = = null ) return new TextSnippet ( url , null , ERROR_PARSER_FAILED , queryhashes , " parser error/failed " ) ; // cannot be parsed
2006-12-08 03:14:56 +01:00
2006-09-20 14:25:07 +02:00
/ * = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
* COMPUTE SNIPPET
2007-06-09 17:22:37 +02:00
* = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = * /
2007-09-05 11:01:35 +02:00
yacyURL resFavicon = document . getFavicon ( ) ;
if ( resFavicon ! = null ) faviconCache . put ( url . hash ( ) , resFavicon ) ;
2005-06-23 14:12:12 +02:00
// we have found a parseable non-empty file: use the lines
2006-12-08 03:14:56 +01:00
// compute snippet from text
2008-01-19 01:40:19 +01:00
final Iterator < StringBuffer > sentences = document . getSentences ( pre ) ;
2007-06-09 17:22:37 +02:00
if ( sentences = = null ) return new TextSnippet ( url , null , ERROR_PARSER_NO_LINES , queryhashes , " parser returned no sentences " , resFavicon ) ;
2007-03-13 23:18:36 +01:00
Object [ ] tsr = computeTextSnippet ( sentences , queryhashes , snippetMaxLength ) ;
String textline = ( tsr = = null ) ? null : ( String ) tsr [ 0 ] ;
2008-01-19 01:40:19 +01:00
Set < String > remainingHashes = ( tsr = = null ) ? queryhashes : ( Set ) tsr [ 1 ] ;
2006-12-08 03:14:56 +01:00
// compute snippet from media
String audioline = computeMediaSnippet ( document . getAudiolinks ( ) , queryhashes ) ;
String videoline = computeMediaSnippet ( document . getVideolinks ( ) , queryhashes ) ;
String appline = computeMediaSnippet ( document . getApplinks ( ) , queryhashes ) ;
//String hrefline = computeMediaSnippet(document.getAnchors(), queryhashes);
//String imageline = computeMediaSnippet(document.getAudiolinks(), queryhashes);
line = " " ;
if ( audioline ! = null ) line + = ( line . length ( ) = = 0 ) ? audioline : " <br /> " + audioline ;
if ( videoline ! = null ) line + = ( line . length ( ) = = 0 ) ? videoline : " <br /> " + videoline ;
if ( appline ! = null ) line + = ( line . length ( ) = = 0 ) ? appline : " <br /> " + appline ;
//if (hrefline != null) line += (line.length() == 0) ? hrefline : "<br />" + hrefline;
if ( textline ! = null ) line + = ( line . length ( ) = = 0 ) ? textline : " <br /> " + textline ;
2007-06-09 17:22:37 +02:00
if ( ( line = = null ) | | ( remainingHashes . size ( ) > 0 ) ) return new TextSnippet ( url , null , ERROR_NO_MATCH , remainingHashes , " no matching snippet found " , resFavicon ) ;
2005-07-01 01:19:08 +02:00
if ( line . length ( ) > snippetMaxLength ) line = line . substring ( 0 , snippetMaxLength ) ;
2005-06-23 14:12:12 +02:00
// finally store this snippet in our own cache
2007-09-05 11:01:35 +02:00
storeToCache ( wordhashes , url . hash ( ) , line ) ;
2006-12-08 03:14:56 +01:00
document . close ( ) ;
2007-06-09 17:22:37 +02:00
return new TextSnippet ( url , line , source , null , null , resFavicon ) ;
2006-09-18 02:37:02 +02:00
}
2006-09-20 14:25:07 +02:00
/ * *
* Tries to load and parse a resource specified by it ' s URL .
* If the resource is not stored in cache and if fetchOnline is set the
* this function tries to download the resource from web .
*
* @param url the URL of the resource
* @param fetchOnline specifies if the resource should be loaded from web if it ' as not available in the cache
* @return the parsed document as { @link plasmaParserDocument }
* /
2007-09-05 11:01:35 +02:00
public static plasmaParserDocument retrieveDocument ( yacyURL url , boolean fetchOnline , int timeout , boolean forText ) {
2006-12-11 02:31:23 +01:00
// load resource
long resContentLength = 0 ;
InputStream resContent = null ;
IResourceInfo resInfo = null ;
2006-09-18 02:37:02 +02:00
try {
2006-12-11 02:31:23 +01:00
// trying to load the resource from the cache
2007-08-15 23:31:31 +02:00
resContent = plasmaHTCache . getResourceContentStream ( url ) ;
2006-12-11 02:31:23 +01:00
if ( resContent ! = null ) {
// if the content was found
2007-08-15 23:31:31 +02:00
resContentLength = plasmaHTCache . getResourceContentLength ( url ) ;
2006-12-11 02:31:23 +01:00
} else if ( fetchOnline ) {
// if not found try to download it
2006-09-20 14:25:07 +02:00
2006-12-11 02:31:23 +01:00
// download resource using the crawler and keep resource in memory if possible
2007-10-29 02:43:20 +01:00
plasmaHTCache . Entry entry = plasmaSwitchboard . getSwitchboard ( ) . crawlQueues . loadResourceFromWeb ( url , timeout , true , forText ) ;
2006-12-11 02:31:23 +01:00
// getting resource metadata (e.g. the http headers for http resources)
2006-10-03 13:05:48 +02:00
if ( entry ! = null ) {
2006-12-11 02:31:23 +01:00
resInfo = entry . getDocumentInfo ( ) ;
// read resource body (if it is there)
byte [ ] resourceArray = entry . cacheArray ( ) ;
2006-10-03 13:05:48 +02:00
if ( resourceArray ! = null ) {
2006-12-11 02:31:23 +01:00
resContent = new ByteArrayInputStream ( resourceArray ) ;
resContentLength = resourceArray . length ;
2006-10-03 13:05:48 +02:00
} else {
2007-08-15 23:31:31 +02:00
resContent = plasmaHTCache . getResourceContentStream ( url ) ;
resContentLength = plasmaHTCache . getResourceContentLength ( url ) ;
2006-10-03 13:05:48 +02:00
}
}
2006-12-11 02:31:23 +01:00
// if it is still not available, report an error
if ( resContent = = null ) {
serverLog . logFine ( " snippet fetch " , " plasmaHTCache.Entry cache is NULL for url " + url ) ;
return null ;
}
2006-09-20 14:25:07 +02:00
} else {
2006-12-11 02:31:23 +01:00
serverLog . logFine ( " snippet fetch " , " no resource available for url " + url ) ;
return null ;
2006-09-18 02:37:02 +02:00
}
2006-09-20 14:25:07 +02:00
} catch ( Exception e ) {
2006-12-11 02:31:23 +01:00
serverLog . logFine ( " snippet fetch " , " error loading resource: " + e . getMessage ( ) + " for url " + url ) ;
2006-09-18 02:37:02 +02:00
return null ;
2006-12-11 02:31:23 +01:00
}
2006-09-20 14:25:07 +02:00
2006-12-11 02:31:23 +01:00
// parse resource
plasmaParserDocument document = null ;
try {
document = parseDocument ( url , resContentLength , resContent , resInfo ) ;
} catch ( ParserException e ) {
serverLog . logFine ( " snippet fetch " , " parser error " + e . getMessage ( ) + " for url " + url ) ;
return null ;
} finally {
try { resContent . close ( ) ; } catch ( Exception e ) { }
}
return document ;
2005-06-23 14:12:12 +02:00
}
2006-12-11 02:31:23 +01:00
2005-06-23 14:12:12 +02:00
2007-08-15 13:36:59 +02:00
public static void storeToCache ( String wordhashes , String urlhash , String snippet ) {
2005-06-08 02:52:24 +02:00
// generate key
String key = urlhash + wordhashes ;
// do nothing if snippet is known
if ( snippetsCache . containsKey ( key ) ) return ;
// learn new snippet
snippetsScore . addScore ( key , snippetsScoreCounter + + ) ;
snippetsCache . put ( key , snippet ) ;
// care for counter
if ( snippetsScoreCounter = = java . lang . Integer . MAX_VALUE ) {
snippetsScoreCounter = 0 ;
2007-12-28 19:47:45 +01:00
snippetsScore = new kelondroMScoreCluster < String > ( ) ;
2008-01-19 01:40:19 +01:00
snippetsCache = new HashMap < String , String > ( ) ;
2005-06-08 02:52:24 +02:00
}
// flush cache if cache is full
while ( snippetsCache . size ( ) > maxCache ) {
key = ( String ) snippetsScore . getMinObject ( ) ;
snippetsScore . deleteScore ( key ) ;
snippetsCache . remove ( key ) ;
}
}
2007-08-15 13:36:59 +02:00
private static String retrieveFromCache ( String wordhashes , String urlhash ) {
2005-06-08 02:52:24 +02:00
// generate key
String key = urlhash + wordhashes ;
2008-01-19 01:40:19 +01:00
return snippetsCache . get ( key ) ;
2005-06-08 02:52:24 +02:00
}
2008-01-22 12:51:43 +01:00
private static String computeMediaSnippet ( Map < yacyURL , String > media , Set < String > queryhashes ) {
Iterator < Map . Entry < yacyURL , String > > i = media . entrySet ( ) . iterator ( ) ;
Map . Entry < yacyURL , String > entry ;
yacyURL url ;
String desc ;
2008-01-19 01:40:19 +01:00
Set < String > s ;
2006-12-08 03:14:56 +01:00
String result = " " ;
while ( i . hasNext ( ) ) {
2008-01-19 01:40:19 +01:00
entry = i . next ( ) ;
url = entry . getKey ( ) ;
desc = entry . getValue ( ) ;
2008-01-22 12:51:43 +01:00
s = removeAppearanceHashes ( url . toNormalform ( false , false ) , queryhashes ) ;
2006-12-08 03:14:56 +01:00
if ( s . size ( ) = = 0 ) {
result + = " <br /><a href= \" " + url + " \" > " + ( ( desc . length ( ) = = 0 ) ? url : desc ) + " </a> " ;
continue ;
}
s = removeAppearanceHashes ( desc , s ) ;
if ( s . size ( ) = = 0 ) {
result + = " <br /><a href= \" " + url + " \" > " + ( ( desc . length ( ) = = 0 ) ? url : desc ) + " </a> " ;
continue ;
}
}
if ( result . length ( ) = = 0 ) return null ;
return result . substring ( 6 ) ;
}
2008-01-19 01:40:19 +01:00
@SuppressWarnings ( " unchecked " )
2007-08-15 13:36:59 +02:00
private static Object [ ] /*{String - the snippet, Set - remaining hashes}*/
2008-01-19 01:40:19 +01:00
computeTextSnippet ( Iterator < StringBuffer > sentences , Set < String > queryhashes , int maxLength ) {
2005-07-20 15:03:41 +02:00
try {
2006-10-07 02:06:09 +02:00
if ( sentences = = null ) return null ;
2005-08-15 01:35:18 +02:00
if ( ( queryhashes = = null ) | | ( queryhashes . size ( ) = = 0 ) ) return null ;
2008-01-19 01:40:19 +01:00
Iterator < String > j ;
HashMap < String , Integer > hs ;
2007-01-29 02:11:22 +01:00
StringBuffer sentence ;
2008-01-19 01:40:19 +01:00
TreeMap < Integer , StringBuffer > os = new TreeMap < Integer , StringBuffer > ( ) ;
2006-12-09 03:13:43 +01:00
int uniqCounter = 9999 ;
int score ;
while ( sentences . hasNext ( ) ) {
2008-01-19 01:40:19 +01:00
sentence = sentences . next ( ) ;
2007-03-13 23:18:36 +01:00
hs = hashSentence ( sentence . toString ( ) ) ;
j = queryhashes . iterator ( ) ;
score = 0 ;
2008-01-19 01:40:19 +01:00
while ( j . hasNext ( ) ) { if ( hs . containsKey ( j . next ( ) ) ) score + + ; }
2007-03-13 23:18:36 +01:00
if ( score > 0 ) {
os . put ( new Integer ( 1000000 * score - sentence . length ( ) * 10000 + uniqCounter - - ) , sentence ) ;
2005-06-23 14:12:12 +02:00
}
2005-06-08 02:52:24 +02:00
}
2006-12-09 03:13:43 +01:00
String result ;
2008-01-19 01:40:19 +01:00
Set < String > remaininghashes ;
2006-12-09 03:13:43 +01:00
while ( os . size ( ) > 0 ) {
2008-01-19 01:40:19 +01:00
sentence = os . remove ( os . lastKey ( ) ) ; // sentence with the biggest score
2007-03-13 23:18:36 +01:00
Object [ ] tsr = computeTextSnippet ( sentence . toString ( ) , queryhashes , maxLength ) ;
if ( tsr = = null ) continue ;
result = ( String ) tsr [ 0 ] ;
2006-12-09 03:13:43 +01:00
if ( ( result ! = null ) & & ( result . length ( ) > 0 ) ) {
2008-01-19 01:40:19 +01:00
remaininghashes = ( Set < String > ) tsr [ 1 ] ;
2006-12-09 03:13:43 +01:00
if ( remaininghashes . size ( ) = = 0 ) {
// we have found the snippet
2007-03-13 23:18:36 +01:00
return new Object [ ] { result , remaininghashes } ;
2006-12-09 03:13:43 +01:00
} else if ( remaininghashes . size ( ) < queryhashes . size ( ) ) {
// the result has not all words in it.
// find another sentence that represents the missing other words
// and find recursively more sentences
maxLength = maxLength - result . length ( ) ;
if ( maxLength < 20 ) maxLength = 20 ;
2007-03-13 23:18:36 +01:00
tsr = computeTextSnippet ( os . values ( ) . iterator ( ) , remaininghashes , maxLength ) ;
2007-03-16 14:25:56 +01:00
if ( tsr = = null ) return null ;
2007-03-13 23:18:36 +01:00
String nextSnippet = ( String ) tsr [ 0 ] ;
if ( nextSnippet = = null ) return tsr ;
return new Object [ ] { result + ( " / " + nextSnippet ) , tsr [ 1 ] } ;
2006-12-09 03:13:43 +01:00
} else {
// error
//assert remaininghashes.size() < queryhashes.size() : "remaininghashes.size() = " + remaininghashes.size() + ", queryhashes.size() = " + queryhashes.size() + ", sentence = '" + sentence + "', result = '" + result + "'";
continue ;
}
2005-08-15 01:35:18 +02:00
}
2005-06-24 09:41:07 +02:00
}
2006-12-09 03:13:43 +01:00
return null ;
2006-12-08 03:14:56 +01:00
} catch ( IndexOutOfBoundsException e ) {
log . logSevere ( " computeSnippet: error with string generation " , e ) ;
2007-03-13 23:18:36 +01:00
return new Object [ ] { null , queryhashes } ;
2006-12-08 03:14:56 +01:00
}
}
2007-08-15 13:36:59 +02:00
private static Object [ ] /*{String - the snippet, Set - remaining hashes}*/
2008-01-19 01:40:19 +01:00
computeTextSnippet ( String sentence , Set < String > queryhashes , int maxLength ) {
2006-12-08 03:14:56 +01:00
try {
if ( sentence = = null ) return null ;
if ( ( queryhashes = = null ) | | ( queryhashes . size ( ) = = 0 ) ) return null ;
String hash ;
// find all hashes that appear in the sentence
2008-01-19 01:40:19 +01:00
HashMap < String , Integer > hs = hashSentence ( sentence ) ;
Iterator < String > j = queryhashes . iterator ( ) ;
2005-08-15 01:35:18 +02:00
Integer pos ;
2006-12-08 03:14:56 +01:00
int p , minpos = sentence . length ( ) , maxpos = - 1 ;
2008-01-19 01:40:19 +01:00
HashSet < String > remainingHashes = new HashSet < String > ( ) ;
2005-08-15 01:35:18 +02:00
while ( j . hasNext ( ) ) {
2008-01-19 01:40:19 +01:00
hash = j . next ( ) ;
pos = hs . get ( hash ) ;
2007-03-13 23:18:36 +01:00
if ( pos = = null ) {
remainingHashes . add ( hash ) ;
} else {
2005-08-15 01:35:18 +02:00
p = pos . intValue ( ) ;
if ( p > maxpos ) maxpos = p ;
if ( p < minpos ) minpos = p ;
}
2005-06-30 20:54:00 +02:00
}
2005-08-15 01:35:18 +02:00
// check result size
maxpos = maxpos + 10 ;
2006-12-08 03:14:56 +01:00
if ( maxpos > sentence . length ( ) ) maxpos = sentence . length ( ) ;
2005-08-15 01:35:18 +02:00
if ( minpos < 0 ) minpos = 0 ;
// we have a result, but is it short enough?
if ( maxpos - minpos + 10 > maxLength ) {
// the string is too long, even if we cut at both ends
// so cut here in the middle of the string
2006-12-08 03:14:56 +01:00
int lenb = sentence . length ( ) ;
sentence = sentence . substring ( 0 , ( minpos + 20 > sentence . length ( ) ) ? sentence . length ( ) : minpos + 20 ) . trim ( ) +
2005-08-15 01:35:18 +02:00
" [..] " +
2006-12-08 03:14:56 +01:00
sentence . substring ( ( maxpos + 26 > sentence . length ( ) ) ? sentence . length ( ) : maxpos + 26 ) . trim ( ) ;
maxpos = maxpos + lenb - sentence . length ( ) + 6 ;
2005-08-15 01:35:18 +02:00
}
if ( maxpos > maxLength ) {
// the string is too long, even if we cut it at the end
// so cut it here at both ends at once
2007-09-06 03:28:35 +02:00
assert maxpos > = minpos ;
int newlen = Math . max ( 10 , maxpos - minpos + 10 ) ;
2005-08-15 01:35:18 +02:00
int around = ( maxLength - newlen ) / 2 ;
2007-09-06 03:28:35 +02:00
assert minpos - around < sentence . length ( ) : " maxpos = " + maxpos + " , minpos = " + minpos + " , around = " + around + " , sentence.length() = " + sentence . length ( ) ;
assert ( ( maxpos + around ) < = sentence . length ( ) ) & & ( ( maxpos + around ) < = sentence . length ( ) ) : " maxpos = " + maxpos + " , minpos = " + minpos + " , around = " + around + " , sentence.length() = " + sentence . length ( ) ;
2006-12-08 03:14:56 +01:00
sentence = " [..] " + sentence . substring ( minpos - around , ( ( maxpos + around ) > sentence . length ( ) ) ? sentence . length ( ) : ( maxpos + around ) ) . trim ( ) + " [..] " ;
2005-08-15 01:35:18 +02:00
minpos = around ;
2006-12-08 03:14:56 +01:00
maxpos = sentence . length ( ) - around - 5 ;
2005-08-15 01:35:18 +02:00
}
2006-12-08 03:14:56 +01:00
if ( sentence . length ( ) > maxLength ) {
// trim sentence, 1st step (cut at right side)
sentence = sentence . substring ( 0 , maxpos ) . trim ( ) + " [..] " ;
2005-08-15 01:35:18 +02:00
}
2006-12-08 03:14:56 +01:00
if ( sentence . length ( ) > maxLength ) {
// trim sentence, 2nd step (cut at left side)
sentence = " [..] " + sentence . substring ( minpos ) . trim ( ) ;
2005-08-15 01:35:18 +02:00
}
2006-12-08 03:14:56 +01:00
if ( sentence . length ( ) > maxLength ) {
// trim sentence, 3rd step (cut in the middle)
sentence = sentence . substring ( 6 , 20 ) . trim ( ) + " [..] " + sentence . substring ( sentence . length ( ) - 26 , sentence . length ( ) - 6 ) . trim ( ) ;
2005-08-15 01:35:18 +02:00
}
2007-03-13 23:18:36 +01:00
return new Object [ ] { sentence , remainingHashes } ;
2005-07-20 15:03:41 +02:00
} catch ( IndexOutOfBoundsException e ) {
2005-08-30 23:32:59 +02:00
log . logSevere ( " computeSnippet: error with string generation " , e ) ;
2006-12-08 03:14:56 +01:00
return null ;
}
}
2008-01-18 18:14:02 +01:00
public static ArrayList < MediaSnippet > retrieveMediaSnippets ( yacyURL url , Set < String > queryhashes , int mediatype , boolean fetchOnline , int timeout ) {
2006-12-11 02:31:23 +01:00
if ( queryhashes . size ( ) = = 0 ) {
serverLog . logFine ( " snippet fetch " , " no query hashes given for url " + url ) ;
2008-01-18 18:14:02 +01:00
return new ArrayList < MediaSnippet > ( ) ;
2006-12-11 02:31:23 +01:00
}
2006-12-20 16:44:29 +01:00
2006-12-19 04:10:46 +01:00
plasmaParserDocument document = retrieveDocument ( url , fetchOnline , timeout , false ) ;
2008-01-18 18:14:02 +01:00
ArrayList < MediaSnippet > a = new ArrayList < MediaSnippet > ( ) ;
2006-12-11 02:31:23 +01:00
if ( document ! = null ) {
2007-09-04 01:43:55 +02:00
if ( ( mediatype = = plasmaSearchQuery . CONTENTDOM_ALL ) | | ( mediatype = = plasmaSearchQuery . CONTENTDOM_AUDIO ) ) a . addAll ( computeMediaSnippets ( document , queryhashes , plasmaSearchQuery . CONTENTDOM_AUDIO ) ) ;
if ( ( mediatype = = plasmaSearchQuery . CONTENTDOM_ALL ) | | ( mediatype = = plasmaSearchQuery . CONTENTDOM_VIDEO ) ) a . addAll ( computeMediaSnippets ( document , queryhashes , plasmaSearchQuery . CONTENTDOM_VIDEO ) ) ;
if ( ( mediatype = = plasmaSearchQuery . CONTENTDOM_ALL ) | | ( mediatype = = plasmaSearchQuery . CONTENTDOM_APP ) ) a . addAll ( computeMediaSnippets ( document , queryhashes , plasmaSearchQuery . CONTENTDOM_APP ) ) ;
if ( ( mediatype = = plasmaSearchQuery . CONTENTDOM_ALL ) | | ( mediatype = = plasmaSearchQuery . CONTENTDOM_IMAGE ) ) a . addAll ( computeImageSnippets ( document , queryhashes ) ) ;
2006-12-11 02:31:23 +01:00
}
return a ;
}
2008-01-18 18:14:02 +01:00
public static ArrayList < MediaSnippet > computeMediaSnippets ( plasmaParserDocument document , Set < String > queryhashes , int mediatype ) {
2006-12-11 02:31:23 +01:00
2008-01-18 18:14:02 +01:00
if ( document = = null ) return new ArrayList < MediaSnippet > ( ) ;
2008-01-22 12:51:43 +01:00
Map < yacyURL , String > media = null ;
2007-09-04 01:43:55 +02:00
if ( mediatype = = plasmaSearchQuery . CONTENTDOM_AUDIO ) media = document . getAudiolinks ( ) ;
else if ( mediatype = = plasmaSearchQuery . CONTENTDOM_VIDEO ) media = document . getVideolinks ( ) ;
else if ( mediatype = = plasmaSearchQuery . CONTENTDOM_APP ) media = document . getApplinks ( ) ;
2006-12-11 02:31:23 +01:00
if ( media = = null ) return null ;
2008-01-22 12:51:43 +01:00
Iterator < Map . Entry < yacyURL , String > > i = media . entrySet ( ) . iterator ( ) ;
Map . Entry < yacyURL , String > entry ;
yacyURL url ;
String desc ;
2008-01-18 18:14:02 +01:00
Set < String > s ;
ArrayList < MediaSnippet > result = new ArrayList < MediaSnippet > ( ) ;
2006-12-11 02:31:23 +01:00
while ( i . hasNext ( ) ) {
2008-01-18 18:14:02 +01:00
entry = i . next ( ) ;
url = entry . getKey ( ) ;
desc = entry . getValue ( ) ;
2008-01-22 12:51:43 +01:00
s = removeAppearanceHashes ( url . toNormalform ( false , false ) , queryhashes ) ;
2006-12-11 02:31:23 +01:00
if ( s . size ( ) = = 0 ) {
2006-12-12 03:09:25 +01:00
result . add ( new MediaSnippet ( mediatype , url , desc , null ) ) ;
2006-12-11 02:31:23 +01:00
continue ;
}
s = removeAppearanceHashes ( desc , s ) ;
if ( s . size ( ) = = 0 ) {
2006-12-12 03:09:25 +01:00
result . add ( new MediaSnippet ( mediatype , url , desc , null ) ) ;
2006-12-11 02:31:23 +01:00
continue ;
}
}
return result ;
}
2008-01-18 18:14:02 +01:00
public static ArrayList < MediaSnippet > computeImageSnippets ( plasmaParserDocument document , Set < String > queryhashes ) {
2006-12-11 02:31:23 +01:00
2008-01-18 18:14:02 +01:00
TreeSet < htmlFilterImageEntry > images = document . getImages ( ) ;
2006-12-11 02:31:23 +01:00
2008-01-18 18:14:02 +01:00
Iterator < htmlFilterImageEntry > i = images . iterator ( ) ;
2006-12-11 02:31:23 +01:00
htmlFilterImageEntry ientry ;
2008-01-22 12:51:43 +01:00
yacyURL url ;
String desc ;
2008-01-18 18:14:02 +01:00
Set < String > s ;
ArrayList < MediaSnippet > result = new ArrayList < MediaSnippet > ( ) ;
2006-12-11 02:31:23 +01:00
while ( i . hasNext ( ) ) {
2008-01-18 18:14:02 +01:00
ientry = i . next ( ) ;
2008-01-22 12:51:43 +01:00
url = ientry . url ( ) ;
2007-10-19 06:13:46 +02:00
desc = ientry . alt ( ) ;
2008-01-22 12:51:43 +01:00
s = removeAppearanceHashes ( url . toNormalform ( false , false ) , queryhashes ) ;
2006-12-11 02:31:23 +01:00
if ( s . size ( ) = = 0 ) {
2007-09-04 01:43:55 +02:00
result . add ( new MediaSnippet ( plasmaSearchQuery . CONTENTDOM_IMAGE , url , desc , ientry . width ( ) + " x " + ientry . height ( ) ) ) ;
2006-12-11 02:31:23 +01:00
continue ;
}
s = removeAppearanceHashes ( desc , s ) ;
if ( s . size ( ) = = 0 ) {
2007-09-04 01:43:55 +02:00
result . add ( new MediaSnippet ( plasmaSearchQuery . CONTENTDOM_IMAGE , url , desc , ientry . width ( ) + " x " + ientry . height ( ) ) ) ;
2006-12-11 02:31:23 +01:00
continue ;
}
}
return result ;
}
2008-01-18 18:14:02 +01:00
private static Set < String > removeAppearanceHashes ( String sentence , Set < String > queryhashes ) {
2006-12-08 03:14:56 +01:00
// remove all hashes that appear in the sentence
if ( sentence = = null ) return queryhashes ;
2008-01-18 18:14:02 +01:00
HashMap < String , Integer > hs = hashSentence ( sentence ) ;
Iterator < String > j = queryhashes . iterator ( ) ;
2006-12-08 03:14:56 +01:00
String hash ;
Integer pos ;
2008-01-18 18:14:02 +01:00
Set < String > remaininghashes = new HashSet < String > ( ) ;
2006-12-08 03:14:56 +01:00
while ( j . hasNext ( ) ) {
2008-01-18 18:14:02 +01:00
hash = j . next ( ) ;
pos = hs . get ( hash ) ;
2006-12-08 03:14:56 +01:00
if ( pos = = null ) {
remaininghashes . add ( new String ( hash ) ) ;
}
2005-07-20 15:03:41 +02:00
}
2006-12-08 03:14:56 +01:00
return remaininghashes ;
2005-06-23 14:12:12 +02:00
}
2005-06-24 09:41:07 +02:00
2008-01-18 18:14:02 +01:00
private static HashMap < String , Integer > hashSentence ( String sentence ) {
2005-06-30 20:54:00 +02:00
// generates a word-wordPos mapping
2008-01-18 18:14:02 +01:00
HashMap < String , Integer > map = new HashMap < String , Integer > ( ) ;
Enumeration < StringBuffer > words = plasmaCondenser . wordTokenizer ( sentence , " UTF-8 " , 0 ) ;
2005-06-30 20:54:00 +02:00
int pos = 0 ;
2007-01-29 02:11:22 +01:00
StringBuffer word ;
2007-09-06 03:28:35 +02:00
String hash ;
2005-06-30 20:54:00 +02:00
while ( words . hasMoreElements ( ) ) {
2008-01-18 18:14:02 +01:00
word = words . nextElement ( ) ;
2007-09-06 03:28:35 +02:00
hash = plasmaCondenser . word2hash ( new String ( word ) ) ;
if ( ! map . containsKey ( hash ) ) map . put ( hash , new Integer ( pos ) ) ; // dont overwrite old values, that leads to too far word distances
2005-06-30 20:54:00 +02:00
pos + = word . length ( ) + 1 ;
}
return map ;
2005-06-08 02:52:24 +02:00
}
2005-06-23 14:12:12 +02:00
2007-09-05 11:01:35 +02:00
    /**
     * Parse the resource without pre-known resource metadata; the metadata is
     * determined (cache lookup, web head request, file-extension guess) by the
     * four-argument overload this delegates to.
     * @param url the URL of the resource
     * @param contentLength the content length of the resource
     * @param resourceStream the resource body as stream
     * @return the extracted document, or null if parsing was not possible
     * @throws ParserException if the parser failed
     */
    public static plasmaParserDocument parseDocument(yacyURL url, long contentLength, InputStream resourceStream) throws ParserException {
        return parseDocument(url, contentLength, resourceStream, null);
    }
2006-10-03 13:05:48 +02:00
/ * *
* Parse the resource
* @param url the URL of the resource
* @param contentLength the contentLength of the resource
* @param resourceStream the resource body as stream
* @param docInfo metadata about the resource
* @return the extracted data
* @throws ParserException
* /
2007-09-05 11:01:35 +02:00
public static plasmaParserDocument parseDocument ( yacyURL url , long contentLength , InputStream resourceStream , IResourceInfo docInfo ) throws ParserException {
2006-09-03 16:59:00 +02:00
try {
2006-10-03 13:05:48 +02:00
if ( resourceStream = = null ) return null ;
2006-09-03 16:59:00 +02:00
2006-10-03 13:05:48 +02:00
// STEP 1: if no resource metadata is available, try to load it from cache
2006-09-06 16:31:17 +02:00
if ( docInfo = = null ) {
2006-09-22 13:40:46 +02:00
// try to get the header from the htcache directory
try {
2007-08-15 23:31:31 +02:00
docInfo = plasmaHTCache . loadResourceInfo ( url ) ;
2006-09-22 13:55:28 +02:00
} catch ( Exception e ) {
// ignore this. resource info loading failed
2006-10-02 03:15:02 +02:00
}
}
2006-10-03 13:05:48 +02:00
// STEP 2: if the metadata is still null try to download it from web
if ( ( docInfo = = null ) & & ( url . getProtocol ( ) . startsWith ( " http " ) ) ) {
2006-09-22 13:55:28 +02:00
// TODO: we need a better solution here
2006-10-03 13:05:48 +02:00
// e.g. encapsulate this in the crawlLoader class
// getting URL mimeType
try {
2007-08-15 13:36:59 +02:00
httpHeader header = httpc . whead ( url , url . getHost ( ) , 10000 , null , null , plasmaSwitchboard . getSwitchboard ( ) . remoteProxyConfig ) ;
2007-08-15 23:31:31 +02:00
docInfo = plasmaHTCache . getResourceInfoFactory ( ) . buildResourceInfoObj ( url , header ) ;
2006-10-03 13:05:48 +02:00
} catch ( Exception e ) {
// ingore this. http header download failed
}
}
2005-11-14 11:25:43 +01:00
2006-10-03 13:05:48 +02:00
// STEP 3: if the metadata is still null try to guess the mimeType of the resource
2006-09-06 16:31:17 +02:00
if ( docInfo = = null ) {
2007-08-15 23:31:31 +02:00
String filename = plasmaHTCache . getCachePath ( url ) . getName ( ) ;
2006-09-03 16:59:00 +02:00
int p = filename . lastIndexOf ( '.' ) ;
if ( // if no extension is available
( p < 0 ) | |
// or the extension is supported by one of the parsers
( ( p > = 0 ) & & ( plasmaParser . supportedFileExtContains ( filename . substring ( p + 1 ) ) ) )
) {
String supposedMime = " text/html " ;
// if the mimeType Parser is installed we can set the mimeType to null to force
// a mimetype detection
if ( plasmaParser . supportedMimeTypesContains ( " application/octet-stream " ) ) {
supposedMime = null ;
} else if ( p ! = - 1 ) {
// otherwise we try to determine the mimeType per file Extension
supposedMime = plasmaParser . getMimeTypeByFileExt ( filename . substring ( p + 1 ) ) ;
}
2007-08-15 13:36:59 +02:00
return parser . parseSource ( url , supposedMime , null , contentLength , resourceStream ) ;
2006-09-03 16:59:00 +02:00
}
2005-06-23 14:12:12 +02:00
return null ;
2006-10-03 13:05:48 +02:00
}
2006-09-06 16:31:17 +02:00
if ( plasmaParser . supportedMimeTypesContains ( docInfo . getMimeType ( ) ) ) {
2007-08-15 13:36:59 +02:00
return parser . parseSource ( url , docInfo . getMimeType ( ) , docInfo . getCharacterEncoding ( ) , contentLength , resourceStream ) ;
2005-06-23 14:12:12 +02:00
}
2006-09-03 16:59:00 +02:00
return null ;
} catch ( InterruptedException e ) {
// interruption of thread detected
return null ;
2005-06-08 02:52:24 +02:00
}
}
2006-10-03 13:05:48 +02:00
/ * *
*
* @param url
* @param fetchOnline
* @param socketTimeout
* @return an Object array containing
* < table >
* < tr > < td > [ 0 ] < / td > < td > the content as { @link InputStream } < / td > < / tr >
* < tr > < td > [ 1 ] < / td > < td > the content - length as { @link Integer } < / td > < / tr >
* < / table >
* /
2007-09-05 11:01:35 +02:00
public static Object [ ] getResource ( yacyURL url , boolean fetchOnline , int socketTimeout , boolean forText ) {
2005-06-08 02:52:24 +02:00
// load the url as resource from the web
2006-10-03 13:05:48 +02:00
long contentLength = - 1 ;
2006-09-20 14:25:07 +02:00
2006-10-03 13:05:48 +02:00
// trying to load the resource body from cache
2007-08-15 23:31:31 +02:00
InputStream resource = plasmaHTCache . getResourceContentStream ( url ) ;
2006-10-03 13:05:48 +02:00
if ( resource ! = null ) {
2007-08-15 23:31:31 +02:00
contentLength = plasmaHTCache . getResourceContentLength ( url ) ;
2006-10-03 13:05:48 +02:00
} else if ( fetchOnline ) {
// if the content is not available in cache try to download it from web
2006-09-20 14:25:07 +02:00
// try to download the resource using a crawler
2007-10-29 02:43:20 +01:00
plasmaHTCache . Entry entry = plasmaSwitchboard . getSwitchboard ( ) . crawlQueues . loadResourceFromWeb ( url , ( socketTimeout < 0 ) ? - 1 : socketTimeout , true , forText ) ;
2007-10-29 13:14:18 +01:00
if ( entry = = null ) return null ; // not found in web
2006-09-20 14:25:07 +02:00
2006-10-03 13:05:48 +02:00
// read resource body (if it is there)
byte [ ] resourceArray = entry . cacheArray ( ) ;
2007-10-29 13:14:18 +01:00
// in case that the resource was not in ram, read it from disk
2006-10-03 13:05:48 +02:00
if ( resourceArray = = null ) {
2007-08-15 23:31:31 +02:00
resource = plasmaHTCache . getResourceContentStream ( url ) ;
contentLength = plasmaHTCache . getResourceContentLength ( url ) ;
2006-10-03 13:05:48 +02:00
} else {
resource = new ByteArrayInputStream ( resourceArray ) ;
contentLength = resourceArray . length ;
}
} else {
return null ;
2005-06-08 02:52:24 +02:00
}
2006-10-03 13:05:48 +02:00
return new Object [ ] { resource , new Long ( contentLength ) } ;
2005-06-08 02:52:24 +02:00
}
2007-03-13 23:18:36 +01:00
2007-08-26 20:18:35 +02:00
public static String failConsequences ( TextSnippet snippet , String eventID ) {
2007-03-13 23:18:36 +01:00
// problems with snippet fetch
2007-10-01 14:30:23 +02:00
if ( yacyCore . seedDB . mySeed ( ) . isVirgin ( ) ) return snippet . getError ( ) + " (no consequences, no network connection) " ; // no consequences if we do not have a network connection
2007-09-05 11:01:35 +02:00
String urlHash = snippet . getUrl ( ) . hash ( ) ;
2007-03-13 23:18:36 +01:00
String querystring = kelondroMSetTools . setToString ( snippet . getRemainingHashes ( ) , ' ' ) ;
if ( ( snippet . getErrorCode ( ) = = ERROR_SOURCE_LOADING ) | |
( snippet . getErrorCode ( ) = = ERROR_RESOURCE_LOADING ) | |
( snippet . getErrorCode ( ) = = ERROR_PARSER_FAILED ) | |
( snippet . getErrorCode ( ) = = ERROR_PARSER_NO_LINES ) ) {
2007-07-19 17:32:10 +02:00
log . logInfo ( " error: ' " + snippet . getError ( ) + " ', remove url = " + snippet . getUrl ( ) . toNormalform ( false , true ) + " , cause: " + snippet . getError ( ) ) ;
2007-08-15 13:36:59 +02:00
plasmaSwitchboard . getSwitchboard ( ) . wordIndex . loadedURL . remove ( urlHash ) ;
2007-08-26 20:18:35 +02:00
plasmaSearchEvent event = plasmaSearchEvent . getEvent ( eventID ) ;
plasmaSwitchboard . getSwitchboard ( ) . wordIndex . removeEntryMultiple ( event . getQuery ( ) . queryHashes , urlHash ) ;
event . remove ( urlHash ) ;
2005-07-12 17:09:35 +02:00
}
2007-03-13 23:18:36 +01:00
if ( snippet . getErrorCode ( ) = = ERROR_NO_MATCH ) {
2007-07-19 17:32:10 +02:00
log . logInfo ( " error: ' " + snippet . getError ( ) + " ', remove words ' " + querystring + " ' for url = " + snippet . getUrl ( ) . toNormalform ( false , true ) + " , cause: " + snippet . getError ( ) ) ;
2007-08-26 20:18:35 +02:00
plasmaSwitchboard . getSwitchboard ( ) . wordIndex . removeEntryMultiple ( snippet . remaingHashes , urlHash ) ;
plasmaSearchEvent . getEvent ( eventID ) . remove ( urlHash ) ;
2005-07-12 17:09:35 +02:00
}
2007-03-13 23:18:36 +01:00
return snippet . getError ( ) ;
2005-07-12 17:09:35 +02:00
}
2007-03-13 23:18:36 +01:00
2006-12-11 02:31:23 +01:00
}