2009-08-25 23:27:01 +02:00
// SearchEvent.java
// (C) 2005 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 10.10.2005 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
2009-09-05 22:41:21 +02:00
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
2009-08-25 23:27:01 +02:00
//
// LICENSE
2011-06-01 21:31:56 +02:00
//
2009-08-25 23:27:01 +02:00
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
2011-09-25 18:59:06 +02:00
package net.yacy.search.query ;
2009-08-25 23:27:01 +02:00
2012-07-30 10:38:23 +02:00
import java.io.IOException ;
2009-08-25 23:27:01 +02:00
import java.util.ArrayList ;
2009-11-09 20:14:51 +01:00
import java.util.Iterator ;
2010-11-28 03:57:31 +01:00
import java.util.List ;
2011-09-13 16:39:41 +02:00
import net.yacy.cora.document.ASCII ;
2012-04-22 00:04:36 +02:00
import net.yacy.cora.document.Classification ;
2010-05-25 14:54:57 +02:00
import net.yacy.cora.document.MultiProtocolURI ;
2011-05-07 09:37:46 +02:00
import net.yacy.cora.protocol.ResponseHeader ;
2011-06-13 23:44:03 +02:00
import net.yacy.cora.services.federated.yacy.CacheStrategy ;
2011-12-16 23:59:29 +01:00
import net.yacy.cora.sorting.ScoreMap ;
import net.yacy.cora.sorting.WeakPriorityBlockingQueue ;
import net.yacy.cora.sorting.WeakPriorityBlockingQueue.Element ;
import net.yacy.cora.sorting.WeakPriorityBlockingQueue.ReverseElement ;
2012-07-27 12:13:53 +02:00
import net.yacy.cora.storage.HandleSet ;
import net.yacy.cora.util.SpaceExceededException ;
2009-10-18 02:53:43 +02:00
import net.yacy.document.Condenser ;
2012-07-22 13:18:45 +02:00
import net.yacy.kelondro.data.meta.URIMetadata ;
2012-08-17 14:22:07 +02:00
import net.yacy.kelondro.data.meta.URIMetadataNode ;
2009-11-09 20:14:51 +01:00
import net.yacy.kelondro.data.word.Word ;
2012-07-27 12:13:53 +02:00
import net.yacy.kelondro.index.RowHandleSet ;
2009-10-10 01:13:30 +02:00
import net.yacy.kelondro.logging.Log ;
2011-06-01 21:31:56 +02:00
import net.yacy.kelondro.util.MemoryControl ;
2011-10-04 11:06:24 +02:00
import net.yacy.peers.SeedDB ;
2011-09-25 18:59:06 +02:00
import net.yacy.peers.graphics.ProfilingGraph ;
2010-03-20 11:28:03 +01:00
import net.yacy.repository.LoaderDispatcher ;
2012-05-04 17:28:27 +02:00
import net.yacy.search.EventTracker ;
2011-09-25 18:59:06 +02:00
import net.yacy.search.Switchboard ;
2012-08-17 15:33:02 +02:00
import net.yacy.search.index.Fulltext ;
2012-05-19 01:06:33 +02:00
import net.yacy.search.index.Segment ;
2011-09-25 18:59:06 +02:00
import net.yacy.search.snippet.MediaSnippet ;
import net.yacy.search.snippet.ResultEntry ;
import net.yacy.search.snippet.TextSnippet ;
2011-09-13 16:39:41 +02:00
import org.apache.solr.common.SolrDocument ;
2012-04-21 13:34:07 +02:00
import de.anomic.crawler.Cache ;
2010-12-06 15:34:58 +01:00
import de.anomic.data.WorkTables ;
2009-08-25 23:27:01 +02:00
2011-09-26 23:42:28 +02:00
public class SnippetProcess {
2011-06-01 21:31:56 +02:00
2012-07-12 19:54:54 +02:00
2012-07-12 19:54:54 +02:00
// shared logger for the search/snippet subsystem (channel "SEARCH")
public static Log log = new Log ( " SEARCH " ) ;
2012-07-24 17:23:29 +02:00
2012-05-19 01:06:33 +02:00
// number of concurrent snippet fetch worker threads, scaled with the CPU count (at least 4)
private final static int SNIPPET_WORKER_THREADS = Math . max ( 4 , Runtime . getRuntime ( ) . availableProcessors ( ) * 2 ) ;
2009-08-25 23:27:01 +02:00
// input values
2011-09-26 23:42:28 +02:00
final RWIProcess rankingProcess ; // ordered search results, grows dynamically as all the query threads enrich this container
2009-08-30 12:28:23 +02:00
// the query that this process produces results for
QueryParams query ;
2011-10-04 11:06:24 +02:00
private final SeedDB peers ;
2010-12-06 15:34:58 +01:00
// work tables; used here to filter out urls recorded as failed
private final WorkTables workTables ;
2011-06-01 21:31:56 +02:00
2009-08-25 23:27:01 +02:00
// result values
2010-03-20 11:28:03 +01:00
protected final LoaderDispatcher loader ;
2009-08-27 16:34:41 +02:00
// pool of snippet fetch workers; null until deployWorker() allocates it
protected Worker [ ] workerThreads ;
2010-10-04 13:54:48 +02:00
protected final WeakPriorityBlockingQueue < ResultEntry > result ;
protected final WeakPriorityBlockingQueue < MediaSnippet > images ; // container to sort images by size
2010-04-15 15:22:59 +02:00
protected final HandleSet snippetFetchWordHashes ; // a set of word hashes that are used to match with the snippets
2009-08-25 23:27:01 +02:00
// accumulated times (ms) spent on url retrieval and snippet computation
long urlRetrievalAllTime ;
long snippetComputationAllTime ;
2009-09-18 11:19:52 +02:00
int taketimeout ;
2012-01-10 03:00:55 +01:00
private final boolean deleteIfSnippetFail , remote ;
2011-06-23 13:57:17 +02:00
// set by setCleanupState(); prevents deployWorker() from starting new workers
private boolean cleanupState ;
2011-06-01 21:31:56 +02:00
2011-09-26 23:42:28 +02:00
public SnippetProcess (
2010-03-20 11:28:03 +01:00
final LoaderDispatcher loader ,
2011-09-26 23:42:28 +02:00
final RWIProcess rankedCache ,
2009-08-25 23:27:01 +02:00
final QueryParams query ,
2011-10-04 11:06:24 +02:00
final SeedDB peers ,
2010-12-06 15:34:58 +01:00
final WorkTables workTables ,
2011-03-21 08:50:34 +01:00
final int taketimeout ,
2012-01-10 03:00:55 +01:00
final boolean deleteIfSnippetFail ,
final boolean remote ) {
2010-10-05 19:49:53 +02:00
assert query ! = null ;
2010-03-20 11:28:03 +01:00
this . loader = loader ;
2010-10-04 13:54:48 +02:00
this . rankingProcess = rankedCache ;
2009-08-25 23:27:01 +02:00
this . query = query ;
this . peers = peers ;
2010-12-06 15:34:58 +01:00
this . workTables = workTables ;
2009-09-18 11:19:52 +02:00
this . taketimeout = taketimeout ;
2011-03-21 08:50:34 +01:00
this . deleteIfSnippetFail = deleteIfSnippetFail ;
2012-01-10 03:00:55 +01:00
this . remote = remote ;
2011-06-23 13:57:17 +02:00
this . cleanupState = false ;
2011-06-01 21:31:56 +02:00
2009-08-25 23:27:01 +02:00
this . urlRetrievalAllTime = 0 ;
this . snippetComputationAllTime = 0 ;
2012-06-08 09:14:54 +02:00
this . result = new WeakPriorityBlockingQueue < ResultEntry > ( Math . max ( 1000 , 10 * query . itemsPerPage ( ) ) , true ) ; // this is the result, enriched with snippets, ranked and ordered by ranking
this . images = new WeakPriorityBlockingQueue < MediaSnippet > ( Math . max ( 1000 , 10 * query . itemsPerPage ( ) ) , true ) ;
2011-06-01 21:31:56 +02:00
2009-08-25 23:27:01 +02:00
// snippets do not need to match with the complete query hashes,
// only with the query minus the stopwords which had not been used for the search
2010-04-15 15:22:59 +02:00
HandleSet filtered ;
try {
2012-07-27 12:13:53 +02:00
filtered = RowHandleSet . joinConstructive ( query . query_include_hashes , Switchboard . stopwordHashes ) ;
} catch ( final SpaceExceededException e ) {
2010-04-15 15:22:59 +02:00
Log . logException ( e ) ;
2012-07-27 12:13:53 +02:00
filtered = new RowHandleSet ( query . query_include_hashes . keylen ( ) , query . query_include_hashes . comparator ( ) , 0 ) ;
2010-04-15 15:22:59 +02:00
}
2012-07-09 11:14:50 +02:00
this . snippetFetchWordHashes = query . query_include_hashes . clone ( ) ;
2009-12-02 01:37:59 +01:00
if ( filtered ! = null & & ! filtered . isEmpty ( ) ) {
2010-04-15 15:22:59 +02:00
this . snippetFetchWordHashes . excludeDestructive ( Switchboard . stopwordHashes ) ;
2009-08-25 23:27:01 +02:00
}
2011-06-01 21:31:56 +02:00
2009-08-25 23:27:01 +02:00
// start worker threads to fetch urls and snippets
2009-08-30 12:28:23 +02:00
this . workerThreads = null ;
2012-05-19 01:06:33 +02:00
deployWorker ( Math . min ( SNIPPET_WORKER_THREADS , query . itemsPerPage ) , query . neededResults ( ) ) ;
2011-07-14 09:07:06 +02:00
EventTracker . update ( EventTracker . EClass . SEARCH , new ProfilingGraph . EventSearch ( query . id ( true ) , SearchEvent . Type . SNIPPETFETCH_START , ( ( this . workerThreads = = null ) ? " no " : this . workerThreads . length ) + " online snippet fetch threads started " , 0 , 0 ) , false ) ;
2009-08-25 23:27:01 +02:00
}
2011-06-01 21:31:56 +02:00
2011-06-23 13:57:17 +02:00
/** Marks this process as being cleaned up so no further workers are deployed. */
public void setCleanupState() {
    this.cleanupState = true;
}
2010-12-06 00:54:00 +01:00
/** @return accumulated milliseconds spent on url retrieval by all workers */
public long getURLRetrievalTime() {
    return this.urlRetrievalAllTime;
}
2011-06-01 21:31:56 +02:00
2010-12-06 00:54:00 +01:00
/** @return accumulated milliseconds spent on snippet computation by all workers */
public long getSnippetComputationTime() {
    return this.snippetComputationAllTime;
}
2011-06-01 21:31:56 +02:00
public ResultEntry oneResult ( final int item , final long timeout ) {
2010-12-06 00:54:00 +01:00
// check if we already retrieved this item
// (happens if a search pages is accessed a second time)
2011-06-01 21:31:56 +02:00
final long finishTime = System . currentTimeMillis ( ) + timeout ;
2011-07-14 09:07:06 +02:00
EventTracker . update ( EventTracker . EClass . SEARCH , new ProfilingGraph . EventSearch ( this . query . id ( true ) , SearchEvent . Type . ONERESULT , " started, item = " + item + " , available = " + this . result . sizeAvailable ( ) , 0 , 0 ) , false ) ;
2012-01-10 03:00:55 +01:00
//Log.logInfo("SnippetProcess", "*start method for item = " + item + "; anyWorkerAlive=" + anyWorkerAlive() + "; this.rankingProcess.isAlive() = " + this.rankingProcess.isAlive() + "; this.rankingProcess.feedingIsFinished() = " + this.rankingProcess.feedingIsFinished() + "; this.result.sizeAvailable() = " + this.result.sizeAvailable() + ", this.rankingProcess.sizeQueue() = " + this.rankingProcess.sizeQueue());
2011-09-21 12:43:08 +02:00
// we must wait some time until the first result page is full to get enough elements for ranking
final long waittimeout = System . currentTimeMillis ( ) + 300 ;
2012-01-10 03:00:55 +01:00
if ( this . remote & & item < 10 & & ! this . rankingProcess . feedingIsFinished ( ) ) {
// the first 10 results have a very special timing to get most of the remote results ordered
// before they are presented on the first lines .. yes sleeps seem to be bad. but how shall we predict how long other
// peers will take until they respond?
2012-08-18 17:48:20 +02:00
long sleep = item = = 0 ? 400 : ( 10 - item ) * 9 ; // the first result takes the longest time
2012-01-10 03:00:55 +01:00
//Log.logInfo("SnippetProcess", "SLEEP = " + sleep);
try { Thread . sleep ( sleep ) ; } catch ( final InterruptedException e1 ) { Log . logException ( e1 ) ; }
}
2012-01-17 16:44:30 +01:00
int thisRankingQueueSize , lastRankingQueueSize = 0 ;
if ( item < 10 ) {
2012-01-09 03:02:35 +01:00
while (
2012-01-17 16:44:30 +01:00
( ( thisRankingQueueSize = this . rankingProcess . sizeQueue ( ) ) > 0 | | ! this . rankingProcess . feedingIsFinished ( ) ) & &
( thisRankingQueueSize > lastRankingQueueSize | | this . result . sizeAvailable ( ) < item + 1 ) & &
2012-01-10 03:00:55 +01:00
System . currentTimeMillis ( ) < waittimeout & &
anyWorkerAlive ( )
2012-01-09 03:02:35 +01:00
) {
// wait a little time to get first results in the search
2012-01-17 16:44:30 +01:00
lastRankingQueueSize = thisRankingQueueSize ;
try { Thread . sleep ( 20 ) ; } catch ( final InterruptedException e1 ) { }
2012-01-09 03:02:35 +01:00
}
2011-09-21 12:43:08 +02:00
}
2010-12-06 00:54:00 +01:00
if ( this . result . sizeAvailable ( ) > item ) {
// we have the wanted result already in the result array .. return that
2011-06-01 21:31:56 +02:00
final ResultEntry re = this . result . element ( item ) . getElement ( ) ;
2011-07-14 09:07:06 +02:00
EventTracker . update ( EventTracker . EClass . SEARCH , new ProfilingGraph . EventSearch ( this . query . id ( true ) , SearchEvent . Type . ONERESULT , " prefetched, item = " + item + " , available = " + this . result . sizeAvailable ( ) + " : " + re . urlstring ( ) , 0 , 0 ) , false ) ;
2010-12-06 00:54:00 +01:00
return re ;
}
2011-06-01 21:31:56 +02:00
2010-12-06 00:54:00 +01:00
// finally wait until enough results are there produced from the snippet fetch process
WeakPriorityBlockingQueue . Element < ResultEntry > entry = null ;
while ( System . currentTimeMillis ( ) < finishTime ) {
2012-01-09 03:02:35 +01:00
2012-01-10 03:00:55 +01:00
//Log.logInfo("SnippetProcess", "item = " + item + "; anyWorkerAlive=" + anyWorkerAlive() + "; this.rankingProcess.isAlive() = " + this.rankingProcess.isAlive() + "; this.rankingProcess.feedingIsFinished() = " + this.rankingProcess.feedingIsFinished() + "; this.result.sizeAvailable() = " + this.result.sizeAvailable() + ", this.rankingProcess.sizeQueue() = " + this.rankingProcess.sizeQueue());
2012-01-09 03:02:35 +01:00
2011-11-24 02:30:12 +01:00
if ( ! anyWorkerAlive ( ) & & ! this . rankingProcess . isAlive ( ) & & this . result . sizeAvailable ( ) + this . rankingProcess . sizeQueue ( ) < = item & & this . rankingProcess . feedingIsFinished ( ) ) {
2012-01-10 03:00:55 +01:00
//Log.logInfo("SnippetProcess", "interrupted result fetching; item = " + item + "; this.result.sizeAvailable() = " + this.result.sizeAvailable() + ", this.rankingProcess.sizeQueue() = " + this.rankingProcess.sizeQueue() + "; this.rankingProcess.feedingIsFinished() = " + this.rankingProcess.feedingIsFinished());
2011-11-24 02:30:12 +01:00
break ; // the fail case
}
2011-11-24 00:39:34 +01:00
// deploy worker to get more results
if ( ! anyWorkerAlive ( ) ) {
2012-05-19 01:06:33 +02:00
final int neededInclPrefetch = this . query . neededResults ( ) + ( ( MemoryControl . available ( ) > 100 * 1024 * 1024 & & SNIPPET_WORKER_THREADS > = 8 ) ? this . query . itemsPerPage : 0 ) ;
deployWorker ( Math . min ( SNIPPET_WORKER_THREADS , this . query . itemsPerPage ) , neededInclPrefetch ) ;
2011-11-24 00:39:34 +01:00
}
2011-06-23 13:57:17 +02:00
try { entry = this . result . element ( item , 50 ) ; } catch ( final InterruptedException e ) { break ; }
2012-01-09 03:02:35 +01:00
if ( entry ! = null ) {
break ;
}
2010-12-06 00:54:00 +01:00
}
2011-06-01 21:31:56 +02:00
2010-12-06 00:54:00 +01:00
// finally, if there is something, return the result
if ( entry = = null ) {
2011-07-14 09:07:06 +02:00
EventTracker . update ( EventTracker . EClass . SEARCH , new ProfilingGraph . EventSearch ( this . query . id ( true ) , SearchEvent . Type . ONERESULT , " not found, item = " + item + " , available = " + this . result . sizeAvailable ( ) , 0 , 0 ) , false ) ;
2012-06-04 23:43:30 +02:00
//Log.logInfo("SnippetProcess", "NO ENTRY computed (possible timeout); anyWorkerAlive=" + anyWorkerAlive() + "; this.rankingProcess.isAlive() = " + this.rankingProcess.isAlive() + "; this.rankingProcess.feedingIsFinished() = " + this.rankingProcess.feedingIsFinished() + "; this.result.sizeAvailable() = " + this.result.sizeAvailable() + ", this.rankingProcess.sizeQueue() = " + this.rankingProcess.sizeQueue());
2010-12-06 00:54:00 +01:00
return null ;
}
2011-06-01 21:31:56 +02:00
final ResultEntry re = entry . getElement ( ) ;
2011-07-14 09:07:06 +02:00
EventTracker . update ( EventTracker . EClass . SEARCH , new ProfilingGraph . EventSearch ( this . query . id ( true ) , SearchEvent . Type . ONERESULT , " retrieved, item = " + item + " , available = " + this . result . sizeAvailable ( ) + " : " + re . urlstring ( ) , 0 , 0 ) , false ) ;
2012-06-04 23:43:30 +02:00
if ( item = = this . query . offset + this . query . itemsPerPage - 1 ) {
2012-01-09 03:02:35 +01:00
stopAllWorker ( ) ; // we don't need more
}
2010-12-06 00:54:00 +01:00
return re ;
}
2011-06-01 21:31:56 +02:00
2010-12-06 00:54:00 +01:00
private int resultCounter = 0 ;
public ResultEntry nextResult ( ) {
2012-06-05 15:04:23 +02:00
final ResultEntry re = oneResult ( this . resultCounter , Math . max ( 3000 , this . query . timeout - System . currentTimeMillis ( ) ) ) ;
2011-06-01 21:31:56 +02:00
this . resultCounter + + ;
2010-12-06 00:54:00 +01:00
return re ;
}
2011-06-01 21:31:56 +02:00
2010-12-06 00:54:00 +01:00
public MediaSnippet oneImage ( final int item ) {
// always look for a next object if there are way too less
2012-01-09 03:02:35 +01:00
if ( this . images . sizeAvailable ( ) < = item + 10 ) {
fillImagesCache ( ) ;
}
2010-12-06 00:54:00 +01:00
// check if we already retrieved the item
2012-01-09 03:02:35 +01:00
if ( this . images . sizeDrained ( ) > item ) {
return this . images . element ( item ) . getElement ( ) ;
}
2011-06-01 21:31:56 +02:00
2010-12-06 00:54:00 +01:00
// look again if there are not enough for presentation
while ( this . images . sizeAvailable ( ) < = item ) {
2012-01-09 03:02:35 +01:00
if ( fillImagesCache ( ) = = 0 ) {
break ;
}
}
if ( this . images . sizeAvailable ( ) < = item ) {
return null ;
2011-06-01 21:31:56 +02:00
}
2010-12-06 00:54:00 +01:00
// now take the specific item from the image stack
return this . images . element ( item ) . getElement ( ) ;
}
2011-06-01 21:31:56 +02:00
2010-12-06 00:54:00 +01:00
private int fillImagesCache ( ) {
2011-06-01 21:31:56 +02:00
final ResultEntry result = nextResult ( ) ;
2010-12-06 00:54:00 +01:00
int c = 0 ;
2012-01-09 03:02:35 +01:00
if ( result = = null ) {
return c ;
}
2010-12-06 00:54:00 +01:00
// iterate over all images in the result
final List < MediaSnippet > imagemedia = result . mediaSnippets ( ) ;
if ( imagemedia ! = null ) {
2011-08-02 01:32:29 +02:00
ResponseHeader header ;
2011-05-07 09:37:46 +02:00
feedloop : for ( final MediaSnippet ms : imagemedia ) {
// check cache to see if the mime type of the image url is correct
2011-08-02 01:32:29 +02:00
header = Cache . getResponseHeader ( ms . href . hash ( ) ) ;
2011-05-07 09:37:46 +02:00
if ( header ! = null ) {
// this does not work for all urls since some of them may not be in the cache
2012-01-09 03:02:35 +01:00
if ( header . mime ( ) . startsWith ( " text " ) | | header . mime ( ) . startsWith ( " application " ) ) {
continue feedloop ;
}
2011-05-07 09:37:46 +02:00
}
2011-06-01 21:31:56 +02:00
this . images . put ( new ReverseElement < MediaSnippet > ( ms , ms . ranking ) ) ; // remove smallest in case of overflow
2010-12-06 00:54:00 +01:00
c + + ;
2011-03-07 21:36:40 +01:00
//System.out.println("*** image " + UTF8.String(ms.href.hash()) + " images.size = " + images.size() + "/" + images.size());
2010-10-04 13:54:48 +02:00
}
2010-12-06 00:54:00 +01:00
}
return c ;
}
2011-06-01 21:31:56 +02:00
2010-12-06 00:54:00 +01:00
public ArrayList < WeakPriorityBlockingQueue . Element < ResultEntry > > completeResults ( final long waitingtime ) {
final long timeout = System . currentTimeMillis ( ) + waitingtime ;
2011-06-01 21:31:56 +02:00
while ( this . result . sizeAvailable ( ) < this . query . neededResults ( ) & &
2010-12-06 00:54:00 +01:00
anyWorkerAlive ( ) & &
System . currentTimeMillis ( ) < timeout ) {
2011-06-26 23:17:02 +02:00
try { Thread . sleep ( 10 ) ; } catch ( final InterruptedException e ) { }
2010-12-06 00:54:00 +01:00
//System.out.println("+++DEBUG-completeResults+++ sleeping " + 200);
}
2011-06-01 21:31:56 +02:00
return this . result . list ( Math . min ( this . query . neededResults ( ) , this . result . sizeAvailable ( ) ) ) ;
2010-12-06 00:54:00 +01:00
}
public long postRanking (
final ResultEntry rentry ,
2011-03-13 02:41:44 +01:00
final ScoreMap < String > topwords ) {
2010-12-06 00:54:00 +01:00
long r = 0 ;
2011-06-01 21:31:56 +02:00
2010-12-06 00:54:00 +01:00
// for media search: prefer pages with many links
2012-04-22 00:04:36 +02:00
r + = rentry . limage ( ) < < this . query . ranking . coeff_cathasimage ;
r + = rentry . laudio ( ) < < this . query . ranking . coeff_cathasaudio ;
r + = rentry . lvideo ( ) < < this . query . ranking . coeff_cathasvideo ;
r + = rentry . lapp ( ) < < this . query . ranking . coeff_cathasapp ;
2011-06-01 21:31:56 +02:00
2012-04-13 06:47:33 +02:00
// apply citation count
//System.out.println("POSTRANKING CITATION: references = " + rentry.referencesCount() + ", inbound = " + rentry.llocal() + ", outbound = " + rentry.lother());
2012-04-16 23:43:50 +02:00
r + = ( 128 * rentry . referencesCount ( ) / ( 1 + 2 * rentry . llocal ( ) + rentry . lother ( ) ) ) < < this . query . ranking . coeff_citation ;
2012-04-22 00:04:36 +02:00
2010-12-06 00:54:00 +01:00
// prefer hit with 'prefer' pattern
2012-01-09 03:02:35 +01:00
if ( this . query . prefer . matcher ( rentry . url ( ) . toNormalform ( true , true ) ) . matches ( ) ) {
r + = 256 < < this . query . ranking . coeff_prefer ;
}
if ( this . query . prefer . matcher ( rentry . title ( ) ) . matches ( ) ) {
r + = 256 < < this . query . ranking . coeff_prefer ;
}
2011-06-01 21:31:56 +02:00
2010-12-06 00:54:00 +01:00
// apply 'common-sense' heuristic using references
final String urlstring = rentry . url ( ) . toNormalform ( true , true ) ;
final String [ ] urlcomps = MultiProtocolURI . urlComps ( urlstring ) ;
final String [ ] descrcomps = MultiProtocolURI . splitpattern . split ( rentry . title ( ) . toLowerCase ( ) ) ;
int tc ;
2011-06-01 21:31:56 +02:00
for ( final String urlcomp : urlcomps ) {
tc = topwords . get ( urlcomp ) ;
2012-01-09 03:02:35 +01:00
if ( tc > 0 ) {
r + = Math . max ( 1 , tc ) < < this . query . ranking . coeff_urlcompintoplist ;
}
2010-12-06 00:54:00 +01:00
}
2011-06-01 21:31:56 +02:00
for ( final String descrcomp : descrcomps ) {
tc = topwords . get ( descrcomp ) ;
2012-01-09 03:02:35 +01:00
if ( tc > 0 ) {
r + = Math . max ( 1 , tc ) < < this . query . ranking . coeff_descrcompintoplist ;
}
2010-12-06 00:54:00 +01:00
}
2011-06-01 21:31:56 +02:00
2010-12-06 00:54:00 +01:00
// apply query-in-result matching
final HandleSet urlcomph = Word . words2hashesHandles ( urlcomps ) ;
final HandleSet descrcomph = Word . words2hashesHandles ( descrcomps ) ;
2012-07-09 11:14:50 +02:00
final Iterator < byte [ ] > shi = this . query . query_include_hashes . iterator ( ) ;
2010-12-06 00:54:00 +01:00
byte [ ] queryhash ;
while ( shi . hasNext ( ) ) {
queryhash = shi . next ( ) ;
2012-01-09 03:02:35 +01:00
if ( urlcomph . has ( queryhash ) ) {
r + = 256 < < this . query . ranking . coeff_appurl ;
}
if ( descrcomph . has ( queryhash ) ) {
r + = 256 < < this . query . ranking . coeff_app_dc_title ;
}
2010-12-06 00:54:00 +01:00
}
2011-06-01 21:31:56 +02:00
2010-12-06 00:54:00 +01:00
return r ;
}
public void deployWorker ( int deployCount , final int neededResults ) {
2012-01-10 03:00:55 +01:00
if ( this . cleanupState | |
( this . rankingProcess . feedingIsFinished ( ) & & this . rankingProcess . sizeQueue ( ) = = 0 ) | |
this . result . sizeAvailable ( ) > = neededResults ) {
2012-01-09 03:02:35 +01:00
return ;
}
2011-08-02 01:32:29 +02:00
Worker worker ;
2010-12-06 00:54:00 +01:00
if ( this . workerThreads = = null ) {
this . workerThreads = new Worker [ deployCount ] ;
2012-05-31 22:39:53 +02:00
synchronized ( this . workerThreads ) { try {
2011-06-01 21:31:56 +02:00
for ( int i = 0 ; i < this . workerThreads . length ; i + + ) {
2012-01-10 03:00:55 +01:00
if ( this . result . sizeAvailable ( ) > = neededResults | |
( this . rankingProcess . feedingIsFinished ( ) & & this . rankingProcess . sizeQueue ( ) = = 0 ) ) {
break ;
}
2012-07-05 09:21:27 +02:00
worker = new Worker ( this . query . maxtime , this . query . snippetCacheStrategy , neededResults ) ;
2010-12-06 00:54:00 +01:00
worker . start ( ) ;
this . workerThreads [ i ] = worker ;
2011-12-06 15:28:48 +01:00
if ( this . rankingProcess . expectMoreRemoteReferences ( ) ) {
long wait = this . rankingProcess . waitTimeRecommendation ( ) ;
2012-01-09 03:02:35 +01:00
if ( wait > 0 ) {
try { Thread . sleep ( wait ) ; } catch ( InterruptedException e ) { }
}
2011-12-06 15:28:48 +01:00
}
2010-12-06 00:54:00 +01:00
}
2012-05-31 22:39:53 +02:00
} catch ( OutOfMemoryError e ) { } }
2010-12-06 00:54:00 +01:00
} else {
// there are still worker threads running, but some may be dead.
// if we find dead workers, reanimate them
synchronized ( this . workerThreads ) {
for ( int i = 0 ; i < this . workerThreads . length ; i + + ) {
2012-01-10 03:00:55 +01:00
if ( deployCount < = 0 | |
this . result . sizeAvailable ( ) > = neededResults | |
( this . rankingProcess . feedingIsFinished ( ) & & this . rankingProcess . sizeQueue ( ) = = 0 ) ) {
break ;
}
if ( this . workerThreads [ i ] = = null | | ! this . workerThreads [ i ] . isAlive ( ) ) {
2012-07-05 09:21:27 +02:00
worker = new Worker ( this . query . maxtime , this . query . snippetCacheStrategy , neededResults ) ;
2012-01-10 03:00:55 +01:00
worker . start ( ) ;
this . workerThreads [ i ] = worker ;
deployCount - - ;
}
if ( this . rankingProcess . expectMoreRemoteReferences ( ) ) {
long wait = this . rankingProcess . waitTimeRecommendation ( ) ;
if ( wait > 0 ) {
try { Thread . sleep ( wait ) ; } catch ( InterruptedException e ) { }
}
2012-01-09 03:02:35 +01:00
}
2010-12-06 00:54:00 +01:00
}
}
}
2009-08-25 23:27:01 +02:00
}
2012-01-09 03:02:35 +01:00
2011-12-07 00:32:42 +01:00
public void stopAllWorker ( ) {
synchronized ( this . workerThreads ) {
for ( int i = 0 ; i < this . workerThreads . length ; i + + ) {
2012-01-09 03:02:35 +01:00
if ( this . workerThreads [ i ] = = null | | ! this . workerThreads [ i ] . isAlive ( ) ) {
continue ;
}
2011-12-07 00:32:42 +01:00
this . workerThreads [ i ] . pleaseStop ( ) ;
this . workerThreads [ i ] . interrupt ( ) ;
}
}
}
2011-06-01 21:31:56 +02:00
2010-12-06 00:54:00 +01:00
private boolean anyWorkerAlive ( ) {
2012-01-09 03:02:35 +01:00
if ( this . workerThreads = = null | | this . workerThreads . length = = 0 ) {
return false ;
}
2010-10-04 13:54:48 +02:00
synchronized ( this . workerThreads ) {
2011-06-01 21:31:56 +02:00
for ( final Worker workerThread : this . workerThreads ) {
if ( ( workerThread ! = null ) & &
( workerThread . isAlive ( ) ) & &
2012-01-09 03:02:35 +01:00
( workerThread . busytime ( ) < 10000 ) ) {
return true ;
}
2010-10-04 13:54:48 +02:00
}
2009-08-25 23:27:01 +02:00
}
return false ;
}
protected class Worker extends Thread {
2011-06-01 21:31:56 +02:00
2009-08-25 23:27:01 +02:00
private final long timeout ; // the date until this thread should try to work
private long lastLifeSign ; // when the last time the run()-loop was executed
2011-06-13 23:44:03 +02:00
private final CacheStrategy cacheStrategy ;
2010-01-11 00:09:48 +01:00
private final int neededResults ;
2011-06-23 13:57:17 +02:00
private boolean shallrun ;
2012-08-17 15:33:02 +02:00
private final Fulltext metadata ;
2011-06-01 21:31:56 +02:00
2012-07-05 09:21:27 +02:00
public Worker ( final long maxlifetime , final CacheStrategy cacheStrategy , final int neededResults ) {
2010-06-21 16:54:54 +02:00
this . cacheStrategy = cacheStrategy ;
2009-08-25 23:27:01 +02:00
this . lastLifeSign = System . currentTimeMillis ( ) ;
this . timeout = System . currentTimeMillis ( ) + Math . max ( 1000 , maxlifetime ) ;
2009-08-30 12:28:23 +02:00
this . neededResults = neededResults ;
2011-06-23 13:57:17 +02:00
this . shallrun = true ;
2012-08-17 15:52:33 +02:00
this . metadata = SnippetProcess . this . rankingProcess . getQuery ( ) . getSegment ( ) . fulltext ( ) ;
2009-08-25 23:27:01 +02:00
}
2010-11-28 03:57:31 +01:00
@Override
2009-08-25 23:27:01 +02:00
public void run ( ) {
// start fetching urls and snippets
2012-07-22 13:18:45 +02:00
URIMetadata page ;
2011-08-02 01:32:29 +02:00
ResultEntry resultEntry ;
2009-11-19 00:56:05 +01:00
//final int fetchAhead = snippetMode == 0 ? 0 : 10;
2011-11-25 12:23:52 +01:00
final boolean nav_topics = SnippetProcess . this . query . navigators . equals ( " all " ) | | SnippetProcess . this . query . navigators . indexOf ( " topics " , 0 ) > = 0 ;
2009-08-25 23:27:01 +02:00
try {
2010-10-04 13:54:48 +02:00
//System.out.println("DEPLOYED WORKER " + id + " FOR " + this.neededResults + " RESULTS, timeoutd = " + (this.timeout - System.currentTimeMillis()));
2011-06-23 13:57:17 +02:00
while ( this . shallrun & & System . currentTimeMillis ( ) < this . timeout ) {
2012-01-17 16:44:30 +01:00
//Log.logInfo("SnippetProcess", "***** timeleft = " + (this.timeout - System.currentTimeMillis()));
2010-11-28 03:57:31 +01:00
this . lastLifeSign = System . currentTimeMillis ( ) ;
2011-08-24 14:12:48 +02:00
2011-08-02 01:32:29 +02:00
if ( MemoryControl . shortStatus ( ) ) {
break ;
}
2010-11-28 03:57:31 +01:00
2012-01-17 16:44:30 +01:00
// check if we have enough; we stop only if we can fetch online; otherwise its better to run this to get better navigation
2012-01-17 23:39:57 +01:00
if ( ( this . cacheStrategy = = null | | this . cacheStrategy . isAllowedToFetchOnline ( ) ) & & SnippetProcess . this . result . sizeAvailable ( ) > = this . neededResults ) {
2011-11-18 14:09:07 +01:00
//Log.logWarning("ResultFetcher", SnippetProcess.this.result.sizeAvailable() + " = result.sizeAvailable() >= this.neededResults = " + this.neededResults);
2010-10-04 13:54:48 +02:00
break ;
}
2010-11-28 03:57:31 +01:00
// check if we can succeed if we try to take another url
2011-09-26 23:42:28 +02:00
if ( SnippetProcess . this . rankingProcess . feedingIsFinished ( ) & & SnippetProcess . this . rankingProcess . sizeQueue ( ) = = 0 ) {
2011-05-26 18:34:35 +02:00
//Log.logWarning("ResultFetcher", "rankingProcess.feedingIsFinished() && rankingProcess.sizeQueue() == 0");
2010-11-28 03:57:31 +01:00
break ;
}
2011-06-01 21:31:56 +02:00
2009-08-25 23:27:01 +02:00
// get next entry
2012-01-17 16:44:30 +01:00
page = SnippetProcess . this . rankingProcess . takeURL ( true , Math . min ( 500 , Math . max ( 20 , this . timeout - System . currentTimeMillis ( ) ) ) ) ;
2011-11-24 00:39:34 +01:00
//if (page != null) Log.logInfo("ResultFetcher", "got one page: " + page.metadata().url().toNormalform(true, false));
2010-10-04 13:54:48 +02:00
//if (page == null) page = rankedCache.takeURL(false, this.timeout - System.currentTimeMillis());
if ( page = = null ) {
2011-11-18 14:09:07 +01:00
//Log.logWarning("ResultFetcher", "page == null");
2010-10-04 13:54:48 +02:00
break ; // no more available
}
2012-01-10 03:00:55 +01:00
this . setName ( page . url ( ) . toNormalform ( true , false ) ) ; // to support debugging
2012-01-09 03:02:35 +01:00
if ( SnippetProcess . this . query . filterfailurls & & SnippetProcess . this . workTables . failURLsContains ( page . hash ( ) ) ) {
continue ;
}
2010-10-04 13:54:48 +02:00
2011-09-13 16:39:41 +02:00
// in case that we have an attached solr, we load also the solr document
String solrContent = null ;
2012-08-17 14:22:07 +02:00
SolrDocument sd = null ;
if ( page instanceof URIMetadataNode ) {
sd = ( ( URIMetadataNode ) page ) . getDocument ( ) ;
} else {
2012-07-30 10:38:23 +02:00
try {
2012-08-17 14:22:07 +02:00
sd = this . metadata . getSolr ( ) . get ( ASCII . String ( page . hash ( ) ) ) ;
2012-07-30 10:38:23 +02:00
} catch ( IOException e ) {
Log . logException ( e ) ;
}
2012-08-17 14:22:07 +02:00
}
if ( sd ! = null ) {
solrContent = this . metadata . getSolrScheme ( ) . solrGetText ( sd ) ;
2011-09-13 16:39:41 +02:00
}
resultEntry = fetchSnippet ( page , solrContent , this . cacheStrategy ) ; // does not fetch snippets if snippetMode == 0
2012-01-09 03:02:35 +01:00
if ( resultEntry = = null )
{
continue ; // the entry had some problems, cannot be used
//final String rawLine = resultEntry.textSnippet() == null ? null : resultEntry.textSnippet().getLineRaw();
//System.out.println("***SNIPPET*** raw='" + rawLine + "', pattern='" + this.snippetPattern.toString() + "'");
//if (rawLine != null && !this.snippetPattern.matcher(rawLine).matches()) continue;
}
2011-06-01 21:31:56 +02:00
2011-05-05 02:25:14 +02:00
//if (result.contains(resultEntry)) continue;
2011-09-26 23:42:28 +02:00
SnippetProcess . this . urlRetrievalAllTime + = resultEntry . dbRetrievalTime ;
SnippetProcess . this . snippetComputationAllTime + = resultEntry . snippetComputationTime ;
2011-06-01 21:31:56 +02:00
2009-08-25 23:27:01 +02:00
// place the result to the result vector
2009-11-19 14:49:28 +01:00
// apply post-ranking
2011-09-26 23:42:28 +02:00
long ranking = Long . valueOf ( SnippetProcess . this . rankingProcess . getOrder ( ) . cardinal ( resultEntry . word ( ) ) ) ;
ranking + = postRanking ( resultEntry , SnippetProcess . this . rankingProcess . getTopicNavigator ( 10 ) ) ;
2011-04-28 13:18:14 +02:00
resultEntry . ranking = ranking ;
2011-09-26 23:42:28 +02:00
SnippetProcess . this . result . put ( new ReverseElement < ResultEntry > ( resultEntry , ranking ) ) ; // remove smallest in case of overflow
2012-01-09 03:02:35 +01:00
if ( nav_topics ) {
SnippetProcess . this . rankingProcess . addTopics ( resultEntry ) ;
}
2009-08-25 23:27:01 +02:00
}
2012-06-04 15:37:39 +02:00
if ( System . currentTimeMillis ( ) > = this . timeout ) {
2012-06-11 23:49:30 +02:00
Log . logWarning ( " SnippetProcess " , " worker ended with timeout " ) ;
2012-06-04 15:37:39 +02:00
}
2010-10-04 13:54:48 +02:00
//System.out.println("FINISHED WORKER " + id + " FOR " + this.neededResults + " RESULTS, loops = " + loops);
2012-07-30 10:38:23 +02:00
} catch ( final Exception e ) { Log . logException ( e ) ; }
2011-11-24 00:39:34 +01:00
//Log.logInfo("SEARCH", "resultWorker thread " + this.id + " terminated");
2009-08-25 23:27:01 +02:00
}
2011-06-01 21:31:56 +02:00
2011-06-23 13:57:17 +02:00
/**
 * Request a cooperative shutdown of this worker thread.
 * Clears the {@code shallrun} flag; presumably the worker's run loop polls
 * this flag and exits at its next iteration — TODO confirm, the loop head
 * is outside this view. This does not interrupt a fetch already in progress.
 */
public void pleaseStop() {
    this.shallrun = false;
}
2010-10-04 13:54:48 +02:00
/ * *
* calculate the time since the worker has had the latest activity
* @return time in milliseconds lasted since latest activity
* /
2009-08-25 23:27:01 +02:00
public long busytime ( ) {
return System . currentTimeMillis ( ) - this . lastLifeSign ;
}
}
2011-06-01 21:31:56 +02:00
2012-07-22 13:18:45 +02:00
/**
 * Produce a {@link ResultEntry} for the given metadata record, optionally
 * computing a text snippet for it.
 *
 * Snippet fetching has three modes, selected by {@code cacheStrategy}:
 * {@code null} = do not fetch snippets (snippet built from solrText only, no loader),
 * offline-only strategies = use cached content only,
 * online strategies = the loader may go to the network.
 *
 * @param page          the URL metadata record; {@code null} is tolerated and yields {@code null}
 * @param solrText      pre-fetched solr document text, may be {@code null}
 * @param cacheStrategy snippet loading mode as described above, may be {@code null}
 * @return a result entry (with or without snippet), or {@code null} if the entry
 *         must be rejected (snippet fetch failed and online loading was demanded)
 */
protected ResultEntry fetchSnippet(final URIMetadata page, final String solrText, final CacheStrategy cacheStrategy) {
    // Snippet Fetching can has 3 modes:
    // 0 - do not fetch snippets
    // 1 - fetch snippets offline only
    // 2 - online snippet fetch

    // load only urls if there was not yet a root url of that hash
    // find the url entry
    // NOTE(review): startTime is taken immediately and dbRetrievalTime measured right
    // after the null check, so dbRetrievalTime is ~0 here; presumably a leftover from
    // when the page lookup happened inside this method — confirm before relying on it.
    long startTime = System.currentTimeMillis();
    if (page == null) {
        return null;
    }
    final long dbRetrievalTime = System.currentTimeMillis() - startTime;

    if (cacheStrategy == null) {
        // mode "no snippet fetch": build a snippet object without a loader (first arg null),
        // so only the already-available solrText can contribute snippet text
        final TextSnippet snippet = new TextSnippet(
                null,
                solrText,
                page,
                this.snippetFetchWordHashes,
                //this.query.queryString,
                null,
                // pattern-matching flag: set when the query constraint marks "indexof" pages
                ((this.query.constraint != null) && (this.query.constraint.get(Condenser.flag_cat_indexof))),
                220,
                !this.query.isLocal());
        return new ResultEntry(page, this.query.getSegment(), this.peers, snippet, null, dbRetrievalTime, 0); // result without snippet
    }

    // load snippet
    // text snippets are only computed for TEXT/ALL content domains; other domains
    // (image, audio, ...) fall through to the snippet-less result at the bottom
    if (page.url().getContentDomain() == Classification.ContentDomain.TEXT || page.url().getContentDomain() == Classification.ContentDomain.ALL) {
        // attach text snippet
        startTime = System.currentTimeMillis();
        // same construction as above, but with the loader and the caller's cache
        // strategy, so the snippet may be computed from (re-)loaded page content
        final TextSnippet snippet = new TextSnippet(
                this.loader,
                solrText,
                page,
                this.snippetFetchWordHashes,
                cacheStrategy,
                ((this.query.constraint != null) && (this.query.constraint.get(Condenser.flag_cat_indexof))),
                180,
                !this.query.isLocal());
        final long snippetComputationTime = System.currentTimeMillis() - startTime;
        log.logInfo("text snippet load time for " + page.url() + ": " + snippetComputationTime + ", " + (!snippet.getErrorCode().fail() ? "snippet found" : ("no snippet found (" + snippet.getError() + ")")));

        if (!snippet.getErrorCode().fail()) {
            // we loaded the file and found the snippet
            return new ResultEntry(page, this.query.getSegment(), this.peers, snippet, null, dbRetrievalTime, snippetComputationTime); // result with snippet attached
        } else if (cacheStrategy.mustBeOffline()) {
            // we did not demand online loading, therefore a failure does not mean that the missing snippet causes a rejection of this result
            // this may happen during a remote search, because snippet loading is omitted to retrieve results faster
            return new ResultEntry(page, this.query.getSegment(), this.peers, null, null, dbRetrievalTime, snippetComputationTime); // result without snippet
        } else {
            // problems with snippet fetch
            if (this.snippetFetchWordHashes.has(Segment.catchallHash)) {
                // we accept that because the word cannot be on the page
                return new ResultEntry(page, this.query.getSegment(), this.peers, null, null, dbRetrievalTime, 0);
            }
            final String reason = "no text snippet; errorCode = " + snippet.getErrorCode();
            if (this.deleteIfSnippetFail) {
                // configured to purge results whose snippet cannot be produced:
                // register the URL as failing for the query's include-word hashes
                this.workTables.failURLsRegisterMissingWord(this.query.getSegment().termIndex(), page.url(), this.query.query_include_hashes, reason);
            }
            log.logInfo("sorted out url " + page.url().toNormalform(true, false) + " during search: " + reason);
            return null; // reject this result
        }
    }
    // non-text content domain: keep the result, but without any snippet
    return new ResultEntry(page, this.query.getSegment(), this.peers, null, null, dbRetrievalTime, 0); // result without snippet
}
2011-11-24 17:05:09 +01:00
/ * *
* delete a specific entry from the search results
* this is used if the user clicks on a '-' sign beside the search result
* @param urlhash
* @return true if an entry was deleted , false otherwise
* /
public boolean delete ( final String urlhash ) {
final Iterator < Element < ResultEntry > > i = this . result . iterator ( ) ;
Element < ResultEntry > entry ;
while ( i . hasNext ( ) ) {
entry = i . next ( ) ;
if ( urlhash . equals ( ASCII . String ( entry . getElement ( ) . url ( ) . hash ( ) ) ) ) {
i . remove ( ) ;
return true ;
}
}
return false ;
}
2009-08-25 23:27:01 +02:00
}