// SearchEvent.java
// (C) 2005 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 10.10.2005 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

package net.yacy.search.query;

import java.util.Iterator;
import java.util.Map;
import java.util.SortedMap;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.Semaphore;
import java.util.concurrent.TimeUnit;

import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.ranking.ScoreMap;
import net.yacy.document.LargeNumberCache;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.data.word.WordReferenceFactory;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.rwi.ReferenceContainer;
import net.yacy.kelondro.util.EventTracker;
import net.yacy.kelondro.util.MemoryControl;
import net.yacy.kelondro.util.SetTools;
import net.yacy.peers.RemoteSearch;
import net.yacy.peers.SeedDB;
import net.yacy.peers.dht.FlatWordPartitionScheme;
import net.yacy.peers.graphics.ProfilingGraph;
import net.yacy.repository.LoaderDispatcher;
import net.yacy.search.Switchboard;
import net.yacy.search.query.SnippetProcess.Worker;
import net.yacy.search.ranking.ReferenceOrder;
import net.yacy.search.snippet.ResultEntry;

import de.anomic.data.WorkTables;
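
/**
 * A SearchEvent bundles everything that belongs to one running search request:
 * the QueryParams, the RWIProcess that ranks reference containers coming from
 * the local index and from remote peers, the primary and secondary remote search
 * threads, the index abstracts (IA*) collected for combined searches, and the
 * SnippetProcess that turns ranked references into displayable ResultEntry objects.
 * The constructor registers the event in the SearchEventCache, so callers normally
 * obtain an event from that cache instead of constructing one directly.
 *
 * Rough consumption sketch (how the event is created is left to the cache and
 * front-end code and not shown here):
 * <pre>
 *   ResultEntry entry;
 *   for (int i = 0; (entry = event.oneResult(i, 3000)) != null; i++) {
 *       // render the entry; oneResult may return null if no further result
 *       // becomes available within the given timeout
 *   }
 * </pre>
 */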
public final class SearchEvent {

    public enum Type {
        INITIALIZATION, COLLECTION, JOIN, PRESORT, URLFETCH, NORMALIZING, FINALIZATION,
        REMOTESEARCH_START, REMOTESEARCH_TERMINATE, ABSTRACTS, CLEANUP, SNIPPETFETCH_START, ONERESULT, REFERENCECOLLECTION, RESULTLIST;
    }

    public static final int max_results_preparation = 3000;

    // class variables that may be implemented with an abstract class
    private long eventTime;
    private QueryParams query;
    private final SeedDB peers;
    private final WorkTables workTables;
    private final RWIProcess rankingProcess; // ordered search results, grows dynamically as all the query threads enrich this container
    private final SnippetProcess resultFetcher;
    private final SecondarySearchSuperviser secondarySearchSuperviser;

    // class variables for remote searches
    private RemoteSearch[] primarySearchThreads, secondarySearchThreads;
    private final SortedMap<byte[], String> preselectedPeerHashes;
    private final Thread localSearchThread;
    private final SortedMap<byte[], Integer> IACount;
    private final SortedMap<byte[], String> IAResults;
    private final SortedMap<byte[], HeuristicResult> heuristics;
    private byte[] IAmaxcounthash, IAneardhthash;
    private final ReferenceOrder order;

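    /**
     * Create a new search event. This starts the local RWIProcess, launches primary
     * remote search threads when the query targets the GLOBAL or CLUSTER domain,
     * optionally computes index abstracts (generateAbstracts), creates the
     * SnippetProcess worker pool and finally registers the event in the
     * SearchEventCache under query.id(false). Parameters such as remote_maxcount,
     * remote_maxtime and the burst percentages are passed on to
     * RemoteSearch.primaryRemoteSearches; deleteIfSnippetFail is passed on to the
     * SnippetProcess.
     */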
    protected SearchEvent(final QueryParams query,
                          final SeedDB peers,
                          final WorkTables workTables,
                          final SortedMap<byte[], String> preselectedPeerHashes,
                          final boolean generateAbstracts,
                          final LoaderDispatcher loader,
                          final int remote_maxcount,
                          final long remote_maxtime,
                          final int burstRobinsonPercent,
                          final int burstMultiwordPercent,
                          final boolean deleteIfSnippetFail) {
        if (MemoryControl.available() < 1024 * 1024 * 100) SearchEventCache.cleanupEvents(true);
        this.eventTime = System.currentTimeMillis(); // for lifetime check
        this.peers = peers;
        this.workTables = workTables;
        this.query = query;
        this.secondarySearchSuperviser = (this.query.queryHashes.size() > 1) ? new SecondarySearchSuperviser() : null; // generate abstracts only for combined searches
        if (this.secondarySearchSuperviser != null) this.secondarySearchSuperviser.start();
        this.primarySearchThreads = null;
        this.secondarySearchThreads = null;
        this.preselectedPeerHashes = preselectedPeerHashes;
        this.IAResults = new TreeMap<byte[], String>(Base64Order.enhancedCoder);
        this.IACount = new TreeMap<byte[], Integer>(Base64Order.enhancedCoder);
        this.heuristics = new TreeMap<byte[], HeuristicResult>(Base64Order.enhancedCoder);
        this.IAmaxcounthash = null;
        this.IAneardhthash = null;
        this.localSearchThread = null;
        this.order = new ReferenceOrder(this.query.ranking, UTF8.getBytes(this.query.targetlang));
        final boolean remote = peers.sizeConnected() > 0 && (this.query.domType == QueryParams.Searchdom.CLUSTER || (this.query.domType == QueryParams.Searchdom.GLOBAL && peers.mySeed().getFlagAcceptRemoteIndex()));
        final long start = System.currentTimeMillis();

        // initialize a ranking process that is the target for data
        // that is generated concurrently from local and global search threads
        this.rankingProcess = new RWIProcess(this.query, this.order, max_results_preparation);

        // start a local search concurrently
        this.rankingProcess.start();

        if (remote) {
            // start global searches
            final long timer = System.currentTimeMillis();
            this.primarySearchThreads = (this.query.queryHashes.isEmpty()) ? null : RemoteSearch.primaryRemoteSearches(
                    QueryParams.hashSet2hashString(this.query.queryHashes),
                    QueryParams.hashSet2hashString(this.query.excludeHashes),
                    this.query.prefer,
                    this.query.urlMask,
                    this.query.snippetMatcher,
                    this.query.targetlang == null ? "" : this.query.targetlang,
                    this.query.sitehash == null ? "" : this.query.sitehash,
                    this.query.authorhash == null ? "" : this.query.authorhash,
                    remote_maxcount,
                    remote_maxtime,
                    this.query.maxDistance,
                    this.query.getSegment(),
                    peers,
                    this.rankingProcess,
                    this.secondarySearchSuperviser,
                    Switchboard.urlBlacklist,
                    this.query.ranking,
                    this.query.constraint,
                    (this.query.domType == QueryParams.Searchdom.GLOBAL) ? null : preselectedPeerHashes,
                    burstRobinsonPercent,
                    burstMultiwordPercent);
            if (this.primarySearchThreads != null) {
                Log.logFine("SEARCH_EVENT", "STARTING " + this.primarySearchThreads.length + " THREADS TO CATCH EACH " + remote_maxcount + " URLs");
                this.rankingProcess.moreFeeders(this.primarySearchThreads.length);
                EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(this.query.id(true), Type.REMOTESEARCH_START, "", this.primarySearchThreads.length, System.currentTimeMillis() - timer), false);
                // finished searching
                Log.logFine("SEARCH_EVENT", "SEARCH TIME AFTER GLOBAL-TRIGGER TO " + this.primarySearchThreads.length + " PEERS: " + ((System.currentTimeMillis() - start) / 1000) + " seconds");
            } else {
                // no search since query is empty, user might have entered no data or filters have removed all search words
                Log.logFine("SEARCH_EVENT", "NO SEARCH STARTED DUE TO EMPTY SEARCH REQUEST.");
            }
        } else {
            if (generateAbstracts) {
                // we need the results now
                try {
                    this.rankingProcess.join();
                } catch (final Throwable e) {
                }
                // compute index abstracts
                final long timer = System.currentTimeMillis();
                int maxcount = -1;
                long mindhtdistance = Long.MAX_VALUE, l;
                byte[] wordhash;
                assert this.rankingProcess.searchContainerMap() != null;
                for (final Map.Entry<byte[], ReferenceContainer<WordReference>> entry : this.rankingProcess.searchContainerMap().entrySet()) {
                    wordhash = entry.getKey();
                    final ReferenceContainer<WordReference> container = entry.getValue();
                    assert (Base64Order.enhancedCoder.equal(container.getTermHash(), wordhash)) : "container.getTermHash() = " + ASCII.String(container.getTermHash()) + ", wordhash = " + ASCII.String(wordhash);
                    if (container.size() > maxcount) {
                        this.IAmaxcounthash = wordhash;
                        maxcount = container.size();
                    }
                    l = FlatWordPartitionScheme.std.dhtDistance(wordhash, null, peers.mySeed());
                    if (l < mindhtdistance) {
                        // calculate the word hash that is closest to our dht position
                        mindhtdistance = l;
                        this.IAneardhthash = wordhash;
                    }
                    this.IACount.put(wordhash, LargeNumberCache.valueOf(container.size()));
                    this.IAResults.put(wordhash, WordReferenceFactory.compressIndex(container, null, 1000).toString());
                }
                EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(this.query.id(true), Type.ABSTRACTS, "", this.rankingProcess.searchContainerMap().size(), System.currentTimeMillis() - timer), false);
            } else {
                // give the process time to accumulate a certain amount of data
                // before a reading process wants to get results from it
                try {
                    this.rankingProcess.join(100);
                } catch (final Throwable e) {
                }
                // this will reduce the maximum waiting time until results are available to 100 milliseconds
                // while we always get a good set of ranked data
            }
        }

        // start worker threads to fetch urls and snippets
        this.resultFetcher = new SnippetProcess(loader, this.rankingProcess, this.query, this.peers, this.workTables, 5000, deleteIfSnippetFail);

        // clean up events
        SearchEventCache.cleanupEvents(false);
        EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(this.query.id(true), Type.CLEANUP, "", 0, 0), false);

        // store this search to a cache so it can be re-used
        if (MemoryControl.available() < 1024 * 1024 * 100) SearchEventCache.cleanupEvents(true);
        SearchEventCache.put(this.query.id(false), this);
    }

    public ReferenceOrder getOrder() {
        return this.order;
    }

    public long getEventTime() {
        return this.eventTime;
    }

    public void resetEventTime() {
        this.eventTime = System.currentTimeMillis();
    }

    public QueryParams getQuery() {
        return this.query;
    }

    public void setQuery(final QueryParams query) {
        this.query = query;
        this.resultFetcher.query = query;
    }

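    /**
     * Stop all activity that belongs to this event: put the snippet fetcher into
     * cleanup state, interrupt primary and secondary remote search threads as well
     * as the snippet worker threads, and clear the collected data structures.
     */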
    public void cleanup() {
        this.resultFetcher.setCleanupState();

        // stop all threads
        if (this.primarySearchThreads != null) {
            for (final RemoteSearch search : this.primarySearchThreads) {
                if (search != null) synchronized (search) {
                    if (search.isAlive()) search.interrupt();
                }
            }
        }
        if (this.secondarySearchThreads != null) {
            for (final RemoteSearch search : this.secondarySearchThreads) {
                if (search != null) synchronized (search) {
                    if (search.isAlive()) search.interrupt();
                }
            }
        }

        // call the worker threads and ask them to stop
        for (final Worker w : this.resultFetcher.workerThreads) {
            if (w != null && w.isAlive()) {
                w.pleaseStop();
                w.interrupt();
                // the interrupt may occur during an MD5 computation which is resistant to interruption
                // therefore set some more interrupts on the process
                int ic = 10;
                while (ic-- > 0 & w.isAlive()) w.interrupt();
            }
        }

        // clear all data structures
        if (this.preselectedPeerHashes != null) this.preselectedPeerHashes.clear();
        if (this.localSearchThread != null) if (this.localSearchThread.isAlive()) this.localSearchThread.interrupt();
        if (this.IACount != null) this.IACount.clear();
        if (this.IAResults != null) this.IAResults.clear();
        if (this.heuristics != null) this.heuristics.clear();
    }

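    // accessors for the index abstracts (IA) computed in the constructor when
    // generateAbstracts is requested: IAResults maps a word hash to its compressed
    // index abstract, IACount maps a word hash to the number of references found for it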
    public Iterator<Map.Entry<byte[], String>> abstractsString() {
        return this.IAResults.entrySet().iterator();
    }

    public String abstractsString(final byte[] hash) {
        return this.IAResults.get(hash);
    }

    public Iterator<Map.Entry<byte[], Integer>> abstractsCount() {
        return this.IACount.entrySet().iterator();
    }

    public int abstractsCount(final byte[] hash) {
        final Integer i = this.IACount.get(hash);
        if (i == null) return -1;
        return i.intValue();
    }

    public byte[] getAbstractsMaxCountHash() {
        return this.IAmaxcounthash;
    }

    public byte[] getAbstractsNearDHTHash() {
        return this.IAneardhthash;
    }

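    /**
     * @return true as long as at least one primary or secondary remote search thread is still running
     */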
    boolean anyRemoteSearchAlive() {
        // check primary search threads
        if ((this.primarySearchThreads != null) && (this.primarySearchThreads.length != 0)) {
            for (final RemoteSearch primarySearchThread : this.primarySearchThreads) {
                if ((primarySearchThread != null) && (primarySearchThread.isAlive())) return true;
            }
        }
        // maybe a secondary search thread is alive, check this
        if ((this.secondarySearchThreads != null) && (this.secondarySearchThreads.length != 0)) {
            for (final RemoteSearch secondarySearchThread : this.secondarySearchThreads) {
                if ((secondarySearchThread != null) && (secondarySearchThread.isAlive())) return true;
            }
        }
        return false;
    }

    public RemoteSearch[] getPrimarySearchThreads() {
        return this.primarySearchThreads;
    }

    public RemoteSearch[] getSecondarySearchThreads() {
        return this.secondarySearchThreads;
    }

    public RWIProcess getRankingResult() {
        return this.rankingProcess;
    }

    public ScoreMap<String> getNamespaceNavigator() {
        return this.rankingProcess.getNamespaceNavigator();
    }

    public ScoreMap<String> getHostNavigator() {
        return this.rankingProcess.getHostNavigator();
    }

    public ScoreMap<String> getTopicNavigator(final int count) {
        // returns a set of words that are computed as a toplist
        return this.rankingProcess.getTopicNavigator(count);
    }

    public ScoreMap<String> getAuthorNavigator() {
        // returns a list of authors seen so far in the result set
        return this.rankingProcess.getAuthorNavigator();
    }

    public ScoreMap<String> getProtocolNavigator() {
        return this.rankingProcess.getProtocolNavigator();
    }

    public ScoreMap<String> getFiletypeNavigator() {
        return this.rankingProcess.getFiletypeNavigator();
    }

    public void addHeuristic(final byte[] urlhash, final String heuristicName, final boolean redundant) {
        synchronized (this.heuristics) {
            this.heuristics.put(urlhash, new HeuristicResult(urlhash, heuristicName, redundant));
        }
    }

    public HeuristicResult getHeuristic(final byte[] urlhash) {
        synchronized (this.heuristics) {
            return this.heuristics.get(urlhash);
        }
    }

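    /**
     * Fetch one entry of the displayable result list.
     *
     * @param item zero-based position in the ranked result list
     * @param timeout maximum time in milliseconds to wait for the entry
     * @return the ResultEntry at the given position, or null if the SnippetProcess
     *         could not deliver an entry within the timeout
     */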
    public ResultEntry oneResult(final int item, final long timeout) {
        /*
        if (this.query.domType == QueryParams.Searchdom.GLOBAL || this.query.domType == QueryParams.Searchdom.CLUSTER) {
            // this is a search using remote search threads. Also the local
            // search thread is started as background process
            if (this.localSearchThread != null && this.localSearchThread.isAlive()) {
                // in case that the local search takes longer than some other
                // remote search requests, wait that the local process terminates first
                try { this.localSearchThread.join(300); } catch (final InterruptedException e) { }
            }
        }
        */
        return this.resultFetcher.oneResult(item, timeout);
    }

    boolean secondarySearchStartet = false;

    public static class HeuristicResult /*implements Comparable<HeuristicResult>*/ {
        public final byte[] urlhash;
        public final String heuristicName;
        public final boolean redundant;

        public HeuristicResult(final byte[] urlhash, final String heuristicName, final boolean redundant) {
            this.urlhash = urlhash;
            this.heuristicName = heuristicName;
            this.redundant = redundant;
        }
        /*
        public int compareTo(HeuristicResult o) {
            return Base64Order.enhancedCoder.compare(this.urlhash, o.urlhash);
        }
        public int hashCode() {
            return (int) Base64Order.enhancedCoder.cardinal(this.urlhash);
        }
        public boolean equals(Object o) {
            return Base64Order.enhancedCoder.equal(this.urlhash, ((HeuristicResult) o).urlhash);
        }
        */
    }

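    /**
     * The SecondarySearchSuperviser collects the index abstracts that the primary
     * remote searches return for each query word. Once abstracts for all query
     * words have arrived, it joins the urlhash-to-peerlist relations, determines
     * which peers hold URLs for the complete word set, and triggers secondary
     * remote searches on exactly those peers.
     */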
    public class SecondarySearchSuperviser extends Thread {

        // cache for index abstracts; word:TreeMap mapping where the embedded TreeMap is a urlhash:peerlist relation
        // this relation contains the information where specific urls can be found in specific peers
        SortedMap<String, SortedMap<String, StringBuilder>> abstractsCache;
        SortedSet<String> checkedPeers;
        Semaphore trigger;

        public SecondarySearchSuperviser() {
            this.abstractsCache = new TreeMap<String, SortedMap<String, StringBuilder>>();
            this.checkedPeers = new TreeSet<String>();
            this.trigger = new Semaphore(0);
        }

        /**
         * add a single abstract to the existing set of abstracts
         * @param wordhash
         * @param singleAbstract a mapping from url-hashes to a string of peer-hashes
         */
        public void addAbstract(final String wordhash, final TreeMap<String, StringBuilder> singleAbstract) {
            final SortedMap<String, StringBuilder> oldAbstract;
            synchronized (this.abstractsCache) {
                oldAbstract = this.abstractsCache.get(wordhash);
                if (oldAbstract == null) {
                    // new abstracts in the cache
                    this.abstractsCache.put(wordhash, singleAbstract);
                    return;
                }
            }
            // extend the abstracts in the cache: join the single abstracts
            new Thread() {
                public void run() {
                    for (final Map.Entry<String, StringBuilder> oneref : singleAbstract.entrySet()) {
                        final String urlhash = oneref.getKey();
                        final StringBuilder peerlistNew = oneref.getValue();
                        synchronized (oldAbstract) {
                            final StringBuilder peerlistOld = oldAbstract.put(urlhash, peerlistNew);
                            if (peerlistOld != null) peerlistOld.append(peerlistNew);
                        }
                    }
                }
            }.start();
            // abstractsCache.put(wordhash, oldAbstract); // put not necessary since it is sufficient to just change the value content (it stays assigned)
        }

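        /**
         * Signal that a batch of abstracts has been committed; this releases the
         * trigger semaphore so the superviser thread re-checks whether a secondary
         * search can be prepared.
         */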
        public void commitAbstract() {
            this.trigger.release();
        }

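        /**
         * Collect the query words for which the given peer is listed as a source for
         * every url hash in urls (url hashes and peer hashes are 12 characters long).
         *
         * @param peerhash the hash of the peer in question
         * @param urls a concatenation of 12-character url hashes
         * @return a concatenation of the word hashes that this peer can serve for all of the given urls
         */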
        private String wordsFromPeer(final String peerhash, final StringBuilder urls) {
            Map.Entry<String, SortedMap<String, StringBuilder>> entry;
            String word, url, wordlist = "";
            StringBuilder peerlist;
            SortedMap<String, StringBuilder> urlPeerlist;
            int p;
            boolean hasURL;
            synchronized (this) {
                final Iterator<Map.Entry<String, SortedMap<String, StringBuilder>>> i = this.abstractsCache.entrySet().iterator();
                while (i.hasNext()) {
                    entry = i.next();
                    word = entry.getKey();
                    urlPeerlist = entry.getValue();
                    hasURL = true;
                    for (int j = 0; j < urls.length(); j = j + 12) {
                        url = urls.substring(j, j + 12);
                        peerlist = urlPeerlist.get(url);
                        p = (peerlist == null) ? -1 : peerlist.indexOf(peerhash);
                        if ((p < 0) || (p % 12 != 0)) {
                            hasURL = false;
                            break;
                        }
                    }
                    if (hasURL) wordlist += word;
                }
            }
            return wordlist;
        }

        @Override
        public void run() {
            try {
                int t = 0;
                while (this.trigger.tryAcquire(10000, TimeUnit.MILLISECONDS)) {
                    // a trigger was released
                    prepareSecondarySearch();
                    t++;
                    if (t > 10) break;
                }
            } catch (final InterruptedException e) {
                // the thread was interrupted, do nothing
            }
            // the time-out was reached
        }

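        /**
         * Once abstracts for all query words are present in the cache, join the
         * urlhash-to-peerlist relations of all words, group the joined urls per peer,
         * and start a secondary remote search on every peer (except this one) that
         * has not been asked yet.
         */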
        private void prepareSecondarySearch() {
            if (this.abstractsCache == null || this.abstractsCache.size() != SearchEvent.this.query.queryHashes.size()) return; // secondary search not possible (yet)

            // catch up index abstracts and join them; then call peers again to submit their urls
            /*
            System.out.println("DEBUG-INDEXABSTRACT: " + abstractsCache.size() + " word references caught, " + query.queryHashes.size() + " needed");
            for (Map.Entry<String, TreeMap<String, String>> entry : abstractsCache.entrySet()) {
                System.out.println("DEBUG-INDEXABSTRACT: hash " + entry.getKey() + ": " + ((query.queryHashes.has(entry.getKey().getBytes()) ? "NEEDED" : "NOT NEEDED") + "; " + entry.getValue().size() + " entries"));
            }
            */

            // find out if there are enough references for all words that are searched
            if (this.abstractsCache.size() != SearchEvent.this.query.queryHashes.size()) return;

            // join all the urlhash:peerlist relations: the resulting map has values with a combined peer-list list
            final SortedMap<String, StringBuilder> abstractJoin = SetTools.joinConstructive(this.abstractsCache.values(), true);
            if (abstractJoin.isEmpty()) return;
            // the join result is now a urlhash: peer-list relation

            // generate a list of peers that have the urls for the joined search result
            final SortedMap<String, StringBuilder> secondarySearchURLs = new TreeMap<String, StringBuilder>(); // a (peerhash:urlhash-liststring) mapping
            String url, peer;
            StringBuilder urls, peerlist;
            final String mypeerhash = SearchEvent.this.peers.mySeed().hash;
            boolean mypeerinvolved = false;
            int mypeercount;
            for (final Map.Entry<String, StringBuilder> entry : abstractJoin.entrySet()) {
                url = entry.getKey();
                peerlist = entry.getValue();
                //System.out.println("DEBUG-INDEXABSTRACT: url " + url + ": from peers " + peerlist);
                mypeercount = 0;
                for (int j = 0; j < peerlist.length(); j += 12) {
                    peer = peerlist.substring(j, j + 12);
                    if ((peer.equals(mypeerhash)) && (mypeercount++ > 1)) continue;
                    //if (peers.indexOf(peer) < j) continue; // avoid doubles that may appear in the abstractJoin
                    urls = secondarySearchURLs.get(peer);
                    if (urls == null) {
                        urls = new StringBuilder(24);
                        urls.append(url);
                        secondarySearchURLs.put(peer, urls);
                    } else {
                        urls.append(url);
                    }
                    secondarySearchURLs.put(peer, urls);
                }
                if (mypeercount == 1) mypeerinvolved = true;
            }

            // compute words for secondary search and start the secondary searches
            String words;
            SearchEvent.this.secondarySearchThreads = new RemoteSearch[(mypeerinvolved) ? secondarySearchURLs.size() - 1 : secondarySearchURLs.size()];
            int c = 0;
            for (final Map.Entry<String, StringBuilder> entry : secondarySearchURLs.entrySet()) {
                peer = entry.getKey();
                if (peer.equals(mypeerhash)) continue; // we don't need to ask ourselves
                if (this.checkedPeers.contains(peer)) continue; // do not ask a peer again
                urls = entry.getValue();
                words = wordsFromPeer(peer, urls);
                if (words.length() == 0) continue; // ???
                assert words.length() >= 12 : "words = " + words;
                //System.out.println("DEBUG-INDEXABSTRACT ***: peer " + peer + " has urls: " + urls + " from words: " + words);
                SearchEvent.this.rankingProcess.moreFeeders(1);
                this.checkedPeers.add(peer);
                SearchEvent.this.secondarySearchThreads[c++] = RemoteSearch.secondaryRemoteSearch(
                        words, urls.toString(), 6000, SearchEvent.this.query.getSegment(), SearchEvent.this.peers, SearchEvent.this.rankingProcess, peer, Switchboard.urlBlacklist,
                        SearchEvent.this.query.ranking, SearchEvent.this.query.constraint, SearchEvent.this.preselectedPeerHashes);
            }
        }

    }

    public SnippetProcess result() {
        return this.resultFetcher;
    }

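    /**
     * @return true if at least one snippet-fetching worker thread of the result fetcher is still alive
     */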
    public boolean workerAlive() {
        if (this.resultFetcher == null || this.resultFetcher.workerThreads == null) return false;
        for (final Worker w : this.resultFetcher.workerThreads) if (w != null && w.isAlive()) return true;
        return false;
    }

}