yacy_search_server/source/net/yacy/search/query/SearchEvent.java

582 lines
27 KiB
Java
Raw Normal View History

// SearchEvent.java
// (C) 2005 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 10.10.2005 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.search.query;
import java.util.Iterator;
import java.util.Map;
import java.util.SortedMap;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.Semaphore;
import java.util.concurrent.TimeUnit;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.ranking.ScoreMap;
import net.yacy.document.LargeNumberCache;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.data.word.WordReferenceFactory;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.rwi.ReferenceContainer;
import net.yacy.kelondro.util.EventTracker;
import net.yacy.kelondro.util.MemoryControl;
import net.yacy.kelondro.util.SetTools;
import net.yacy.peers.RemoteSearch;
import net.yacy.peers.SeedDB;
import net.yacy.peers.dht.FlatWordPartitionScheme;
import net.yacy.peers.graphics.ProfilingGraph;
import net.yacy.repository.LoaderDispatcher;
import net.yacy.search.Switchboard;
import net.yacy.search.query.SnippetProcess.Worker;
import net.yacy.search.ranking.ReferenceOrder;
import net.yacy.search.snippet.ResultEntry;
import de.anomic.data.WorkTables;
public final class SearchEvent {
public enum Type {
INITIALIZATION, COLLECTION, JOIN, PRESORT, URLFETCH, NORMALIZING, FINALIZATION,
REMOTESEARCH_START, REMOTESEARCH_TERMINATE, ABSTRACTS, CLEANUP, SNIPPETFETCH_START, ONERESULT, REFERENCECOLLECTION, RESULTLIST;
}
public static final int max_results_preparation = 3000;
// class variables that may be implemented with an abstract class
private long eventTime;
private QueryParams query;
private final SeedDB peers;
private final WorkTables workTables;
private final RWIProcess rankingProcess; // ordered search results, grows dynamically as all the query threads enrich this container
private final SnippetProcess resultFetcher;
private final SecondarySearchSuperviser secondarySearchSuperviser;
// class variables for remote searches
private RemoteSearch[] primarySearchThreads, secondarySearchThreads;
private final SortedMap<byte[], String> preselectedPeerHashes;
private final Thread localSearchThread;
private final SortedMap<byte[], Integer> IACount;
private final SortedMap<byte[], String> IAResults;
private final SortedMap<byte[], HeuristicResult> heuristics;
private byte[] IAmaxcounthash, IAneardhthash;
private final ReferenceOrder order;
protected SearchEvent(final QueryParams query,
final SeedDB peers,
final WorkTables workTables,
final SortedMap<byte[], String> preselectedPeerHashes,
final boolean generateAbstracts,
final LoaderDispatcher loader,
final int remote_maxcount,
final long remote_maxtime,
final int burstRobinsonPercent,
final int burstMultiwordPercent,
final boolean deleteIfSnippetFail) {
if (MemoryControl.available() < 1024 * 1024 * 100) SearchEventCache.cleanupEvents(true);
this.eventTime = System.currentTimeMillis(); // for lifetime check
this.peers = peers;
this.workTables = workTables;
this.query = query;
this.secondarySearchSuperviser = (this.query.queryHashes.size() > 1) ? new SecondarySearchSuperviser() : null; // generate abstracts only for combined searches
if (this.secondarySearchSuperviser != null) this.secondarySearchSuperviser.start();
this.primarySearchThreads = null;
this.secondarySearchThreads = null;
this.preselectedPeerHashes = preselectedPeerHashes;
this.IAResults = new TreeMap<byte[], String>(Base64Order.enhancedCoder);
this.IACount = new TreeMap<byte[], Integer>(Base64Order.enhancedCoder);
added a 'transition feature' that shall lower the barrier to move from g**gle to yacy (yes!): Here a new concept called 'search heuristics' is introduced. A heuristic is a kind of 'shortcut' to good results in IT, here for good search results. In this case it will be used to get a very transparent way to compare what YaCy is able to produce as search result and what g**gle produces as search result. Here is what your can do now: - add the phrase 'heuristic:scroogle' to your search query, like 'oil spill heuristic:scroogle' and then a call to scroogle is made to get anonymous search results from g**gle. - these results are _not_ taken as meta-search results, but are used to instantly feed a crawling and indexing process. This happens very fast, here 20 results from scroogle are taken and loaded all simultanously, parsed and indexed immediately and from the results of the parsed content the search result is feeded, along to the normal p2p search - when new results from that heuristic (more to come) get part of the search results, then it is verified if such results are redundant to existing (they had been part of the normal YaCy search result anyway) or if they had been completely new to YaCy. - in the search results the new search results from heuristics are marked with a 'H ++' and search results from heuristics that had been already found by YaCy are marked with a 'H ='. That means: - you can now see YaCy and Scroogle search results in one result page but you also see that you would not have 'missed' the g**gle results when you would only have used YaCy. - to make it short: YaCy now subsumes g**gle results. If you use only YaCy, you miss nothing. to come: a configuration page that let you configure the usage of heuristics and get this feature by default. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6944 6c8d7289-2bf4-0310-a012-ef5d649a1542
2010-06-25 18:44:57 +02:00
this.heuristics = new TreeMap<byte[], HeuristicResult>(Base64Order.enhancedCoder);
this.IAmaxcounthash = null;
this.IAneardhthash = null;
this.localSearchThread = null;
this.order = new ReferenceOrder(this.query.ranking, UTF8.getBytes(this.query.targetlang));
final boolean remote = peers.sizeConnected() > 0 && (this.query.domType == QueryParams.Searchdom.CLUSTER || (this.query.domType == QueryParams.Searchdom.GLOBAL && peers.mySeed().getFlagAcceptRemoteIndex()));
final long start = System.currentTimeMillis();
// initialize a ranking process that is the target for data
// that is generated concurrently from local and global search threads
this.rankingProcess = new RWIProcess(this.query, this.order, max_results_preparation);
// start a local search concurrently
this.rankingProcess.start();
if (remote) {
// start global searches
final long timer = System.currentTimeMillis();
this.primarySearchThreads = (this.query.queryHashes.isEmpty()) ? null : RemoteSearch.primaryRemoteSearches(
QueryParams.hashSet2hashString(this.query.queryHashes),
QueryParams.hashSet2hashString(this.query.excludeHashes),
this.query.prefer,
this.query.urlMask,
this.query.snippetMatcher,
this.query.targetlang == null ? "" : this.query.targetlang,
this.query.sitehash == null ? "" : this.query.sitehash,
this.query.authorhash == null ? "" : this.query.authorhash,
remote_maxcount,
remote_maxtime,
this.query.maxDistance,
this.query.getSegment(),
peers,
this.rankingProcess,
this.secondarySearchSuperviser,
Switchboard.urlBlacklist,
this.query.ranking,
this.query.constraint,
(this.query.domType == QueryParams.Searchdom.GLOBAL) ? null : preselectedPeerHashes,
burstRobinsonPercent,
burstMultiwordPercent);
if (this.primarySearchThreads != null) {
Log.logFine("SEARCH_EVENT", "STARTING " + this.primarySearchThreads.length + " THREADS TO CATCH EACH " + remote_maxcount + " URLs");
this.rankingProcess.moreFeeders(this.primarySearchThreads.length);
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(this.query.id(true), Type.REMOTESEARCH_START, "", this.primarySearchThreads.length, System.currentTimeMillis() - timer), false);
// finished searching
Log.logFine("SEARCH_EVENT", "SEARCH TIME AFTER GLOBAL-TRIGGER TO " + this.primarySearchThreads.length + " PEERS: " + ((System.currentTimeMillis() - start) / 1000) + " seconds");
} else {
// no search since query is empty, user might have entered no data or filters have removed all search words
Log.logFine("SEARCH_EVENT", "NO SEARCH STARTED DUE TO EMPTY SEARCH REQUEST.");
}
} else {
if (generateAbstracts) {
// we need the results now
try {
this.rankingProcess.join();
} catch (final Throwable e) {
}
// compute index abstracts
final long timer = System.currentTimeMillis();
int maxcount = -1;
long mindhtdistance = Long.MAX_VALUE, l;
byte[] wordhash;
assert this.rankingProcess.searchContainerMap() != null;
for (final Map.Entry<byte[], ReferenceContainer<WordReference>> entry : this.rankingProcess.searchContainerMap().entrySet()) {
wordhash = entry.getKey();
added a 'transition feature' that shall lower the barrier to move from g**gle to yacy (yes!): Here a new concept called 'search heuristics' is introduced. A heuristic is a kind of 'shortcut' to good results in IT, here for good search results. In this case it will be used to get a very transparent way to compare what YaCy is able to produce as search result and what g**gle produces as search result. Here is what your can do now: - add the phrase 'heuristic:scroogle' to your search query, like 'oil spill heuristic:scroogle' and then a call to scroogle is made to get anonymous search results from g**gle. - these results are _not_ taken as meta-search results, but are used to instantly feed a crawling and indexing process. This happens very fast, here 20 results from scroogle are taken and loaded all simultanously, parsed and indexed immediately and from the results of the parsed content the search result is feeded, along to the normal p2p search - when new results from that heuristic (more to come) get part of the search results, then it is verified if such results are redundant to existing (they had been part of the normal YaCy search result anyway) or if they had been completely new to YaCy. - in the search results the new search results from heuristics are marked with a 'H ++' and search results from heuristics that had been already found by YaCy are marked with a 'H ='. That means: - you can now see YaCy and Scroogle search results in one result page but you also see that you would not have 'missed' the g**gle results when you would only have used YaCy. - to make it short: YaCy now subsumes g**gle results. If you use only YaCy, you miss nothing. to come: a configuration page that let you configure the usage of heuristics and get this feature by default. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6944 6c8d7289-2bf4-0310-a012-ef5d649a1542
2010-06-25 18:44:57 +02:00
final ReferenceContainer<WordReference> container = entry.getValue();
assert (Base64Order.enhancedCoder.equal(container.getTermHash(), wordhash)) : "container.getTermHash() = " + ASCII.String(container.getTermHash()) + ", wordhash = " + ASCII.String(wordhash);
if (container.size() > maxcount) {
this.IAmaxcounthash = wordhash;
maxcount = container.size();
}
l = FlatWordPartitionScheme.std.dhtDistance(wordhash, null, peers.mySeed());
if (l < mindhtdistance) {
// calculate the word hash that is closest to our dht position
mindhtdistance = l;
this.IAneardhthash = wordhash;
}
this.IACount.put(wordhash, LargeNumberCache.valueOf(container.size()));
this.IAResults.put(wordhash, WordReferenceFactory.compressIndex(container, null, 1000).toString());
}
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(this.query.id(true), Type.ABSTRACTS, "", this.rankingProcess.searchContainerMap().size(), System.currentTimeMillis() - timer), false);
} else {
// give process time to accumulate a certain amount of data
// before a reading process wants to get results from it
try {
this.rankingProcess.join(100);
} catch (final Throwable e) {
}
// this will reduce the maximum waiting time until results are available to 100 milliseconds
// while we always get a good set of ranked data
}
}
// start worker threads to fetch urls and snippets
this.resultFetcher = new SnippetProcess(loader, this.rankingProcess, this.query, this.peers, this.workTables, 5000, deleteIfSnippetFail);
// clean up events
SearchEventCache.cleanupEvents(false);
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(this.query.id(true), Type.CLEANUP, "", 0, 0), false);
// store this search to a cache so it can be re-used
if (MemoryControl.available() < 1024 * 1024 * 100) SearchEventCache.cleanupEvents(true);
SearchEventCache.put(this.query.id(false), this);
}
public ReferenceOrder getOrder() {
return this.order;
}
public long getEventTime() {
return this.eventTime;
}
public void resetEventTime() {
this.eventTime = System.currentTimeMillis();
}
public QueryParams getQuery() {
return this.query;
}
public void setQuery(final QueryParams query) {
this.query = query;
this.resultFetcher.query = query;
}
public void cleanup() {
this.resultFetcher.setCleanupState();
// stop all threads
if (this.primarySearchThreads != null) {
for (final RemoteSearch search : this.primarySearchThreads) {
if (search != null) synchronized (search) {
if (search.isAlive()) search.interrupt();
}
}
}
if (this.secondarySearchThreads != null) {
for (final RemoteSearch search : this.secondarySearchThreads) {
if (search != null) synchronized (search) {
if (search.isAlive()) search.interrupt();
}
}
}
// call the worker threads and ask them to stop
for (final Worker w: this.resultFetcher.workerThreads) {
if (w != null && w.isAlive()) {
w.pleaseStop();
w.interrupt();
// the interrupt may occur during a MD5 computation which is resistant against interruption
// therefore set some more interrupts on the process
int ic = 10;
while (ic-- > 0 & w.isAlive()) w.interrupt();
}
}
// clear all data structures
if (this.preselectedPeerHashes != null) this.preselectedPeerHashes.clear();
if (this.localSearchThread != null) if (this.localSearchThread.isAlive()) this.localSearchThread.interrupt();
if (this.IACount != null) this.IACount.clear();
if (this.IAResults != null) this.IAResults.clear();
if (this.heuristics != null) this.heuristics.clear();
}
public Iterator<Map.Entry<byte[], String>> abstractsString() {
return this.IAResults.entrySet().iterator();
}
public String abstractsString(final byte[] hash) {
return this.IAResults.get(hash);
}
public Iterator<Map.Entry<byte[], Integer>> abstractsCount() {
return this.IACount.entrySet().iterator();
}
public int abstractsCount(final byte[] hash) {
final Integer i = this.IACount.get(hash);
if (i == null) return -1;
return i.intValue();
}
public byte[] getAbstractsMaxCountHash() {
return this.IAmaxcounthash;
}
public byte[] getAbstractsNearDHTHash() {
return this.IAneardhthash;
}
boolean anyRemoteSearchAlive() {
// check primary search threads
if ((this.primarySearchThreads != null) && (this.primarySearchThreads.length != 0)) {
for (final RemoteSearch primarySearchThread : this.primarySearchThreads) {
if ((primarySearchThread != null) && (primarySearchThread.isAlive())) return true;
}
}
// maybe a secondary search thread is alive, check this
if ((this.secondarySearchThreads != null) && (this.secondarySearchThreads.length != 0)) {
for (final RemoteSearch secondarySearchThread : this.secondarySearchThreads) {
if ((secondarySearchThread != null) && (secondarySearchThread.isAlive())) return true;
}
}
return false;
}
public RemoteSearch[] getPrimarySearchThreads() {
return this.primarySearchThreads;
}
public RemoteSearch[] getSecondarySearchThreads() {
return this.secondarySearchThreads;
}
public RWIProcess getRankingResult() {
return this.rankingProcess;
}
public ScoreMap<String> getNamespaceNavigator() {
return this.rankingProcess.getNamespaceNavigator();
}
public ScoreMap<String> getHostNavigator() {
return this.rankingProcess.getHostNavigator();
}
public ScoreMap<String> getTopicNavigator(final int count) {
// returns a set of words that are computed as toplist
return this.rankingProcess.getTopicNavigator(count);
}
public ScoreMap<String> getAuthorNavigator() {
// returns a list of authors so far seen on result set
return this.rankingProcess.getAuthorNavigator();
}
public ScoreMap<String> getProtocolNavigator() {
return this.rankingProcess.getProtocolNavigator();
}
public ScoreMap<String> getFiletypeNavigator() {
return this.rankingProcess.getFiletypeNavigator();
}
public void addHeuristic(final byte[] urlhash, final String heuristicName, final boolean redundant) {
added a 'transition feature' that shall lower the barrier to move from g**gle to yacy (yes!): Here a new concept called 'search heuristics' is introduced. A heuristic is a kind of 'shortcut' to good results in IT, here for good search results. In this case it will be used to get a very transparent way to compare what YaCy is able to produce as search result and what g**gle produces as search result. Here is what your can do now: - add the phrase 'heuristic:scroogle' to your search query, like 'oil spill heuristic:scroogle' and then a call to scroogle is made to get anonymous search results from g**gle. - these results are _not_ taken as meta-search results, but are used to instantly feed a crawling and indexing process. This happens very fast, here 20 results from scroogle are taken and loaded all simultanously, parsed and indexed immediately and from the results of the parsed content the search result is feeded, along to the normal p2p search - when new results from that heuristic (more to come) get part of the search results, then it is verified if such results are redundant to existing (they had been part of the normal YaCy search result anyway) or if they had been completely new to YaCy. - in the search results the new search results from heuristics are marked with a 'H ++' and search results from heuristics that had been already found by YaCy are marked with a 'H ='. That means: - you can now see YaCy and Scroogle search results in one result page but you also see that you would not have 'missed' the g**gle results when you would only have used YaCy. - to make it short: YaCy now subsumes g**gle results. If you use only YaCy, you miss nothing. to come: a configuration page that let you configure the usage of heuristics and get this feature by default. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6944 6c8d7289-2bf4-0310-a012-ef5d649a1542
2010-06-25 18:44:57 +02:00
synchronized (this.heuristics) {
this.heuristics.put(urlhash, new HeuristicResult(urlhash, heuristicName, redundant));
}
}
public HeuristicResult getHeuristic(final byte[] urlhash) {
added a 'transition feature' that shall lower the barrier to move from g**gle to yacy (yes!): Here a new concept called 'search heuristics' is introduced. A heuristic is a kind of 'shortcut' to good results in IT, here for good search results. In this case it will be used to get a very transparent way to compare what YaCy is able to produce as search result and what g**gle produces as search result. Here is what your can do now: - add the phrase 'heuristic:scroogle' to your search query, like 'oil spill heuristic:scroogle' and then a call to scroogle is made to get anonymous search results from g**gle. - these results are _not_ taken as meta-search results, but are used to instantly feed a crawling and indexing process. This happens very fast, here 20 results from scroogle are taken and loaded all simultanously, parsed and indexed immediately and from the results of the parsed content the search result is feeded, along to the normal p2p search - when new results from that heuristic (more to come) get part of the search results, then it is verified if such results are redundant to existing (they had been part of the normal YaCy search result anyway) or if they had been completely new to YaCy. - in the search results the new search results from heuristics are marked with a 'H ++' and search results from heuristics that had been already found by YaCy are marked with a 'H ='. That means: - you can now see YaCy and Scroogle search results in one result page but you also see that you would not have 'missed' the g**gle results when you would only have used YaCy. - to make it short: YaCy now subsumes g**gle results. If you use only YaCy, you miss nothing. to come: a configuration page that let you configure the usage of heuristics and get this feature by default. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6944 6c8d7289-2bf4-0310-a012-ef5d649a1542
2010-06-25 18:44:57 +02:00
synchronized (this.heuristics) {
return this.heuristics.get(urlhash);
}
}
public ResultEntry oneResult(final int item, final long timeout) {
/*
if (this.query.domType == QueryParams.Searchdom.GLOBAL || this.query.domType == QueryParams.Searchdom.CLUSTER) {
// this is a search using remote search threads. Also the local
// search thread is started as background process
if (this.localSearchThread != null && this.localSearchThread.isAlive()) {
// in case that the local search takes longer than some other
// remote search requests, wait that the local process terminates first
try {this.localSearchThread.join(300);} catch (final InterruptedException e) {}
}
}
*/
return this.resultFetcher.oneResult(item, timeout);
}
boolean secondarySearchStartet = false;
added a 'transition feature' that shall lower the barrier to move from g**gle to yacy (yes!): Here a new concept called 'search heuristics' is introduced. A heuristic is a kind of 'shortcut' to good results in IT, here for good search results. In this case it will be used to get a very transparent way to compare what YaCy is able to produce as search result and what g**gle produces as search result. Here is what your can do now: - add the phrase 'heuristic:scroogle' to your search query, like 'oil spill heuristic:scroogle' and then a call to scroogle is made to get anonymous search results from g**gle. - these results are _not_ taken as meta-search results, but are used to instantly feed a crawling and indexing process. This happens very fast, here 20 results from scroogle are taken and loaded all simultanously, parsed and indexed immediately and from the results of the parsed content the search result is feeded, along to the normal p2p search - when new results from that heuristic (more to come) get part of the search results, then it is verified if such results are redundant to existing (they had been part of the normal YaCy search result anyway) or if they had been completely new to YaCy. - in the search results the new search results from heuristics are marked with a 'H ++' and search results from heuristics that had been already found by YaCy are marked with a 'H ='. That means: - you can now see YaCy and Scroogle search results in one result page but you also see that you would not have 'missed' the g**gle results when you would only have used YaCy. - to make it short: YaCy now subsumes g**gle results. If you use only YaCy, you miss nothing. to come: a configuration page that let you configure the usage of heuristics and get this feature by default. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6944 6c8d7289-2bf4-0310-a012-ef5d649a1542
2010-06-25 18:44:57 +02:00
public static class HeuristicResult /*implements Comparable<HeuristicResult>*/ {
public final byte[] urlhash; public final String heuristicName; public final boolean redundant;
public HeuristicResult(final byte[] urlhash, final String heuristicName, final boolean redundant) {
added a 'transition feature' that shall lower the barrier to move from g**gle to yacy (yes!): Here a new concept called 'search heuristics' is introduced. A heuristic is a kind of 'shortcut' to good results in IT, here for good search results. In this case it will be used to get a very transparent way to compare what YaCy is able to produce as search result and what g**gle produces as search result. Here is what your can do now: - add the phrase 'heuristic:scroogle' to your search query, like 'oil spill heuristic:scroogle' and then a call to scroogle is made to get anonymous search results from g**gle. - these results are _not_ taken as meta-search results, but are used to instantly feed a crawling and indexing process. This happens very fast, here 20 results from scroogle are taken and loaded all simultanously, parsed and indexed immediately and from the results of the parsed content the search result is feeded, along to the normal p2p search - when new results from that heuristic (more to come) get part of the search results, then it is verified if such results are redundant to existing (they had been part of the normal YaCy search result anyway) or if they had been completely new to YaCy. - in the search results the new search results from heuristics are marked with a 'H ++' and search results from heuristics that had been already found by YaCy are marked with a 'H ='. That means: - you can now see YaCy and Scroogle search results in one result page but you also see that you would not have 'missed' the g**gle results when you would only have used YaCy. - to make it short: YaCy now subsumes g**gle results. If you use only YaCy, you miss nothing. to come: a configuration page that let you configure the usage of heuristics and get this feature by default. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6944 6c8d7289-2bf4-0310-a012-ef5d649a1542
2010-06-25 18:44:57 +02:00
this.urlhash = urlhash; this.heuristicName = heuristicName; this.redundant = redundant;
}/*
public int compareTo(HeuristicResult o) {
return Base64Order.enhancedCoder.compare(this.urlhash, o.urlhash);
}
public int hashCode() {
return (int) Base64Order.enhancedCoder.cardinal(this.urlhash);
}
public boolean equals(Object o) {
return Base64Order.enhancedCoder.equal(this.urlhash, ((HeuristicResult) o).urlhash);
}*/
}
public class SecondarySearchSuperviser extends Thread {
// cache for index abstracts; word:TreeMap mapping where the embedded TreeMap is a urlhash:peerlist relation
// this relation contains the information where specific urls can be found in specific peers
SortedMap<String, SortedMap<String, StringBuilder>> abstractsCache;
SortedSet<String> checkedPeers;
Semaphore trigger;
public SecondarySearchSuperviser() {
this.abstractsCache = new TreeMap<String, SortedMap<String, StringBuilder>>();
this.checkedPeers = new TreeSet<String>();
this.trigger = new Semaphore(0);
}
/**
* add a single abstract to the existing set of abstracts
* @param wordhash
* @param singleAbstract // a mapping from url-hashes to a string of peer-hashes
*/
public void addAbstract(final String wordhash, final TreeMap<String, StringBuilder> singleAbstract) {
final SortedMap<String, StringBuilder> oldAbstract;
synchronized (this.abstractsCache) {
oldAbstract = this.abstractsCache.get(wordhash);
if (oldAbstract == null) {
// new abstracts in the cache
this.abstractsCache.put(wordhash, singleAbstract);
return;
}
}
// extend the abstracts in the cache: join the single abstracts
new Thread() {
public void run() {
for (final Map.Entry<String, StringBuilder> oneref: singleAbstract.entrySet()) {
final String urlhash = oneref.getKey();
final StringBuilder peerlistNew = oneref.getValue();
synchronized (oldAbstract) {
final StringBuilder peerlistOld = oldAbstract.put(urlhash, peerlistNew);
if (peerlistOld != null) peerlistOld.append(peerlistNew);
}
}
}
}.start();
// abstractsCache.put(wordhash, oldAbstract); // put not necessary since it is sufficient to just change the value content (it stays assigned)
}
public void commitAbstract() {
this.trigger.release();
}
private String wordsFromPeer(final String peerhash, final StringBuilder urls) {
Map.Entry<String, SortedMap<String, StringBuilder>> entry;
String word, url, wordlist = "";
StringBuilder peerlist;
SortedMap<String, StringBuilder> urlPeerlist;
int p;
boolean hasURL;
synchronized (this) {
final Iterator<Map.Entry <String, SortedMap<String, StringBuilder>>> i = this.abstractsCache.entrySet().iterator();
while (i.hasNext()) {
entry = i.next();
word = entry.getKey();
urlPeerlist = entry.getValue();
hasURL = true;
for (int j = 0; j < urls.length(); j = j + 12) {
url = urls.substring(j, j + 12);
peerlist = urlPeerlist.get(url);
p = (peerlist == null) ? -1 : peerlist.indexOf(peerhash);
if ((p < 0) || (p % 12 != 0)) {
hasURL = false;
break;
}
}
if (hasURL) wordlist += word;
}
}
return wordlist;
}
@Override
public void run() {
try {
int t = 0;
while (this.trigger.tryAcquire(10000, TimeUnit.MILLISECONDS)) {
// a trigger was released
prepareSecondarySearch();
t++;
if (t > 10) break;
}
} catch (final InterruptedException e) {
// the thread was interrupted
// do nohing
}
// the time-out was reached
}
private void prepareSecondarySearch() {
if (this.abstractsCache == null || this.abstractsCache.size() != SearchEvent.this.query.queryHashes.size()) return; // secondary search not possible (yet)
// catch up index abstracts and join them; then call peers again to submit their urls
/*
System.out.println("DEBUG-INDEXABSTRACT: " + abstractsCache.size() + " word references caught, " + query.queryHashes.size() + " needed");
for (Map.Entry<String, TreeMap<String, String>> entry: abstractsCache.entrySet()) {
System.out.println("DEBUG-INDEXABSTRACT: hash " + entry.getKey() + ": " + ((query.queryHashes.has(entry.getKey().getBytes()) ? "NEEDED" : "NOT NEEDED") + "; " + entry.getValue().size() + " entries"));
}
*/
// find out if there are enough references for all words that are searched
if (this.abstractsCache.size() != SearchEvent.this.query.queryHashes.size()) return;
// join all the urlhash:peerlist relations: the resulting map has values with a combined peer-list list
final SortedMap<String, StringBuilder> abstractJoin = SetTools.joinConstructive(this.abstractsCache.values(), true);
if (abstractJoin.isEmpty()) return;
// the join result is now a urlhash: peer-list relation
// generate a list of peers that have the urls for the joined search result
final SortedMap<String, StringBuilder> secondarySearchURLs = new TreeMap<String, StringBuilder>(); // a (peerhash:urlhash-liststring) mapping
String url, peer;
StringBuilder urls, peerlist;
final String mypeerhash = SearchEvent.this.peers.mySeed().hash;
boolean mypeerinvolved = false;
int mypeercount;
for (final Map.Entry<String, StringBuilder> entry: abstractJoin.entrySet()) {
url = entry.getKey();
peerlist = entry.getValue();
//System.out.println("DEBUG-INDEXABSTRACT: url " + url + ": from peers " + peerlist);
mypeercount = 0;
for (int j = 0; j < peerlist.length(); j += 12) {
peer = peerlist.substring(j, j + 12);
if ((peer.equals(mypeerhash)) && (mypeercount++ > 1)) continue;
//if (peers.indexOf(peer) < j) continue; // avoid doubles that may appear in the abstractJoin
urls = secondarySearchURLs.get(peer);
if (urls == null) {
urls = new StringBuilder(24);
urls.append(url);
secondarySearchURLs.put(peer, urls);
} else {
urls.append(url);
}
secondarySearchURLs.put(peer, urls);
}
if (mypeercount == 1) mypeerinvolved = true;
}
// compute words for secondary search and start the secondary searches
String words;
SearchEvent.this.secondarySearchThreads = new RemoteSearch[(mypeerinvolved) ? secondarySearchURLs.size() - 1 : secondarySearchURLs.size()];
int c = 0;
for (final Map.Entry<String, StringBuilder> entry: secondarySearchURLs.entrySet()) {
peer = entry.getKey();
if (peer.equals(mypeerhash)) continue; // we don't need to ask ourself
if (this.checkedPeers.contains(peer)) continue; // do not ask a peer again
urls = entry.getValue();
words = wordsFromPeer(peer, urls);
if (words.length() == 0) continue; // ???
assert words.length() >= 12 : "words = " + words;
//System.out.println("DEBUG-INDEXABSTRACT ***: peer " + peer + " has urls: " + urls + " from words: " + words);
SearchEvent.this.rankingProcess.moreFeeders(1);
this.checkedPeers.add(peer);
SearchEvent.this.secondarySearchThreads[c++] = RemoteSearch.secondaryRemoteSearch(
words, urls.toString(), 6000, SearchEvent.this.query.getSegment(), SearchEvent.this.peers, SearchEvent.this.rankingProcess, peer, Switchboard.urlBlacklist,
SearchEvent.this.query.ranking, SearchEvent.this.query.constraint, SearchEvent.this.preselectedPeerHashes);
}
}
}
public SnippetProcess result() {
return this.resultFetcher;
}
public boolean workerAlive() {
if (this.resultFetcher== null || this.resultFetcher.workerThreads == null) return false;
for (final Worker w: this.resultFetcher.workerThreads) if (w != null && w.isAlive()) return true;
return false;
}
}