yacy_search_server/source/de/anomic/search/SearchEvent.java
orbiter 93ea0a4789 enhanced remove operation in search consequences (which are triggered when the snippet fetch proves that the word has disappeared from the page that was stored in the index)
- no direct deletion of references during search (shifted to the time after the search)
- bundling of all deletions for the references of a single word into one remove operation
- enhanced remove operation by ensuring that the collection is stored sorted (experimental)
- more String -> byte[] transition for search word lists
- clean up of unused code
- enhanced memory allocation of RowSet objects (uses slightly less memory that was previously wasted)
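The bundled removal can be seen in cleanup() below: instead of deleting each stale URL reference individually while the search runs, all failed URL hashes are collected and removed with a single call per word after the search has finished. A minimal sketch of the pattern (names follow the code below; illustrative only, not a drop-in snippet):

    // one remove operation per word, covering all failed URLs at once
    final Iterator<byte[]> words = removeWords.iterator(); // query hashes + exclude hashes
    while (words.hasNext()) {
        segment.termIndex().remove(words.next(), failedURLs); // bundled deletion
    }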


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6823 6c8d7289-2bf4-0310-a012-ef5d649a1542
2010-04-20 13:45:22 +00:00


// SearchEvent.java
// (C) 2005 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 10.10.2005 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.search;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.data.word.WordReferenceVars;
import net.yacy.kelondro.index.HandleSet;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.rwi.ReferenceContainer;
import net.yacy.kelondro.util.EventTracker;
import net.yacy.kelondro.util.MemoryControl;
import net.yacy.kelondro.util.SetTools;
import net.yacy.repository.LoaderDispatcher;
import de.anomic.crawler.ResultURLs;
import de.anomic.yacy.yacySearch;
import de.anomic.yacy.yacySeedDB;
import de.anomic.yacy.dht.FlatWordPartitionScheme;
import de.anomic.yacy.graphics.ProfilingGraph;

public final class SearchEvent {
public static final String INITIALIZATION = "initialization";
public static final String COLLECTION = "collection";
public static final String JOIN = "join";
public static final String PRESORT = "presort";
public static final String URLFETCH = "urlfetch";
public static final String NORMALIZING = "normalizing";
public static final String FINALIZATION = "finalization";
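// upper bound for the number of results that are pre-ranked before snippets are fetched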
public static final int max_results_preparation = 3000;
// class variables that may be implemented with an abstract class
private long eventTime;
private QueryParams query;
private final yacySeedDB peers;
private RankingProcess rankedCache; // ordered search results, grows dynamically as all the query threads enrich this container
private ResultFetcher results;
// class variables for search abstracts
private final IndexAbstracts rcAbstracts; // cache for index abstracts; word:TreeMap mapping where the embedded TreeMap is a urlhash:peerlist relation
// class variables for remote searches
private yacySearch[] primarySearchThreads, secondarySearchThreads;
private final TreeMap<byte[], String> preselectedPeerHashes;
private final ResultURLs crawlResults;
private final Thread localSearchThread;
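// index abstract (IA) caches; filled only when generateAbstracts is requested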
private final TreeMap<byte[], String> IAResults;
private final TreeMap<byte[], Integer> IACount;
private byte[] IAmaxcounthash, IAneardhthash;
private final ReferenceOrder order;
@SuppressWarnings("unchecked") SearchEvent(final QueryParams query,
final yacySeedDB peers,
final ResultURLs crawlResults,
final TreeMap<byte[], String> preselectedPeerHashes,
final boolean generateAbstracts,
final LoaderDispatcher loader) {
this.eventTime = System.currentTimeMillis(); // for lifetime check
this.peers = peers;
this.crawlResults = crawlResults;
this.query = query;
this.rcAbstracts = (query.queryHashes.size() > 1) ? new IndexAbstracts() : null; // generate abstracts only for combined searches
this.primarySearchThreads = null;
this.secondarySearchThreads = null;
this.preselectedPeerHashes = preselectedPeerHashes;
this.IAResults = new TreeMap<byte[], String>(Base64Order.enhancedCoder);
this.IACount = new TreeMap<byte[], Integer>(Base64Order.enhancedCoder);
this.IAmaxcounthash = null;
this.IAneardhthash = null;
this.localSearchThread = null;
this.order = new ReferenceOrder(query.ranking, query.targetlang);
final long start = System.currentTimeMillis();
if ((query.domType == QueryParams.SEARCHDOM_GLOBALDHT) ||
(query.domType == QueryParams.SEARCHDOM_CLUSTERALL)) {
final int fetchpeers = 12;
// initialize a ranking process that is the target for data
// that is generated concurrently from local and global search threads
this.rankedCache = new RankingProcess(this.query, this.order, max_results_preparation, fetchpeers + 1);
// start a local search concurrently
this.rankedCache.start();
// start global searches
final long timer = System.currentTimeMillis();
Log.logFine("SEARCH_EVENT", "STARTING " + fetchpeers + " THREADS TO CATCH EACH " + query.displayResults() + " URLs");
this.primarySearchThreads = (query.queryHashes.isEmpty()) ? null : yacySearch.primaryRemoteSearches(
QueryParams.hashSet2hashString(query.queryHashes),
QueryParams.hashSet2hashString(query.excludeHashes),
"",
query.prefer,
query.urlMask,
query.targetlang == null ? "" : query.targetlang,
query.sitehash == null ? "" : query.sitehash,
query.authorhash == null ? "" : query.authorhash,
query.displayResults(),
query.maxDistance,
query.getSegment(),
peers,
crawlResults,
rankedCache,
rcAbstracts,
fetchpeers,
Switchboard.urlBlacklist,
query.ranking,
query.constraint,
(query.domType == QueryParams.SEARCHDOM_GLOBALDHT) ? null : preselectedPeerHashes);
if (this.primarySearchThreads != null) {
if (this.primarySearchThreads.length > fetchpeers) this.rankedCache.moreFeeders(this.primarySearchThreads.length - fetchpeers);
EventTracker.update("SEARCH", new ProfilingGraph.searchEvent(query.id(true), "remote search thread start", this.primarySearchThreads.length, System.currentTimeMillis() - timer), false, 30000, ProfilingGraph.maxTime);
// finished triggering the remote searches
Log.logFine("SEARCH_EVENT", "SEARCH TIME AFTER GLOBAL-TRIGGER TO " + primarySearchThreads.length + " PEERS: " + ((System.currentTimeMillis() - start) / 1000) + " seconds");
} else {
// no search started since the query is empty; the user may have entered no data, or filters removed all search words
Log.logFine("SEARCH_EVENT", "NO SEARCH STARTED DUE TO EMPTY SEARCH REQUEST.");
}
// start worker threads to fetch urls and snippets
this.results = new ResultFetcher(loader, rankedCache, query, peers, 10000);
} else {
// do a local search
this.rankedCache = new RankingProcess(this.query, this.order, max_results_preparation, 2);
this.rankedCache.run(); // run the local search in this thread (not started as a separate thread)
//CrawlSwitchboard.Finding finding = wordIndex.retrieveURLs(query, false, 2, ranking, process);
if (generateAbstracts) {
// compute index abstracts
final long timer = System.currentTimeMillis();
int maxcount = -1;
long mindhtdistance = Long.MAX_VALUE, l;
byte[] wordhash;
assert this.rankedCache.searchContainerMap() != null;
for (Map.Entry<byte[], ReferenceContainer<WordReference>> entry : this.rankedCache.searchContainerMap().entrySet()) {
wordhash = entry.getKey();
final ReferenceContainer<WordReference> container = entry.getValue();
assert (Base64Order.enhancedCoder.equal(container.getTermHash(), wordhash)) : "container.getTermHash() = " + new String(container.getTermHash()) + ", wordhash = " + new String(wordhash);
if (container.size() > maxcount) {
IAmaxcounthash = wordhash;
maxcount = container.size();
}
l = FlatWordPartitionScheme.std.dhtDistance(wordhash, null, peers.mySeed());
if (l < mindhtdistance) {
// calculate the word hash that is closest to our dht position
mindhtdistance = l;
IAneardhthash = wordhash;
}
IACount.put(wordhash, Integer.valueOf(container.size()));
IAResults.put(wordhash, ReferenceContainer.compressIndex(container, null, 1000).toString());
}
EventTracker.update("SEARCH", new ProfilingGraph.searchEvent(query.id(true), "abstract generation", this.rankedCache.searchContainerMap().size(), System.currentTimeMillis() - timer), false, 30000, ProfilingGraph.maxTime);
}
// start worker threads to fetch urls and snippets
this.results = new ResultFetcher(loader, rankedCache, query, peers, 300);
}
// clean up events
SearchEventCache.cleanupEvents(false);
EventTracker.update("SEARCH", new ProfilingGraph.searchEvent(query.id(true), "event-cleanup", 0, 0), false, 30000, ProfilingGraph.maxTime);
// store this search to a cache so it can be re-used
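// if memory is tight (less than 10 MB available), clean up the event cache aggressively before caching this event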
if (MemoryControl.available() < 1024 * 1024 * 10) SearchEventCache.cleanupEvents(true);
SearchEventCache.put(query.id(false), this);
}
public ReferenceOrder getOrder() {
return this.order;
}
public long getEventTime() {
return this.eventTime;
}
public void resetEventTime() {
this.eventTime = System.currentTimeMillis();
}
public QueryParams getQuery() {
return this.query;
}
public void setQuery(QueryParams query) {
this.query = query;
this.results.query = query;
}
public void cleanup() {
// execute deletion of failed words
int rw = this.results.failedURLs.size();
if (rw > 0) {
long start = System.currentTimeMillis();
final HandleSet removeWords = query.queryHashes; // note: this aliases the query's word set, which putAll below extends
try {
removeWords.putAll(query.excludeHashes);
} catch (RowSpaceExceededException e1) {
Log.logException(e1);
}
try {
final Iterator<byte[]> j = removeWords.iterator();
// remove the same url hashes for multiple words
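// one remove call per word deletes all failed URL references at once; deferring this
// until cleanup avoids slowing down the search itself (see commit message)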
while (j.hasNext()) {
this.query.getSegment().termIndex().remove(j.next(), this.results.failedURLs);
}
} catch (IOException e) {
Log.logException(e);
}
Log.logInfo("SearchEvents", "cleaning up event " + query.id(true) + ", removed " + rw + " URL references on " + removeWords.size() + " words in " + (System.currentTimeMillis() - start) + " milliseconds");
}
}
public Iterator<Map.Entry<byte[], String>> abstractsString() {
return this.IAResults.entrySet().iterator();
}
public String abstractsString(byte[] hash) {
return this.IAResults.get(hash);
}
public Iterator<Map.Entry<byte[], Integer>> abstractsCount() {
return this.IACount.entrySet().iterator();
}
public int abstractsCount(byte[] hash) {
Integer i = this.IACount.get(hash);
if (i == null) return -1;
return i.intValue();
}
public byte[] getAbstractsMaxCountHash() {
return this.IAmaxcounthash;
}
public byte[] getAbstractsNearDHTHash() {
return this.IAneardhthash;
}
boolean anyRemoteSearchAlive() {
// check primary search threads
if ((this.primarySearchThreads != null) && (this.primarySearchThreads.length != 0)) {
for (int i = 0; i < this.primarySearchThreads.length; i++) {
if ((this.primarySearchThreads[i] != null) && (this.primarySearchThreads[i].isAlive())) return true;
}
}
// maybe a secondary search thread is alive, check this
if ((this.secondarySearchThreads != null) && (this.secondarySearchThreads.length != 0)) {
for (int i = 0; i < this.secondarySearchThreads.length; i++) {
if ((this.secondarySearchThreads[i] != null) && (this.secondarySearchThreads[i].isAlive())) return true;
}
}
return false;
}
public yacySearch[] getPrimarySearchThreads() {
return primarySearchThreads;
}
public yacySearch[] getSecondarySearchThreads() {
return secondarySearchThreads;
}
public RankingProcess getRankingResult() {
return this.rankedCache;
}
public ArrayList<Navigator.Item> getNamespaceNavigator(int maxentries) {
return this.rankedCache.getNamespaceNavigator(maxentries);
}
public List<Navigator.Item> getHostNavigator(int maxentries) {
return this.rankedCache.getHostNavigator(maxentries);
}
public List<Navigator.Item> getTopicNavigator(final int maxentries) {
// returns a set of words that are computed as toplist
return this.rankedCache.getTopicNavigator(maxentries);
}
public List<Navigator.Item> getAuthorNavigator(final int maxentries) {
// returns a list of authors so far seen on result set
return this.rankedCache.getAuthorNavigator(maxentries);
}
public ResultEntry oneResult(final int item) {
if ((query.domType == QueryParams.SEARCHDOM_GLOBALDHT) ||
(query.domType == QueryParams.SEARCHDOM_CLUSTERALL)) {
// this is a search using remote search threads. Also the local
// search thread is started as background process
if ((localSearchThread != null) && (localSearchThread.isAlive())) {
// in case that the local search takes longer than some other
// remote search requests, wait that the local process terminates first
try {localSearchThread.join();} catch (InterruptedException e) {/* ignored; continue with the results available so far */}
}
}
return this.results.oneResult(item);
}
boolean secondarySearchStartet = false;
void prepareSecondarySearch() {
if (secondarySearchStartet) return; // don't do this twice
if ((rcAbstracts == null) || (rcAbstracts.size() != query.queryHashes.size())) return; // secondary search not possible (yet)
this.secondarySearchStartet = true;
/*
// catch up index abstracts and join them; then call peers again to submit their urls
System.out.println("DEBUG-INDEXABSTRACT: " + rcAbstracts.size() + " word references caught, " + query.queryHashes.size() + " needed");
Iterator i = rcAbstracts.entrySet().iterator();
Map.Entry entry;
while (i.hasNext()) {
entry = (Map.Entry) i.next();
System.out.println("DEBUG-INDEXABSTRACT: hash " + (String) entry.getKey() + ": " + ((query.queryHashes.contains((String) entry.getKey())) ? "NEEDED" : "NOT NEEDED") + "; " + ((TreeMap) entry.getValue()).size() + " entries");
}
*/
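// join the abstracts of all query words constructively: only url hashes known for every
// word survive; each value is the concatenation of the hashes of the peers holding the url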
final TreeMap<String, String> abstractJoin = (rcAbstracts.size() == query.queryHashes.size()) ? SetTools.joinConstructive(rcAbstracts.values(), true) : new TreeMap<String, String>();
if (!abstractJoin.isEmpty()) {
//System.out.println("DEBUG-INDEXABSTRACT: index abstracts delivered " + abstractJoin.size() + " additional results for secondary search");
// generate query for secondary search
final TreeMap<String, String> secondarySearchURLs = new TreeMap<String, String>(); // a (peerhash:urlhash-liststring) mapping
Iterator<Map.Entry<String, String>> i1 = abstractJoin.entrySet().iterator();
Map.Entry<String, String> entry1;
String url, urls, peer, ps;
final String mypeerhash = peers.mySeed().hash;
boolean mypeerinvolved = false;
int mypeercount;
while (i1.hasNext()) {
entry1 = i1.next();
url = entry1.getKey();
ps = entry1.getValue();
//System.out.println("DEBUG-INDEXABSTRACT: url " + url + ": from peers " + peers);
mypeercount = 0;
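// the peer list is a concatenation of 12-character peer hashes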
for (int j = 0; j < ps.length(); j = j + 12) {
peer = ps.substring(j, j + 12);
if ((peer.equals(mypeerhash)) && (mypeercount++ > 1)) continue;
//if (peers.indexOf(peer) < j) continue; // avoid doubles that may appear in the abstractJoin
urls = secondarySearchURLs.get(peer);
urls = (urls == null) ? url : urls + url;
secondarySearchURLs.put(peer, urls);
}
if (mypeercount == 1) mypeerinvolved = true;
}
// compute words for secondary search and start the secondary searches
i1 = secondarySearchURLs.entrySet().iterator();
String words;
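// allocate one search thread per remote peer; our own peer (if involved) is skipped below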
secondarySearchThreads = new yacySearch[(mypeerinvolved) ? secondarySearchURLs.size() - 1 : secondarySearchURLs.size()];
int c = 0;
while (i1.hasNext()) {
entry1 = i1.next();
peer = entry1.getKey();
if (peer.equals(mypeerhash)) continue; // we don't need to ask ourselves
urls = entry1.getValue();
words = rcAbstracts.wordsFromPeer(peer, urls);
assert words.length() >= 12 : "words = " + words;
//System.out.println("DEBUG-INDEXABSTRACT ***: peer " + peer + " has urls: " + urls);
//System.out.println("DEBUG-INDEXABSTRACT ***: peer " + peer + " from words: " + words);
secondarySearchThreads[c++] = yacySearch.secondaryRemoteSearch(
words, "", urls, this.query.getSegment(), peers, crawlResults, this.rankedCache, peer, Switchboard.urlBlacklist,
query.ranking, query.constraint, preselectedPeerHashes);
}
//} else {
//System.out.println("DEBUG-INDEXABSTRACT: no success using index abstracts from remote peers");
}
}
public void remove(final WordReferenceVars reference) {
this.rankedCache.remove(reference);
}
public ResultFetcher result() {
return this.results;
}
}