yacy_search_server/source/de/anomic/plasma/plasmaSearchEvent.java
orbiter 01e0669264 re-designed some parts of the DHT position calculation (the effect is the same as before)
and replaced the old first-hash computation with a new method that tries to find a gap in the current DHT.
To do this, it is necessary that the network bootstrapping is done before the peer's own hash is computed;
this made further redesigns of the peer initialization order necessary.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4117 6c8d7289-2bf4-0310-a012-ef5d649a1542
2007-10-01 12:30:23 +00:00


// plasmaSearchEvent.java
// (C) 2005 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 10.10.2005 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.plasma;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import de.anomic.index.indexContainer;
import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroMSetTools;
import de.anomic.kelondro.kelondroRow;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacyDHTAction;
import de.anomic.yacy.yacySearch;
import de.anomic.yacy.yacySeed;
import de.anomic.yacy.yacyURL;
public final class plasmaSearchEvent {
public static int workerThreadCount = 10;
public static String lastEventID = "";
private static HashMap lastEvents = new HashMap(); // a cache for objects from this class: re-use old search requests
public static final long eventLifetime = 600000; // the time an event will stay in the cache: 10 minutes
private long eventTime;
private plasmaSearchQuery query;
private plasmaSearchRankingProfile ranking;
private plasmaWordIndex wordIndex;
private plasmaSearchContainer rankedCache; // ordered search results, grows dynamically as all the query threads enrich this container
private Map rcAbstracts; // cache for index abstracts; word:TreeMap mapping where the embedded TreeMap is a urlhash:peerlist relation
private plasmaSearchProcessing process;
private yacySearch[] primarySearchThreads, secondarySearchThreads;
private Thread localSearchThread;
private TreeMap preselectedPeerHashes;
//private Object[] references;
public TreeMap IAResults, IACount;
public String IAmaxcounthash, IAneardhthash;
private int localcount;
private resultWorker[] workerThreads;
private ArrayList resultList; // list of this.Entry objects
//private int resultListLock; // a pointer that shows that all elements below this pointer are fixed and may not be changed again
private HashMap failedURLs; // a mapping from a urlhash to a fail reason string
TreeSet snippetFetchWordHashes; // a set of word hashes that are used to match with the snippets
private long urlRetrievalAllTime;
private long snippetComputationAllTime;
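// The constructor performs the complete search: for global DHT or cluster searches it
// starts remote search threads plus a concurrent local search; otherwise it computes
// the local result directly (optionally generating index abstracts). It then either
// deploys worker threads for online snippet fetching or fills the result list offline,
// and finally registers this event in the lastEvents cache for later re-use.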
private plasmaSearchEvent(plasmaSearchQuery query,
plasmaSearchRankingProfile ranking,
plasmaSearchProcessing localTiming,
plasmaWordIndex wordIndex,
TreeMap preselectedPeerHashes,
boolean generateAbstracts,
TreeSet abstractSet) {
this.eventTime = System.currentTimeMillis(); // for lifetime check
this.wordIndex = wordIndex;
this.query = query;
this.ranking = ranking;
this.rcAbstracts = (query.queryHashes.size() > 1) ? new TreeMap() : null; // generate abstracts only for combined searches
this.process = localTiming;
this.primarySearchThreads = null;
this.secondarySearchThreads = null;
this.preselectedPeerHashes = preselectedPeerHashes;
this.IAResults = new TreeMap();
this.IACount = new TreeMap();
this.IAmaxcounthash = null;
this.IAneardhthash = null;
this.localcount = 0;
this.urlRetrievalAllTime = 0;
this.snippetComputationAllTime = 0;
this.workerThreads = null;
this.resultList = new ArrayList(10); // this is the result set which is filled up with search results, enriched with snippets
//this.resultListLock = 0; // no locked elements until now
this.failedURLs = new HashMap(); // a map from url hashes to reason strings for URLs that a worker thread tried to process but failed on
// snippets do not need to match with the complete query hashes,
// only with the query hashes minus the stopwords, which are not used for the search
final TreeSet filtered = kelondroMSetTools.joinConstructive(query.queryHashes, plasmaSwitchboard.stopwords);
this.snippetFetchWordHashes = (TreeSet) query.queryHashes.clone();
if ((filtered != null) && (filtered.size() > 0)) {
kelondroMSetTools.excludeDestructive(this.snippetFetchWordHashes, plasmaSwitchboard.stopwords);
}
long start = System.currentTimeMillis();
if ((query.domType == plasmaSearchQuery.SEARCHDOM_GLOBALDHT) ||
(query.domType == plasmaSearchQuery.SEARCHDOM_CLUSTERALL)) {
this.rankedCache = new plasmaSearchContainer(query, ranking, plasmaSearchQuery.cleanQuery(query.queryString)[0]);
int fetchpeers = (int) (query.maximumTime / 500L); // number of target peers: one peer per half second of maximum search time
if (fetchpeers > 50) fetchpeers = 50;
if (fetchpeers < 30) fetchpeers = 30;
// do a global search
// the results of the fetch are inserted into the rankedCache
process.startTimer();
serverLog.logFine("SEARCH_EVENT", "STARTING " + fetchpeers + " THREADS TO CATCH EACH " + query.displayResults() + " URLs");
this.primarySearchThreads = yacySearch.primaryRemoteSearches(
plasmaSearchQuery.hashSet2hashString(query.queryHashes),
plasmaSearchQuery.hashSet2hashString(query.excludeHashes),
"",
query.prefer,
query.urlMask,
query.displayResults(),
query.maxDistance,
wordIndex,
rankedCache,
rcAbstracts,
fetchpeers,
plasmaSwitchboard.urlBlacklist,
ranking,
query.constraint,
(query.domType == plasmaSearchQuery.SEARCHDOM_GLOBALDHT) ? null : preselectedPeerHashes);
process.yield("remote search thread start", this.primarySearchThreads.length);
// meanwhile do a local search
localSearchThread = new localSearchProcess();
localSearchThread.start();
// finished searching
serverLog.logFine("SEARCH_EVENT", "SEARCH TIME AFTER GLOBAL-TRIGGER TO " + primarySearchThreads.length + " PEERS: " + ((System.currentTimeMillis() - start) / 1000) + " seconds");
} else {
Map[] searchContainerMaps = process.localSearchContainers(query, wordIndex, null);
if ((generateAbstracts) && (searchContainerMaps != null)) { // localSearchContainers may return null, see check below
// compute index abstracts
process.startTimer();
Iterator ci = searchContainerMaps[0].entrySet().iterator();
Map.Entry entry;
int maxcount = -1;
double mindhtdistance = 1.1, d;
String wordhash;
while (ci.hasNext()) {
entry = (Map.Entry) ci.next();
wordhash = (String) entry.getKey();
indexContainer container = (indexContainer) entry.getValue();
assert (container.getWordHash().equals(wordhash));
if (container.size() > maxcount) {
IAmaxcounthash = wordhash;
maxcount = container.size();
}
d = yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed().hash, wordhash);
if (d < mindhtdistance) {
// calculate the word hash that is closest to our dht position
mindhtdistance = d;
IAneardhthash = wordhash;
}
IACount.put(wordhash, new Integer(container.size()));
IAResults.put(wordhash, plasmaSearchProcessing.compressIndex(container, null, 1000).toString());
}
process.yield("abstract generation", searchContainerMaps[0].size());
}
indexContainer rcLocal =
(searchContainerMaps == null) ?
plasmaWordIndex.emptyContainer(null, 0) :
process.localSearchJoinExclude(
searchContainerMaps[0].values(),
searchContainerMaps[1].values(),
query.maxDistance);
this.localcount = rcLocal.size();
plasmaSearchPreOrder sort = new plasmaSearchPreOrder(query, process, ranking, rcLocal);
process.startTimer();
rcLocal = sort.strippedContainer(200);
process.yield("result strip", rcLocal.size());
this.rankedCache = new plasmaSearchContainer(query, ranking, plasmaSearchQuery.cleanQuery(query.queryString)[0], rcLocal);
}
if (query.onlineSnippetFetch) {
// start worker threads to fetch urls and snippets
this.workerThreads = new resultWorker[workerThreadCount];
for (int i = 0; i < workerThreadCount; i++) {
this.workerThreads[i] = new resultWorker(i, process.getTargetTime() * 3);
this.workerThreads[i].start();
}
} else {
// prepare result vector directly without worker threads
int rankedIndex = 0;
process.startTimer();
while ((rankedIndex < rankedCache.container().size()) && (resultList.size() < (query.neededResults()))) {
// fetch next entry to work on
indexContainer c = rankedCache.container();
indexRWIEntry entry = new indexRWIEntry(c.get(rankedIndex++));
indexURLEntry page = wordIndex.loadedURL.load(entry.urlHash(), entry);
if (page == null) {
registerFailure(entry.urlHash(), "url does not exist in lurl-db");
continue;
}
ResultEntry resultEntry = obtainResultEntry(page, false);
if (resultEntry == null) continue; // the entry had some problems, cannot be used
urlRetrievalAllTime += resultEntry.dbRetrievalTime;
snippetComputationAllTime += resultEntry.snippetComputationTime;
// place the result to the result vector
synchronized (resultList) {
resultList.add(resultEntry);
}
// add references
synchronized (rankedCache) {
rankedCache.addReferences(resultEntry);
}
}
process.yield("offline snippet fetch", resultList.size());
}
// clean up events
cleanupEvents();
// store this search to a cache so it can be re-used
lastEvents.put(query.id(), this);
lastEventID = query.id();
}
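// Thread that performs the local index search concurrently with the remote searches;
// it joins and excludes the local containers, pre-sorts and strips the result to 200
// entries, and inserts them into the shared rankedCache.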
private class localSearchProcess extends Thread {
public localSearchProcess() {
}
public void run() {
// do a local search
Map[] searchContainerMaps = process.localSearchContainers(query, wordIndex, null);
// use the search containers to fill up rcAbstracts locally
/*
if ((rcAbstracts != null) && (searchContainerMap != null)) {
Iterator i, ci = searchContainerMap.entrySet().iterator();
Map.Entry entry;
String wordhash;
indexContainer container;
TreeMap singleAbstract;
String mypeerhash = yacyCore.seedDB.mySeed.hash;
while (ci.hasNext()) {
entry = (Map.Entry) ci.next();
wordhash = (String) entry.getKey();
container = (indexContainer) entry.getValue();
// collect all urlhashes from the container
synchronized (rcAbstracts) {
singleAbstract = (TreeMap) rcAbstracts.get(wordhash); // a mapping from url-hashes to a string of peer-hashes
if (singleAbstract == null) singleAbstract = new TreeMap();
i = container.entries();
while (i.hasNext()) singleAbstract.put(((indexEntry) i.next()).urlHash(), mypeerhash);
rcAbstracts.put(wordhash, singleAbstract);
}
}
}
*/
// join and exclude the local result
indexContainer rcLocal =
(searchContainerMaps == null) ?
plasmaWordIndex.emptyContainer(null, 0) :
process.localSearchJoinExclude(
searchContainerMaps[0].values(),
searchContainerMaps[1].values(),
query.maxDistance);
localcount = rcLocal.size();
// sort the local containers and truncate it to a limited count,
// so following sortings together with the global results will be fast
plasmaSearchPreOrder firstsort = new plasmaSearchPreOrder(query, process, ranking, rcLocal);
rcLocal = firstsort.strippedContainer(200);
synchronized (rankedCache) {
rankedCache.insert(rcLocal, true, true);
}
}
}
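// Removes events from the cache that are older than eventLifetime; before removal,
// URL references that failed during the event are deleted from the word index.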
private static void cleanupEvents() {
// remove old events in the event cache
Iterator i = lastEvents.entrySet().iterator();
plasmaSearchEvent cleanEvent;
while (i.hasNext()) {
cleanEvent = (plasmaSearchEvent) ((Map.Entry) i.next()).getValue();
if (cleanEvent.eventTime + eventLifetime < System.currentTimeMillis()) {
// execute deletion of failed words
Set removeWords = (TreeSet) cleanEvent.query.queryHashes.clone(); // clone to avoid mutating the query's own hash set
removeWords.addAll(cleanEvent.query.excludeHashes);
cleanEvent.wordIndex.removeEntriesMultiple(removeWords, cleanEvent.failedURLs.keySet());
serverLog.logInfo("SearchEvents", "cleaning up event " + cleanEvent.query.id() + ", removed " + cleanEvent.failedURLs.size() + " URL references on " + removeWords.size() + " words");
// remove the event
i.remove();
}
}
}
private ResultEntry obtainResultEntry(indexURLEntry page, boolean fetchSnippetOnline) {
// producing a ResultEntry from a search result needs some work:
// - check if url entry exists in LURL-db
// - check exclusions, constraints, masks, media-domains
// - load snippet (see if page exists) and check if snippet contains searched word
// load only urls if there was not yet a root url of that hash
// find the url entry
long startTime = System.currentTimeMillis();
indexURLEntry.Components comp = page.comp();
String pagetitle = comp.title().toLowerCase();
if (comp.url() == null) {
registerFailure(page.hash(), "url corrupted (null)");
return null; // rare case where the url is corrupted
}
String pageurl = comp.url().toString().toLowerCase();
String pageauthor = comp.author().toLowerCase();
long dbRetrievalTime = System.currentTimeMillis() - startTime;
// check exclusion
if ((plasmaSearchQuery.matches(pagetitle, query.excludeHashes)) ||
(plasmaSearchQuery.matches(pageurl, query.excludeHashes)) ||
(plasmaSearchQuery.matches(pageauthor, query.excludeHashes))) {
return null;
}
// check url mask
if (!(pageurl.matches(query.urlMask))) {
return null;
}
// check constraints
if ((!(query.constraint.equals(plasmaSearchQuery.catchall_constraint))) &&
(query.constraint.get(plasmaCondenser.flag_cat_indexof)) &&
(!(comp.title().startsWith("Index of")))) {
final Iterator wi = query.queryHashes.iterator();
while (wi.hasNext()) wordIndex.removeEntry((String) wi.next(), page.hash());
registerFailure(page.hash(), "index-of constraint not fulfilled");
return null;
}
if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_AUDIO) && (page.laudio() == 0)) {
registerFailure(page.hash(), "contentdom-audio constraint not fulfilled");
return null;
}
if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) && (page.lvideo() == 0)) {
registerFailure(page.hash(), "contentdom-video constraint not fulfilled");
return null;
}
if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) && (page.limage() == 0)) {
registerFailure(page.hash(), "contentdom-image constraint not fulfilled");
return null;
}
if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_APP) && (page.lapp() == 0)) {
registerFailure(page.hash(), "contentdom-app constraint not fulfilled");
return null;
}
// load snippet
if (query.contentdom == plasmaSearchQuery.CONTENTDOM_TEXT) {
// attach text snippet
startTime = System.currentTimeMillis();
plasmaSnippetCache.TextSnippet snippet = plasmaSnippetCache.retrieveTextSnippet(comp.url(), snippetFetchWordHashes, fetchSnippetOnline, query.constraint.get(plasmaCondenser.flag_cat_indexof), 180, 3000, (fetchSnippetOnline) ? Integer.MAX_VALUE : 100000);
long snippetComputationTime = System.currentTimeMillis() - startTime;
if (snippet.getErrorCode() < 11) {
// we loaded the file and found the snippet
return new ResultEntry(page, wordIndex, snippet, null, dbRetrievalTime, snippetComputationTime); // result with snippet attached
} else if (!fetchSnippetOnline) {
// we did not demand online loading, therefore a failure does not mean that the missing snippet causes a rejection of this result
// this may happen during a remote search, because snippet loading is omitted to retrieve results faster
return new ResultEntry(page, wordIndex, null, null, dbRetrievalTime, snippetComputationTime); // result without snippet
} else {
// problems with snippet fetch
registerFailure(page.hash(), "no text snippet for URL " + comp.url());
plasmaSnippetCache.failConsequences(snippet, query.id());
return null;
}
} else {
// attach media information
startTime = System.currentTimeMillis();
ArrayList mediaSnippets = plasmaSnippetCache.retrieveMediaSnippets(comp.url(), snippetFetchWordHashes, query.contentdom, fetchSnippetOnline, 6000);
long snippetComputationTime = System.currentTimeMillis() - startTime;
if ((mediaSnippets != null) && (mediaSnippets.size() > 0)) {
// found media snippets, return entry
return new ResultEntry(page, wordIndex, null, mediaSnippets, dbRetrievalTime, snippetComputationTime);
} else if (!fetchSnippetOnline) {
return new ResultEntry(page, wordIndex, null, null, dbRetrievalTime, snippetComputationTime);
} else {
// problems with snippet fetch
registerFailure(page.hash(), "no media snippet for URL " + comp.url());
return null;
}
}
// finished, no more actions possible here
}
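// true if at least one snippet-fetching worker thread is still running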
private boolean anyWorkerAlive() {
if (this.workerThreads == null) return false;
for (int i = 0; i < workerThreadCount; i++) {
if ((this.workerThreads[i] != null) && (this.workerThreads[i].isAlive())) return true;
}
return false;
}
private boolean anyRemoteSearchAlive() {
// check primary search threads
if ((this.primarySearchThreads != null) && (this.primarySearchThreads.length != 0)) {
for (int i = 0; i < this.primarySearchThreads.length; i++) {
if ((this.primarySearchThreads[i] != null) && (this.primarySearchThreads[i].isAlive())) return true;
}
}
// maybe a secondary search thread is alive, check this
if ((this.secondarySearchThreads != null) && (this.secondarySearchThreads.length != 0)) {
for (int i = 0; i < this.secondarySearchThreads.length; i++) {
if ((this.secondarySearchThreads[i] != null) && (this.secondarySearchThreads[i].isAlive())) return true;
}
}
return false;
}
public plasmaSearchQuery getQuery() {
return query;
}
public plasmaSearchRankingProfile getRanking() {
return ranking;
}
public plasmaSearchProcessing getProcess() {
return process;
}
public yacySearch[] getPrimarySearchThreads() {
return primarySearchThreads;
}
public yacySearch[] getSecondarySearchThreads() {
return secondarySearchThreads;
}
public int getLocalCount() {
return this.localcount;
}
public int getGlobalCount() {
return this.rankedCache.getGlobalCount();
}
public long getURLRetrievalTime() {
return this.urlRetrievalAllTime;
}
public long getSnippetComputationTime() {
return this.snippetComputationAllTime;
}
public static plasmaSearchEvent getEvent(String eventID) {
synchronized (lastEvents) {
return (plasmaSearchEvent) lastEvents.get(eventID);
}
}
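// Returns a cached search event for the given query, or creates a new one. A cached
// event gets a renewed lifetime and the fresh query object (which carries the current
// result offset); if its workers died before enough results were collected, new
// worker threads are deployed to fetch the missing links.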
public static plasmaSearchEvent getEvent(plasmaSearchQuery query,
plasmaSearchRankingProfile ranking,
plasmaSearchProcessing localTiming,
plasmaWordIndex wordIndex,
TreeMap preselectedPeerHashes,
boolean generateAbstracts,
TreeSet abstractSet) {
synchronized (lastEvents) {
plasmaSearchEvent event = (plasmaSearchEvent) lastEvents.get(query.id());
if (event == null) {
event = new plasmaSearchEvent(query, ranking, localTiming, wordIndex, preselectedPeerHashes, generateAbstracts, abstractSet);
} else {
// renew the event time for this event, so it is not deleted too early next time
event.eventTime = System.currentTimeMillis();
// replace the query, because this contains the current result offset
event.query = query;
}
// if worker threads had been alive, but did not succeed, start them again to fetch missing links
if ((query.onlineSnippetFetch) &&
(!event.anyWorkerAlive()) &&
(event.resultList.size() < query.neededResults() + 10) &&
((event.getLocalCount() + event.getGlobalCount()) > event.resultList.size())) {
// set new timeout
event.eventTime = System.currentTimeMillis();
// start worker threads to fetch urls and snippets
event.workerThreads = new resultWorker[workerThreadCount];
for (int i = 0; i < workerThreadCount; i++) {
event.workerThreads[i] = event.deployWorker(i, 3 * event.process.getTargetTime());
}
}
return event;
}
}
private resultWorker deployWorker(int id, long lifetime) {
resultWorker worker = new resultWorker(id, lifetime);
worker.start();
return worker;
}
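// Worker thread that turns ranked index entries into displayable results: it picks
// the next unprocessed entry from the rankedCache, loads the URL entry from the
// loaded-URL db, fetches a snippet online, and appends successful results to the
// result list until enough results are computed or its lifetime is over. Threads
// stagger their start with a sleep proportional to their id while remote searches run.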
private class resultWorker extends Thread {
private indexRWIEntry entry; // entry this thread is working on
private long timeout; // the date until this thread should try to work
private long sleeptime; // the sleeptime of this thread at the beginning of its life
private int id;
public resultWorker(int id, long lifetime) {
this.id = id;
this.timeout = System.currentTimeMillis() + lifetime;
this.sleeptime = lifetime / 10 * id;
this.entry = null;
}
public void run() {
// sleep first to give remote loading threads a chance to fetch entries
if (anyRemoteSearchAlive()) try {Thread.sleep(this.sleeptime);} catch (InterruptedException e1) {}
// start fetching urls and snippets
while (true) {
if (resultList.size() > query.neededResults() + query.displayResults()) break; // computed enough
if (System.currentTimeMillis() > this.timeout) break; // time is over
// try secondary search
prepareSecondarySearch(); // will be executed only once
// fetch next entry to work on
this.entry = null;
entry = nextOrder();
if (entry == null) {
if (anyRemoteSearchAlive()) {
// wait and try again
try {Thread.sleep(100);} catch (InterruptedException e) {}
continue;
} else {
// no remote search is alive any more, so no further results will come in
break;
}
}
indexURLEntry page = wordIndex.loadedURL.load(entry.urlHash(), entry);
if (page == null) {
registerFailure(entry.urlHash(), "url does not exist in lurl-db");
continue;
}
ResultEntry resultEntry = obtainResultEntry(page, true);
if (resultEntry == null) continue; // the entry had some problems, cannot be used
urlRetrievalAllTime += resultEntry.dbRetrievalTime;
snippetComputationAllTime += resultEntry.snippetComputationTime;
// place the result to the result vector
synchronized (resultList) {
resultList.add(resultEntry);
}
// add references
synchronized (rankedCache) {
rankedCache.addReferences(resultEntry);
}
System.out.println("DEBUG SNIPPET_LOADING: thread " + id + " got " + resultEntry.url());
}
serverLog.logInfo("SEARCH", "resultWorker thread " + id + " terminated");
}
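// returns the next entry in ranking order from the rankedCache that has not failed,
// is not being processed by another worker, and is not yet in the result list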
private indexRWIEntry nextOrder() {
synchronized (rankedCache) {
indexContainer c = rankedCache.container();
kelondroRow.Entry entry;
String urlhash;
for (int i = 0; i < c.size(); i++) {
entry = c.get(i);
urlhash = new String(entry.getColBytes(0));
if ((anyFailureWith(urlhash)) || (anyWorkerWith(urlhash)) || (anyResultWith(urlhash))) continue;
return new indexRWIEntry(entry);
}
}
return null; // no more entries available
}
private boolean anyWorkerWith(String urlhash) {
for (int i = 0; i < workerThreadCount; i++) {
if ((workerThreads[i] == null) || (workerThreads[i] == this)) continue;
if ((workerThreads[i].entry != null) && (workerThreads[i].entry.urlHash().equals(urlhash))) return true;
}
return false;
}
private boolean anyResultWith(String urlhash) {
for (int i = 0; i < resultList.size(); i++) {
if (((ResultEntry) resultList.get(i)).urlentry.hash().equals(urlhash)) return true;
}
return false;
}
private boolean anyFailureWith(String urlhash) {
return (failedURLs.get(urlhash) != null);
}
}
private void registerFailure(String urlhash, String reason) {
this.failedURLs.put(urlhash, reason);
serverLog.logInfo("search", "sorted out hash " + urlhash + " during search: " + reason);
}
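// Returns the result at the given item position. The method first sleeps to give the
// accumulation threads a chance to deliver, then applies a post-ranking and may swap
// a better-ranked entry into the requested position before returning it.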
public ResultEntry oneResult(int item) {
// first sleep a while to give accumulation threads a chance to work
long sleeptime = this.eventTime + (this.query.maximumTime / this.query.displayResults() * ((item % this.query.displayResults()) + 1)) - System.currentTimeMillis();
if ((anyWorkerAlive()) && (sleeptime > 0)) {
try {Thread.sleep(sleeptime);} catch (InterruptedException e) {}
}
// if there are fewer than 10 more results available, sleep some extra time so that the "common sense" post-ranking algorithm gets a chance to work
if ((this.resultList.size() <= item + 10) && (anyWorkerAlive())) {
try {Thread.sleep(300);} catch (InterruptedException e) {}
}
// then sleep until any result is available (that should not happen)
while ((this.resultList.size() <= item) && (anyWorkerAlive())) {
try {Thread.sleep(100);} catch (InterruptedException e) {}
}
// finally, if there is something, return the result
synchronized (this.resultList) {
// check if we have enough entries
if (this.resultList.size() <= item) return null;
// fetch the best entry from the resultList, not necessarily the entry at the item position;
// whenever an entry is switched to a new position and returned from here,
// that item position is considered fixed and may not be changed again
int bestpick = postRankingFavourite(item);
if (bestpick != item) {
// switch the elements
ResultEntry buf = (ResultEntry) this.resultList.get(bestpick);
serverLog.logInfo("SEARCH_POSTRANKING", "preferring [" + bestpick + "] " + buf.urlstring() + " over [" + item + "] " + ((ResultEntry) this.resultList.get(item)).urlstring());
this.resultList.set(bestpick, (ResultEntry) this.resultList.get(item));
this.resultList.set(item, buf);
}
//this.resultListLock = item; // lock the element; be prepared to return it
return (ResultEntry) this.resultList.get(item);
}
}
private int postRankingFavourite(int item) {
// do a post-ranking on resultList, which should be locked at the time of this call
long rank, bestrank = 0;
int bestitem = item;
ResultEntry entry;
for (int i = item; i < this.resultList.size(); i++) {
entry = (ResultEntry) this.resultList.get(i);
rank = this.ranking.postRanking(this.query, this.references(10), entry, item);
if (rank > bestrank) {
bestrank = rank;
bestitem = i;
}
}
return bestitem;
}
/*
public void removeRedundant() {
// remove all urls from the pageAcc structure that occur double by specific redundancy rules
// a link is redundant, if a sub-path of the url is cited before. redundant urls are removed
// we find redundant urls by iteration over all elements in pageAcc
Iterator i = pageAcc.entrySet().iterator();
HashMap paths = new HashMap(); // a url-subpath to pageAcc-key relation
Map.Entry entry;
// first scan all entries and find all urls that are referenced
while (i.hasNext()) {
entry = (Map.Entry) i.next();
paths.put(((indexURLEntry) entry.getValue()).comp().url().toNormalform(true, true), entry.getKey());
//if (path != null) path = shortenPath(path);
//if (path != null) paths.put(path, entry.getKey());
}
// now scan the pageAcc again and remove all redundant urls
i = pageAcc.entrySet().iterator();
String shorten;
while (i.hasNext()) {
entry = (Map.Entry) i.next();
shorten = shortenPath(((indexURLEntry) entry.getValue()).comp().url().toNormalform(true, true));
// scan all subpaths of the url
while (shorten != null) {
if (pageAcc.size() <= query.wantedResults) break;
if (paths.containsKey(shorten)) {
//System.out.println("deleting path from search result: " + path + " is redundant to " + shorten);
try {
i.remove();
} catch (IllegalStateException e) {
}
}
shorten = shortenPath(shorten);
}
}
}
private static String shortenPath(String path) {
int pos = path.lastIndexOf('/');
if (pos < 0) return null;
return path.substring(0, pos);
}
*/
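// waits until the wanted number of results is available, no worker is alive any more,
// or the given waiting time is exhausted, then returns the current result list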
public ArrayList completeResults(long waitingtime) {
long timeout = System.currentTimeMillis() + waitingtime;
while ((this.resultList.size() < query.neededResults()) && (anyWorkerAlive()) && (System.currentTimeMillis() < timeout)) {
try {Thread.sleep(200);} catch (InterruptedException e) {}
}
return this.resultList;
}
/*
// generate Result.Entry objects and optionally fetch snippets
int i = 0;
Entry entry;
final boolean includeSnippets = false;
while ((acc.hasMoreElements()) && (i < query.wantedResults)) {
try {
entry = new Entry(acc.nextElement(), wordIndex);
} catch (final RuntimeException e) {
continue;
}
// check bluelist again: filter out all links where any
// bluelisted word
// appear either in url, url's description or search word
// the search word was sorted out earlier
if (includeSnippets) {
entry.setSnippet(plasmaSnippetCache.retrieveTextSnippet(
entry.url(), query.queryHashes, false,
entry.flags().get(plasmaCondenser.flag_cat_indexof), 260,
1000));
// snippet =
// snippetCache.retrieveTextSnippet(comp.url(),
// query.queryHashes, false,
// urlentry.flags().get(plasmaCondenser.flag_cat_indexof),
// 260, 1000);
} else {
// snippet = null;
entry.setSnippet(null);
}
i++;
hits.add(entry);
}
*/
boolean secondarySearchStarted = false;
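// Starts the secondary search once index abstracts for all query words have arrived:
// the abstracts are joined to find urls that appear for every query word, and the
// peers that hold them are asked again directly for these urls. Executed only once
// per event.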
private void prepareSecondarySearch() {
if (secondarySearchStarted) return; // don't do this twice
if ((rcAbstracts == null) || (rcAbstracts.size() != query.queryHashes.size())) return; // secondary search not possible (yet)
this.secondarySearchStarted = true;
// catch up index abstracts and join them; then call peers again to submit their urls
System.out.println("DEBUG-INDEXABSTRACT: " + rcAbstracts.size() + " word references caught, " + query.queryHashes.size() + " needed");
Iterator i = rcAbstracts.entrySet().iterator();
Map.Entry entry;
while (i.hasNext()) {
entry = (Map.Entry) i.next();
System.out.println("DEBUG-INDEXABSTRACT: hash " + (String) entry.getKey() + ": " + ((query.queryHashes.contains((String) entry.getKey())) ? "NEEDED" : "NOT NEEDED") + "; " + ((TreeMap) entry.getValue()).size() + " entries");
}
TreeMap abstractJoin = (rcAbstracts.size() == query.queryHashes.size()) ? kelondroMSetTools.joinConstructive(rcAbstracts.values(), true) : new TreeMap();
if (abstractJoin.size() == 0) {
System.out.println("DEBUG-INDEXABSTRACT: no success using index abstracts from remote peers");
} else {
System.out.println("DEBUG-INDEXABSTRACT: index abstracts delivered " + abstractJoin.size() + " additional results for secondary search");
// generate query for secondary search
TreeMap secondarySearchURLs = new TreeMap(); // a (peerhash:urlhash-liststring) mapping
Iterator i1 = abstractJoin.entrySet().iterator();
Map.Entry entry1;
String url, urls, peer, peers;
String mypeerhash = yacyCore.seedDB.mySeed().hash;
boolean mypeerinvolved = false;
int mypeercount;
while (i1.hasNext()) {
entry1 = (Map.Entry) i1.next();
url = (String) entry1.getKey();
peers = (String) entry1.getValue();
System.out.println("DEBUG-INDEXABSTRACT: url " + url + ": from peers " + peers);
mypeercount = 0;
for (int j = 0; j < peers.length(); j = j + 12) {
peer = peers.substring(j, j + 12);
if ((peer.equals(mypeerhash)) && (mypeercount++ > 1)) continue;
//if (peers.indexOf(peer) < j) continue; // avoid doubles that may appear in the abstractJoin
urls = (String) secondarySearchURLs.get(peer);
urls = (urls == null) ? url : urls + url;
secondarySearchURLs.put(peer, urls);
}
if (mypeercount == 1) mypeerinvolved = true;
}
// compute words for secondary search and start the secondary searches
i1 = secondarySearchURLs.entrySet().iterator();
String words;
secondarySearchThreads = new yacySearch[(mypeerinvolved) ? secondarySearchURLs.size() - 1 : secondarySearchURLs.size()];
int c = 0;
while (i1.hasNext()) {
entry1 = (Map.Entry) i1.next();
peer = (String) entry1.getKey();
if (peer.equals(mypeerhash)) continue; // we don't need to ask ourselves
urls = (String) entry1.getValue();
words = wordsFromPeer(peer, urls);
System.out.println("DEBUG-INDEXABSTRACT ***: peer " + peer + " has urls: " + urls);
System.out.println("DEBUG-INDEXABSTRACT ***: peer " + peer + " from words: " + words);
secondarySearchThreads[c++] = yacySearch.secondaryRemoteSearch(
words, "", urls, wordIndex, this.rankedCache, peer, plasmaSwitchboard.urlBlacklist,
ranking, query.constraint, preselectedPeerHashes);
}
}
}
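// Computes the word hashes that shall be searched on the given peer. urls is a
// concatenation of 12-character url hashes; a word qualifies if the peer appears in
// the abstract's peer list for every one of these url hashes (peer lists are likewise
// concatenated 12-character peer hashes).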
private String wordsFromPeer(String peerhash, String urls) {
Map.Entry entry;
String word, peerlist, url, wordlist = "";
TreeMap urlPeerlist;
int p;
boolean hasURL;
synchronized (rcAbstracts) {
Iterator i = rcAbstracts.entrySet().iterator();
while (i.hasNext()) {
entry = (Map.Entry) i.next();
word = (String) entry.getKey();
urlPeerlist = (TreeMap) entry.getValue();
hasURL = true;
for (int j = 0; j < urls.length(); j = j + 12) {
url = urls.substring(j, j + 12);
peerlist = (String) urlPeerlist.get(url);
p = (peerlist == null) ? -1 : peerlist.indexOf(peerhash);
if ((p < 0) || (p % 12 != 0)) {
hasURL = false;
break;
}
}
if (hasURL) wordlist += word;
}
}
return wordlist;
}
public void remove(String urlhash) {
// removes the url hash reference from last search result
/*indexRWIEntry e =*/ this.rankedCache.remove(urlhash);
//assert e != null;
}
public Set references(int count) {
// returns a set of words that are computed as toplist
return this.rankedCache.getReferences(count);
}
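// A single search result: wraps the indexURLEntry together with its text or media
// snippets and retrieval timings. For .yacyh share hosts the constructor translates
// the host part into the peer's current address; if the peer is unknown, the stale
// index entries are removed and a RuntimeException("index void") is thrown.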
public static class ResultEntry {
// payload objects
private indexURLEntry urlentry;
private indexURLEntry.Components urlcomps; // buffer for components
private String alternative_urlstring;
private String alternative_urlname;
private plasmaSnippetCache.TextSnippet textSnippet;
private ArrayList /* of plasmaSnippetCache.MediaSnippet */ mediaSnippets;
// statistic objects
public long dbRetrievalTime, snippetComputationTime;
public ResultEntry(indexURLEntry urlentry, plasmaWordIndex wordIndex, plasmaSnippetCache.TextSnippet textSnippet, ArrayList mediaSnippets,
long dbRetrievalTime, long snippetComputationTime) {
this.urlentry = urlentry;
this.urlcomps = urlentry.comp();
this.alternative_urlstring = null;
this.alternative_urlname = null;
this.textSnippet = textSnippet;
this.mediaSnippets = mediaSnippets;
this.dbRetrievalTime = dbRetrievalTime;
this.snippetComputationTime = snippetComputationTime;
String host = urlcomps.url().getHost();
if (host.endsWith(".yacyh")) {
// translate host into current IP
int p = host.indexOf(".");
String hash = yacySeed.hexHash2b64Hash(host.substring(p + 1, host.length() - 6));
yacySeed seed = yacyCore.seedDB.getConnected(hash);
String filename = urlcomps.url().getFile();
String address = null;
if ((seed == null) || ((address = seed.getPublicAddress()) == null)) {
// seed is not known from here
try {
wordIndex.removeWordReferences(
plasmaCondenser.getWords(
("yacyshare " +
filename.replace('?', ' ') +
" " +
urlcomps.title()).getBytes(), "UTF-8").keySet(),
urlentry.hash());
wordIndex.loadedURL.remove(urlentry.hash()); // clean up
throw new RuntimeException("index void");
} catch (UnsupportedEncodingException e) {
throw new RuntimeException("parser failed: " + e.getMessage());
}
}
alternative_urlstring = "http://" + address + "/" + host.substring(0, p) + filename;
alternative_urlname = "http://share." + seed.getName() + ".yacy" + filename;
if ((p = alternative_urlname.indexOf("?")) > 0) alternative_urlname = alternative_urlname.substring(0, p);
}
}
public String hash() {
return urlentry.hash();
}
public yacyURL url() {
return urlcomps.url();
}
public kelondroBitfield flags() {
return urlentry.flags();
}
public String urlstring() {
return (alternative_urlstring == null) ? urlcomps.url().toNormalform(false, true) : alternative_urlstring;
}
public String urlname() {
return (alternative_urlname == null) ? urlcomps.url().toNormalform(false, true) : alternative_urlname;
}
public String title() {
return urlcomps.title();
}
public plasmaSnippetCache.TextSnippet textSnippet() {
return this.textSnippet;
}
public ArrayList /* of plasmaSnippetCache.MediaSnippet */ mediaSnippets() {
return this.mediaSnippets;
}
public Date modified() {
return urlentry.moddate();
}
public int filesize() {
return urlentry.size();
}
public int limage() {
return urlentry.limage();
}
public int laudio() {
return urlentry.laudio();
}
public int lvideo() {
return urlentry.lvideo();
}
public int lapp() {
return urlentry.lapp();
}
public indexRWIEntry word() {
return urlentry.word();
}
public boolean hasTextSnippet() {
return (this.textSnippet != null) && (this.textSnippet.getErrorCode() < 11);
}
public boolean hasMediaSnippets() {
return (this.mediaSnippets != null) && (this.mediaSnippets.size() > 0);
}
public String resource() {
// generate transport resource
if ((textSnippet != null) && (textSnippet.exists())) {
return urlentry.toString(textSnippet.getLineRaw());
} else {
return urlentry.toString();
}
}
}
}