Mirror of https://github.com/yacy/yacy_search_server.git (synced 2024-09-21 00:00:13 +02:00)
7bcfa033c9
Cache accesses shall no longer be made directly to the cache; all loading attempts shall go through the LoaderDispatcher. To control cache usage, an enum instance of CrawlProfile.CacheStrategy shall be used. Some direct loading methods that did not take a cache strategy have been removed. This also affects the verify option of the yacysearch servlet: after this commit, 'verify=false' does not necessarily mean that no snippets are generated. Instead, all snippets that can be retrieved from the cache alone are presented. The search hit still counts as unverified, because the snippet was generated from the cache. If a cache-based generation of snippets is not possible, verify=false causes the link not to be rejected. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6936 6c8d7289-2bf4-0310-a012-ef5d649a1542
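In practice this means a caller no longer touches the cache itself: it picks a CrawlProfile.CacheStrategy and lets the LoaderDispatcher decide where content comes from. A minimal sketch of the pattern, using only the two strategy methods that the code below confirms (isAllowedToFetchOnline() and mustBeOffline()); the enum constant and the dispatcher call are assumptions, not taken from this file:

    CrawlProfile.CacheStrategy strategy = CrawlProfile.CacheStrategy.CACHEONLY; // assumed constant: never load online
    if (strategy.isAllowedToFetchOnline()) {
        // the dispatcher may fall back to the network on a cache miss
    } else {
        // cache-only: a miss yields no snippet, but must not reject the hit
        // (this is the verify=false behaviour described above)
    }
    // response = loader.load(url, strategy); // hypothetical call; the exact signature is not shown in this file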
401 lines
19 KiB
Java
// ResultFetcher.java
// (C) 2005 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 10.10.2005 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

package de.anomic.search;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.Map;

import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.Condenser;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.index.HandleSet;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.EventTracker;
import net.yacy.kelondro.util.SortStack;
import net.yacy.kelondro.util.SortStore;
import net.yacy.repository.LoaderDispatcher;

import de.anomic.crawler.CrawlProfile;
import de.anomic.yacy.yacySeedDB;
import de.anomic.yacy.graphics.ProfilingGraph;

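// Fetches result entries for a prepared search: drains the RankingProcess,
// attaches text or media snippets via the LoaderDispatcher according to the
// query's cache strategy, applies post-ranking and serves the ordered results.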
public class ResultFetcher {

    // input values
    final RankingProcess rankedCache; // ordered search results, grows dynamically as all the query threads enrich this container
    QueryParams query;
    private final yacySeedDB peers;

    // result values
    protected final LoaderDispatcher loader;
    protected Worker[] workerThreads;
    protected final SortStore<ResultEntry> result;
    protected final SortStore<MediaSnippet> images; // container to sort images by size
    protected final HandleSet failedURLs; // a set of urlhashes that could not be verified during search
    protected final HandleSet snippetFetchWordHashes; // a set of word hashes that are used to match with the snippets
    long urlRetrievalAllTime;
    long snippetComputationAllTime;
    int taketimeout;

    public ResultFetcher(
            final LoaderDispatcher loader,
            RankingProcess rankedCache,
            final QueryParams query,
            final yacySeedDB peers,
            final int taketimeout) {

        this.loader = loader;
        this.rankedCache = rankedCache;
        this.query = query;
        this.peers = peers;
        this.taketimeout = taketimeout;

        this.urlRetrievalAllTime = 0;
        this.snippetComputationAllTime = 0;
        this.result = new SortStore<ResultEntry>(-1, true); // this is the result, enriched with snippets, ranked and ordered by ranking
        this.images = new SortStore<MediaSnippet>(-1, true);
        this.failedURLs = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0); // a set of url hashes that worker threads tried to process, but failed on

        // snippets do not need to match with the complete query hashes,
        // only with the query minus the stopwords which had not been used for the search
        HandleSet filtered;
        try {
            filtered = HandleSet.joinConstructive(query.queryHashes, Switchboard.stopwordHashes);
        } catch (RowSpaceExceededException e) {
            Log.logException(e);
            filtered = new HandleSet(query.queryHashes.row().primaryKeyLength, query.queryHashes.comparator(), 0);
        }
        this.snippetFetchWordHashes = query.queryHashes.clone();
        if (filtered != null && !filtered.isEmpty()) {
            this.snippetFetchWordHashes.excludeDestructive(Switchboard.stopwordHashes);
        }

        // start worker threads to fetch urls and snippets
        this.workerThreads = null;
        deployWorker(Math.min(10, query.itemsPerPage), query.neededResults());
        EventTracker.update("SEARCH", new ProfilingGraph.searchEvent(query.id(true), this.workerThreads.length + " online snippet fetch threads started", 0, 0), false, 30000, ProfilingGraph.maxTime);
    }

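    // Start the snippet-fetch workers unless some are still alive. Note the
    // policy below: if the cache strategy never permits online loading, a
    // single worker suffices, because cache-only snippet generation is cheap;
    // otherwise up to deployCount workers load and verify in parallel.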
    public void deployWorker(int deployCount, int neededResults) {
        if (anyWorkerAlive()) return;
        this.workerThreads = new Worker[(query.snippetCacheStrategy.isAllowedToFetchOnline()) ? deployCount : 1];
        for (int i = 0; i < workerThreads.length; i++) {
            this.workerThreads[i] = new Worker(i, 10000, query.snippetCacheStrategy, neededResults);
            this.workerThreads[i].start();
        }
    }

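    // A worker counts as alive only if its thread is running and its run()
    // loop has given a life sign within the last three seconds; stalled
    // workers are treated as dead so that callers may redeploy.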
    boolean anyWorkerAlive() {
        if (this.workerThreads == null) return false;
        for (int i = 0; i < this.workerThreads.length; i++) {
            if ((this.workerThreads[i] != null) &&
                (this.workerThreads[i].isAlive()) &&
                (this.workerThreads[i].busytime() < 3000)) return true;
        }
        return false;
    }

    public long getURLRetrievalTime() {
        return this.urlRetrievalAllTime;
    }

    public long getSnippetComputationTime() {
        return this.snippetComputationAllTime;
    }

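    // One snippet-fetch thread: it repeatedly takes the next URL from the
    // ranked cache, attaches a snippet according to the cache strategy,
    // applies post-ranking and pushes the scored entry onto the shared result
    // stack until the timeout is reached or enough results are collected.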
    protected class Worker extends Thread {

        private final long timeout; // the date until this thread should try to work
        private long lastLifeSign; // when the last time the run()-loop was executed
        private final int id;
        private final CrawlProfile.CacheStrategy cacheStrategy;
        private final int neededResults;

        public Worker(final int id, final long maxlifetime, CrawlProfile.CacheStrategy cacheStrategy, int neededResults) {
            this.id = id;
            this.cacheStrategy = cacheStrategy;
            this.lastLifeSign = System.currentTimeMillis();
            this.timeout = System.currentTimeMillis() + Math.max(1000, maxlifetime);
            this.neededResults = neededResults;
        }

        public void run() {

            // start fetching urls and snippets
            URIMetadataRow page;
            //final int fetchAhead = snippetMode == 0 ? 0 : 10;
            boolean nav_topics = query.navigators.equals("all") || query.navigators.indexOf("topics") >= 0;
            try {
                while (System.currentTimeMillis() < this.timeout) {
                    if (result.size() > neededResults) break;
                    this.lastLifeSign = System.currentTimeMillis();

                    // check if we have enough
                    if ((query.contentdom == ContentDomain.IMAGE) && (images.size() >= query.neededResults() + 50)) break;
                    if ((query.contentdom != ContentDomain.IMAGE) && (result.size() >= query.neededResults() + 10)) break;

                    // get next entry
                    page = rankedCache.takeURL(true, taketimeout);
                    if (page == null) break;
                    if (failedURLs.has(page.hash())) continue;

                    final ResultEntry resultEntry = fetchSnippet(page, cacheStrategy); // does not fetch snippets if cacheStrategy is null

                    if (resultEntry == null) continue; // the entry had some problems, cannot be used
                    if (result.exists(resultEntry)) continue;

                    urlRetrievalAllTime += resultEntry.dbRetrievalTime;
                    snippetComputationAllTime += resultEntry.snippetComputationTime;
                    //System.out.println("+++DEBUG-resultWorker+++ fetched " + resultEntry.urlstring());

                    // place the result on the result stack and apply post-ranking
                    long ranking = Long.valueOf(rankedCache.getOrder().cardinal(resultEntry.word()));
                    ranking += postRanking(resultEntry, rankedCache.getTopics());
                    //System.out.println("*** resultEntry.hash = " + resultEntry.hash());
                    result.push(resultEntry, ranking);
                    if (nav_topics) rankedCache.addTopics(resultEntry);
                    //System.out.println("DEBUG SNIPPET_LOADING: thread " + id + " got " + resultEntry.url());
                }
            } catch (final Exception e) {
                Log.logException(e);
            }
            Log.logInfo("SEARCH", "resultWorker thread " + id + " terminated");
        }

        public long busytime() {
            return System.currentTimeMillis() - this.lastLifeSign;
        }
    }

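    // Decision table for fetchSnippet, as implemented below:
    //   cacheStrategy == null        -> entry without snippet (no fetch at all)
    //   snippet found                -> entry with snippet attached
    //   no snippet, mustBeOffline()  -> entry without snippet (a miss is not a rejection)
    //   no snippet, online allowed   -> failure registered, entry rejected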
    protected ResultEntry fetchSnippet(final URIMetadataRow page, CrawlProfile.CacheStrategy cacheStrategy) {
        // snippet fetching is controlled by the cache strategy:
        // null      - do not fetch snippets
        // offline   - generate snippets from the cache only
        // otherwise - snippets may also be loaded online

        // load only urls if there was not yet a root url of that hash
        // find the url entry

        long startTime = System.currentTimeMillis();
        final URIMetadataRow.Components metadata = page.metadata();
        if (metadata == null) return null;
        final long dbRetrievalTime = System.currentTimeMillis() - startTime;

        if (cacheStrategy == null) {
            return new ResultEntry(page, query.getSegment(), peers, null, null, dbRetrievalTime, 0); // result without snippet
        }

        // load snippet
        if (query.contentdom == ContentDomain.TEXT) {
            // attach text snippet
            startTime = System.currentTimeMillis();
            final TextSnippet snippet = TextSnippet.retrieveTextSnippet(
                    this.loader,
                    metadata,
                    snippetFetchWordHashes,
                    cacheStrategy,
                    ((query.constraint != null) && (query.constraint.get(Condenser.flag_cat_indexof))),
                    180,
                    Integer.MAX_VALUE,
                    query.isGlobal());
            final long snippetComputationTime = System.currentTimeMillis() - startTime;
            Log.logInfo("SEARCH", "text snippet load time for " + metadata.url() + ": " + snippetComputationTime + ", " + ((snippet.getErrorCode() < 11) ? "snippet found" : ("no snippet found (" + snippet.getError() + ")")));

            if (snippet.getErrorCode() < 11) {
                // we loaded the file and found the snippet
                return new ResultEntry(page, query.getSegment(), peers, snippet, null, dbRetrievalTime, snippetComputationTime); // result with snippet attached
            } else if (cacheStrategy.mustBeOffline()) {
                // we did not demand online loading, therefore a failure does not mean that the missing snippet causes a rejection of this result
                // this may happen during a remote search, because snippet loading is omitted to retrieve results faster
                return new ResultEntry(page, query.getSegment(), peers, null, null, dbRetrievalTime, snippetComputationTime); // result without snippet
            } else {
                // problems with snippet fetch
                registerFailure(page.hash(), "no text snippet for URL " + metadata.url() + "; errorCode = " + snippet.getErrorCode());
                return null;
            }
        } else {
            // attach media information
            startTime = System.currentTimeMillis();
            final ArrayList<MediaSnippet> mediaSnippets = MediaSnippet.retrieveMediaSnippets(metadata.url(), snippetFetchWordHashes, query.contentdom, cacheStrategy, 6000, query.isGlobal());
            final long snippetComputationTime = System.currentTimeMillis() - startTime;
            Log.logInfo("SEARCH", "media snippet load time for " + metadata.url() + ": " + snippetComputationTime);

            if (mediaSnippets != null && !mediaSnippets.isEmpty()) {
                // found media snippets, return entry
                return new ResultEntry(page, query.getSegment(), peers, null, mediaSnippets, dbRetrievalTime, snippetComputationTime);
            } else if (cacheStrategy.mustBeOffline()) {
                return new ResultEntry(page, query.getSegment(), peers, null, null, dbRetrievalTime, snippetComputationTime);
            } else {
                // problems with snippet fetch
                registerFailure(page.hash(), "no media snippet for URL " + metadata.url());
                return null;
            }
        }
        // finished, no more actions possible here
    }

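    // Remember a URL hash that failed verification so that no worker retries
    // it within this search, and log the reason it was sorted out.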
    private void registerFailure(final byte[] urlhash, final String reason) {
        try {
            this.failedURLs.put(urlhash);
        } catch (RowSpaceExceededException e) {
            Log.logException(e);
        }
        Log.logInfo("SEARCH", "sorted out urlhash " + new String(urlhash) + " during search: " + reason);
    }

    public int resultCount() {
        return this.result.size();
    }

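    // Blocking accessor for the item-th result: answers from the result store
    // if possible, redeploys workers if they died before enough results were
    // collected, then waits in short steps while workers are still busy.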
    public ResultEntry oneResult(final int item) {
        // check if we already retrieved this item
        // (happens if a search page is accessed a second time)
        EventTracker.update("SEARCH", new ProfilingGraph.searchEvent(query.id(true), "obtain one result entry - start", 0, 0), false, 30000, ProfilingGraph.maxTime);
        if (this.result.sizeStore() > item) {
            // we have the wanted result already in the result array .. return that
            return this.result.element(item).element;
        }
        /*
        System.out.println("rankedCache.size() = " + this.rankedCache.size());
        System.out.println("result.size() = " + this.result.size());
        System.out.println("query.neededResults() = " + query.neededResults());
        */
        if ((!anyWorkerAlive()) &&
            (((query.contentdom == ContentDomain.IMAGE) && (images.size() + 30 < query.neededResults())) ||
             (this.result.size() < query.neededResults())) &&
            //(event.query.onlineSnippetFetch) &&
            (this.rankedCache.size() > this.result.size())
           ) {
            // start worker threads to fetch urls and snippets
            deployWorker(Math.min(10, query.itemsPerPage), query.neededResults());
        }

        // finally wait until enough results are produced by the
        // snippet fetch process
        while ((anyWorkerAlive()) && (result.size() <= item)) {
            try {Thread.sleep((item % query.itemsPerPage) * 10L);} catch (final InterruptedException e) {}
        }

        // finally, if there is something, return the result
        if (this.result.size() <= item) return null;
        return this.result.element(item).element;
    }

    private int resultCounter = 0;
    public ResultEntry nextResult() {
        final ResultEntry re = oneResult(resultCounter);
        resultCounter++;
        return re;
    }

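    // Like oneResult, but for image search: pulls the next result entry,
    // flattens its media snippets onto the image stack (sorted by ranking)
    // and then serves the requested item from that stack.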
    public MediaSnippet oneImage(final int item) {
        // check if we already retrieved this item (happens if a search page is accessed a second time)
        if (this.images.sizeStore() > item) {
            // we have the wanted result already in the result array .. return that
            return this.images.element(item).element;
        }

        // generate result object
        final ResultEntry result = nextResult();
        MediaSnippet ms;
        if (result != null) {
            // iterate over all images in the result
            final ArrayList<MediaSnippet> imagemedia = result.mediaSnippets();
            if (imagemedia != null) {
                for (int j = 0; j < imagemedia.size(); j++) {
                    ms = imagemedia.get(j);
                    images.push(ms, Long.valueOf(ms.ranking));
                    //System.out.println("*** image " + ms.href.hash() + " images.size = " + images.size() + "/" + images.size());
                }
            }
        }

        // now take the specific item from the image stack
        if (this.images.size() <= item) return null;
        return this.images.element(item).element;
    }

    public ArrayList<SortStack<ResultEntry>.stackElement> completeResults(final long waitingtime) {
        final long timeout = System.currentTimeMillis() + waitingtime;
        while ((result.size() < query.neededResults()) && (anyWorkerAlive()) && (System.currentTimeMillis() < timeout)) {
            try {Thread.sleep(100);} catch (final InterruptedException e) {}
            //System.out.println("+++DEBUG-completeResults+++ sleeping " + 200);
        }
        return this.result.list(this.result.size());
    }

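    // Post-ranking heuristics applied on top of the cardinal order from the
    // ranked cache: media-rich pages score higher in media searches, hits on
    // the 'prefer' pattern are boosted, and URL/title components are matched
    // against the topwords navigator and against the query hashes themselves.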
    public long postRanking(
            final ResultEntry rentry,
            final Map<String, Navigator.Item> topwords) {

        long r = 0;

        // for media search: prefer pages with many links
        if (query.contentdom == ContentDomain.IMAGE) r += rentry.limage() << query.ranking.coeff_cathasimage;
        if (query.contentdom == ContentDomain.AUDIO) r += rentry.laudio() << query.ranking.coeff_cathasaudio;
        if (query.contentdom == ContentDomain.VIDEO) r += rentry.lvideo() << query.ranking.coeff_cathasvideo;
        if (query.contentdom == ContentDomain.APP  ) r += rentry.lapp()   << query.ranking.coeff_cathasapp;

        // prefer hit with 'prefer' pattern
        if (query.prefer.matcher(rentry.url().toNormalform(true, true)).matches()) r += 256 << query.ranking.coeff_prefer;
        if (query.prefer.matcher(rentry.title()).matches()) r += 256 << query.ranking.coeff_prefer;

        // apply 'common-sense' heuristic using references
        final String urlstring = rentry.url().toNormalform(true, true);
        final String[] urlcomps = MultiProtocolURI.urlComps(urlstring);
        final String[] descrcomps = MultiProtocolURI.splitpattern.split(rentry.title().toLowerCase());
        Navigator.Item tc;
        for (int j = 0; j < urlcomps.length; j++) {
            tc = topwords.get(urlcomps[j]);
            if (tc != null) r += Math.max(1, tc.count) << query.ranking.coeff_urlcompintoplist;
        }
        for (int j = 0; j < descrcomps.length; j++) {
            tc = topwords.get(descrcomps[j]);
            if (tc != null) r += Math.max(1, tc.count) << query.ranking.coeff_descrcompintoplist;
        }

        // apply query-in-result matching
        final HandleSet urlcomph = Word.words2hashesHandles(urlcomps);
        final HandleSet descrcomph = Word.words2hashesHandles(descrcomps);
        final Iterator<byte[]> shi = query.queryHashes.iterator();
        byte[] queryhash;
        while (shi.hasNext()) {
            queryhash = shi.next();
            if (urlcomph.has(queryhash)) r += 256 << query.ranking.coeff_appurl;
            if (descrcomph.has(queryhash)) r += 256 << query.ranking.coeff_app_dc_title;
        }

        return r;
    }

}