yacy_search_server/source/net/yacy/search/query/SnippetWorker.java
orbiter 5dfd6359cb redesign of the QueryParams class: introduced QueryGoal which holds the
query string parser. This shall be used to implement proper full-string
matching, which is then handled by QueryGoal.
2012-11-18 01:22:41 +01:00

264 lines
14 KiB
Java

/**
* SnippetWorker
* Copyright 2012 by Michael Peter Christen, mc@yacy.net, Frankfurt a. M., Germany
* First released 01.11.2012 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.search.query;
import java.util.Iterator;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.Classification;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.sorting.ConcurrentScoreMap;
import net.yacy.cora.sorting.ScoreMap;
import net.yacy.cora.sorting.WeakPriorityBlockingQueue.ReverseElement;
import net.yacy.cora.storage.HandleSet;
import net.yacy.document.Condenser;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.MemoryControl;
import net.yacy.search.index.Segment;
import net.yacy.search.snippet.ResultEntry;
import net.yacy.search.snippet.TextSnippet;
import net.yacy.search.snippet.TextSnippet.ResultClass;
/**
 * Worker thread of a {@link SearchEvent}: it repeatedly takes the next URL from
 * the event, builds a {@link ResultEntry} for it (with a text snippet where one
 * can be obtained), applies post-ranking and puts the entry into the event's
 * result queue.
 *
 * The loop in {@link #run()} ends when a stop was requested, the lifetime is
 * exceeded, memory is short, enough results are available, or no further URL
 * can be taken.
 *
 * {@link #pleaseStop()} and {@link #busytime()} are meant to be called while the
 * worker is running, i.e. possibly from another thread; the two fields they
 * touch are therefore declared {@code volatile}.
 */
public class SnippetWorker extends Thread {

    private final SearchEvent snippetProcess;
    private final long timeout; // the date until this thread should try to work
    private volatile long lastLifeSign; // when the last time the run()-loop was executed; read by busytime()
    private final CacheStrategy cacheStrategy;
    private final int neededResults;
    private volatile boolean shallrun; // cleared by pleaseStop(), polled by run()

    /**
     * @param snippetProcess the search event this worker takes URLs from and delivers results to
     * @param maxlifetime maximum lifetime in milliseconds; at least 1000 ms are always granted
     * @param cacheStrategy strategy handed to the snippet loader; if null, snippets are built without a loader
     * @param neededResults number of available results after which the worker may stop
     */
    protected SnippetWorker(final SearchEvent snippetProcess, final long maxlifetime, final CacheStrategy cacheStrategy, final int neededResults) {
        this.snippetProcess = snippetProcess;
        this.cacheStrategy = cacheStrategy;
        this.lastLifeSign = System.currentTimeMillis();
        this.timeout = System.currentTimeMillis() + Math.max(1000, maxlifetime);
        this.neededResults = neededResults;
        this.shallrun = true;
    }

    /**
     * Fetches URLs and snippets until one of the termination conditions holds.
     * Any exception raised inside the loop is logged and ends the worker.
     */
    @Override
    public void run() {
        try {
            while (this.shallrun && System.currentTimeMillis() < this.timeout) {
                this.lastLifeSign = System.currentTimeMillis();

                if (MemoryControl.shortStatus()) {
                    Log.logWarning("SnippetProcess", "shortStatus");
                    break;
                }

                // check if we have enough; we stop only if we can fetch online;
                // otherwise it is better to keep running to get better navigation
                if ((this.cacheStrategy == null || this.cacheStrategy.isAllowedToFetchOnline()) && this.snippetProcess.result.sizeAvailable() >= this.neededResults) {
                    Log.logWarning("SnippetProcess", this.snippetProcess.result.sizeAvailable() + " = result.sizeAvailable() >= this.neededResults = " + this.neededResults);
                    break;
                }

                // check if we can succeed if we try to take another url
                if (this.snippetProcess.rankingProcess.feedingIsFinished() && this.snippetProcess.rankingProcess.rwiQueueSize() == 0 && this.snippetProcess.nodeStack.sizeAvailable() == 0) {
                    Log.logWarning("SnippetProcess", "rankingProcess.feedingIsFinished() && rankingProcess.sizeQueue() == 0");
                    break;
                }

                // get next entry; wait between 20 and 500 ms, bounded by the remaining lifetime
                final URIMetadataNode page = this.snippetProcess.takeURL(true, Math.min(500, Math.max(20, this.timeout - System.currentTimeMillis())));
                if (page == null) {
                    break; // no more available
                }
                this.setName(page.url().toNormalform(true)); // to support debugging

                if (this.snippetProcess.query.filterfailurls && this.snippetProcess.workTables.failURLsContains(page.hash())) {
                    continue;
                }

                final ResultEntry resultEntry = fetchSnippet(page, this.cacheStrategy); // does not fetch snippets if snippetMode == 0
                if (resultEntry == null) {
                    continue; // the entry had some problems, cannot be used
                }
                this.snippetProcess.snippetComputationAllTime += resultEntry.snippetComputationTime;

                // compute the ranking: base ranking from the word reference (if any) plus post-ranking
                long ranking = resultEntry.word() == null ? 0L : this.snippetProcess.rankingProcess.order.cardinal(resultEntry.word());
                ranking += postRanking(resultEntry, new ConcurrentScoreMap<String>());
                resultEntry.ranking = ranking;

                // place the result to the result vector
                this.snippetProcess.result.put(new ReverseElement<ResultEntry>(resultEntry, ranking)); // remove smallest in case of overflow
                this.snippetProcess.rankingProcess.addTopics(resultEntry);
            }
            if (System.currentTimeMillis() >= this.timeout) {
                Log.logWarning("SnippetProcess", "worker ended with timeout");
            }
        } catch (final Exception e) {
            Log.logException(e);
        }
    }

    /**
     * Asks the worker to leave its loop; the request is honoured the next time
     * the loop condition in {@link #run()} is evaluated.
     */
    protected void pleaseStop() {
        this.shallrun = false;
    }

    /**
     * calculate the time since the worker has had the latest activity
     * @return time in milliseconds lasted since latest activity
     */
    protected long busytime() {
        return System.currentTimeMillis() - this.lastLifeSign;
    }

    /**
     * Computes the ranking bonus of a result entry from its link counts, citation
     * count, the 'prefer' pattern, the given top words and the query words that
     * appear in the URL or title. Each contribution is shifted left by its
     * coefficient from the query's ranking profile.
     *
     * All shifts are carried out in {@code long} arithmetic so that a large
     * operand or coefficient cannot overflow an {@code int} intermediate before
     * it is added to the {@code long} sum.
     *
     * @param rentry the result entry to rank
     * @param topwords score map of top words; components found in it with a positive score add to the ranking
     * @return the post-ranking value to be added to the base ranking
     */
    private long postRanking(
                    final ResultEntry rentry,
                    final ScoreMap<String> topwords) {

        long r = 0;

        // for media search: prefer pages with many links
        r += ((long) rentry.limage()) << this.snippetProcess.query.ranking.coeff_cathasimage;
        r += ((long) rentry.laudio()) << this.snippetProcess.query.ranking.coeff_cathasaudio;
        r += ((long) rentry.lvideo()) << this.snippetProcess.query.ranking.coeff_cathasvideo;
        r += ((long) rentry.lapp()) << this.snippetProcess.query.ranking.coeff_cathasapp;

        // apply citation count
        r += (128L * rentry.referencesCount() / (1L + 2L * rentry.llocal() + rentry.lother())) << this.snippetProcess.query.ranking.coeff_citation;

        // the normalised URL and the title are needed several times below
        final String urlstring = rentry.url().toNormalform(true);
        final String title = rentry.title();

        // prefer hit with 'prefer' pattern
        if (this.snippetProcess.query.prefer.matcher(urlstring).matches()) {
            r += 256L << this.snippetProcess.query.ranking.coeff_prefer;
        }
        if (this.snippetProcess.query.prefer.matcher(title).matches()) {
            r += 256L << this.snippetProcess.query.ranking.coeff_prefer;
        }

        // apply 'common-sense' heuristic using references
        final String[] urlcomps = MultiProtocolURI.urlComps(urlstring);
        final String[] descrcomps = MultiProtocolURI.splitpattern.split(title.toLowerCase());
        for (final String urlcomp : urlcomps) {
            final int tc = topwords.get(urlcomp);
            if (tc > 0) {
                r += ((long) tc) << this.snippetProcess.query.ranking.coeff_urlcompintoplist;
            }
        }
        for (final String descrcomp : descrcomps) {
            final int tc = topwords.get(descrcomp);
            if (tc > 0) {
                r += ((long) tc) << this.snippetProcess.query.ranking.coeff_descrcompintoplist;
            }
        }

        // apply query-in-result matching
        final HandleSet urlcomph = Word.words2hashesHandles(urlcomps);
        final HandleSet descrcomph = Word.words2hashesHandles(descrcomps);
        final Iterator<byte[]> shi = this.snippetProcess.query.getQueryGoal().getIncludeHashes().iterator();
        while (shi.hasNext()) {
            final byte[] queryhash = shi.next();
            if (urlcomph.has(queryhash)) {
                r += 256L << this.snippetProcess.query.ranking.coeff_appurl;
            }
            if (descrcomph.has(queryhash)) {
                r += 256L << this.snippetProcess.query.ranking.coeff_app_dc_title;
            }
        }

        return r;
    }

    /**
     * @return true if the query carries a constraint with the 'index of' category flag set
     */
    private boolean indexofConstraint() {
        return this.snippetProcess.query.constraint != null
            && this.snippetProcess.query.constraint.get(Condenser.flag_cat_indexof);
    }

    /**
     * Builds the result entry for a page and attaches a text snippet if possible.
     *
     * Order of attempts:
     * 1. a snippet already stored in the search event for the page's URL hash is used directly;
     * 2. without a cache strategy the snippet is constructed without a loader;
     * 3. for TEXT/ALL content domains the snippet is loaded according to the cache strategy;
     * 4. any other content domain yields a result without snippet.
     *
     * @param page the URL metadata of the result candidate
     * @param cacheStrategy the strategy for snippet loading, may be null
     * @return the result entry, or null if snippet loading failed and the entry must be sorted out
     */
    private ResultEntry fetchSnippet(final URIMetadataNode page, final CacheStrategy cacheStrategy) {
        // Snippet Fetching can has 3 modes:
        // 0 - do not fetch snippets
        // 1 - fetch snippets offline only
        // 2 - online snippet fetch

        // first choice: a snippet that was already delivered with the search result
        final String solrsnippet = this.snippetProcess.snippets.get(ASCII.String(page.hash()));
        if (solrsnippet != null && solrsnippet.length() > 0) {
            final TextSnippet snippet = new TextSnippet(page.hash(), solrsnippet, true, ResultClass.SOURCE_CACHE, "");
            return new ResultEntry(page, this.snippetProcess.query.getSegment(), this.snippetProcess.peers, snippet, null, 0);
        }

        if (cacheStrategy == null) {
            final TextSnippet snippet = new TextSnippet(
                    null,
                    page,
                    this.snippetProcess.snippetFetchWordHashes,
                    null,
                    indexofConstraint(),
                    SearchEvent.SNIPPET_MAX_LENGTH,
                    !this.snippetProcess.query.isLocal());
            return new ResultEntry(page, this.snippetProcess.query.getSegment(), this.snippetProcess.peers, snippet, null, 0); // result without snippet
        }

        final Classification.ContentDomain contentDomain = page.url().getContentDomain();
        if (contentDomain != Classification.ContentDomain.TEXT && contentDomain != Classification.ContentDomain.ALL) {
            // no text snippet is loaded for other content domains
            return new ResultEntry(page, this.snippetProcess.query.getSegment(), this.snippetProcess.peers, null, null, 0); // result without snippet
        }

        // load snippet and attach text snippet
        final long startTime = System.currentTimeMillis();
        // NOTE(review): the length limit here is the literal 180 while the loader-less branch
        // above uses SearchEvent.SNIPPET_MAX_LENGTH - confirm whether the difference is intended
        final TextSnippet snippet = new TextSnippet(
                this.snippetProcess.loader,
                page,
                this.snippetProcess.snippetFetchWordHashes,
                cacheStrategy,
                indexofConstraint(),
                180,
                !this.snippetProcess.query.isLocal());
        final long snippetComputationTime = System.currentTimeMillis() - startTime;
        final boolean failed = snippet.getErrorCode().fail();
        SearchEvent.log.logInfo("text snippet load time for " + page.url() + ": " + snippetComputationTime + ", " + (!failed ? "snippet found" : ("no snippet found (" + snippet.getError() + ")")));

        if (!failed) {
            // we loaded the file and found the snippet
            return new ResultEntry(page, this.snippetProcess.query.getSegment(), this.snippetProcess.peers, snippet, null, snippetComputationTime); // result with snippet attached
        }
        if (cacheStrategy.mustBeOffline()) {
            // we did not demand online loading, therefore a failure does not mean that the missing snippet causes a rejection of this result
            // this may happen during a remote search, because snippet loading is omitted to retrieve results faster
            return new ResultEntry(page, this.snippetProcess.query.getSegment(), this.snippetProcess.peers, null, null, snippetComputationTime); // result without snippet
        }

        // problems with snippet fetch
        if (this.snippetProcess.snippetFetchWordHashes.has(Segment.catchallHash)) {
            // we accept that because the word cannot be on the page
            return new ResultEntry(page, this.snippetProcess.query.getSegment(), this.snippetProcess.peers, null, null, 0);
        }
        final String reason = "no text snippet; errorCode = " + snippet.getErrorCode();
        if (this.snippetProcess.deleteIfSnippetFail) {
            this.snippetProcess.workTables.failURLsRegisterMissingWord(this.snippetProcess.query.getSegment().termIndex(), page.url(), this.snippetProcess.query.getQueryGoal().getIncludeHashes(), reason);
        }
        SearchEvent.log.logInfo("sorted out url " + page.url().toNormalform(true) + " during search: " + reason);
        return null;
    }
}