mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-21 00:00:13 +02:00
d6b82840f8
This uses an enhanced version of the Nutch/Solr TextProfileSignature. As a result, a signature of the document is written to the Solr search index. Additionally, each time a signature is written, it is checked whether the signature already exists in the index. If the signature does not exist, the document is marked as unique. The unique attribute can now be used to sort document lists and bring duplicates to the end of a result list. To enable this, a large portion of the search API to Solr had to be changed. This mainly affected the caching of 'exists' searches, to enhance the check for existing signatures and to do this without actually doing a Solr query. Because this is the first time a long number is used as a value in the Solr store, the value naming in the YaCySchema also had to be adapted and normalized. As a consequence, many files had to be changed.
264 lines
14 KiB
Java
264 lines
14 KiB
Java
/**
|
|
* SnippetWorker
|
|
* Copyright 2012 by Michael Peter Christen, mc@yacy.net, Frankfurt a. M., Germany
|
|
* First released 01.11.2012 at http://yacy.net
|
|
*
|
|
* This library is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU Lesser General Public
|
|
* License as published by the Free Software Foundation; either
|
|
* version 2.1 of the License, or (at your option) any later version.
|
|
*
|
|
* This library is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
* Lesser General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU Lesser General Public License
|
|
* along with this program in the file lgpl21.txt
|
|
* If not, see <http://www.gnu.org/licenses/>.
|
|
*/
|
|
|
|
package net.yacy.search.query;
|
|
|
|
import java.util.Iterator;
|
|
|
|
import net.yacy.cora.document.ASCII;
|
|
import net.yacy.cora.document.MultiProtocolURI;
|
|
import net.yacy.cora.document.analysis.Classification;
|
|
import net.yacy.cora.federate.yacy.CacheStrategy;
|
|
import net.yacy.cora.sorting.ConcurrentScoreMap;
|
|
import net.yacy.cora.sorting.ScoreMap;
|
|
import net.yacy.cora.sorting.WeakPriorityBlockingQueue.ReverseElement;
|
|
import net.yacy.cora.storage.HandleSet;
|
|
import net.yacy.document.Condenser;
|
|
import net.yacy.kelondro.data.meta.URIMetadataNode;
|
|
import net.yacy.kelondro.data.word.Word;
|
|
import net.yacy.kelondro.logging.Log;
|
|
import net.yacy.kelondro.util.MemoryControl;
|
|
import net.yacy.search.index.Segment;
|
|
import net.yacy.search.snippet.ResultEntry;
|
|
import net.yacy.search.snippet.TextSnippet;
|
|
import net.yacy.search.snippet.TextSnippet.ResultClass;
|
|
|
|
public class SnippetWorker extends Thread {
|
|
private final SearchEvent snippetProcess;
|
|
private final long timeout; // the date until this thread should try to work
|
|
private long lastLifeSign; // when the last time the run()-loop was executed
|
|
private final CacheStrategy cacheStrategy;
|
|
private final int neededResults;
|
|
private boolean shallrun;
|
|
|
|
protected SnippetWorker(final SearchEvent snippetProcess, final long maxlifetime, final CacheStrategy cacheStrategy, final int neededResults) {
|
|
this.snippetProcess = snippetProcess;
|
|
this.cacheStrategy = cacheStrategy;
|
|
this.lastLifeSign = System.currentTimeMillis();
|
|
this.timeout = System.currentTimeMillis() + Math.max(1000, maxlifetime);
|
|
this.neededResults = neededResults;
|
|
this.shallrun = true;
|
|
}
|
|
|
|
@Override
|
|
public void run() {
|
|
|
|
// start fetching urls and snippets
|
|
URIMetadataNode page;
|
|
ResultEntry resultEntry;
|
|
try {
|
|
while (this.shallrun && System.currentTimeMillis() < this.timeout) {
|
|
this.lastLifeSign = System.currentTimeMillis();
|
|
|
|
if (MemoryControl.shortStatus()) {
|
|
Log.logWarning("SnippetProcess", "shortStatus");
|
|
break;
|
|
}
|
|
|
|
// check if we have enough; we stop only if we can fetch online; otherwise its better to run this to get better navigation
|
|
if ((this.cacheStrategy == null || this.cacheStrategy.isAllowedToFetchOnline()) && this.snippetProcess.result.sizeAvailable() >= this.neededResults) {
|
|
Log.logWarning("SnippetProcess", this.snippetProcess.result.sizeAvailable() + " = result.sizeAvailable() >= this.neededResults = " + this.neededResults);
|
|
break;
|
|
}
|
|
|
|
// check if we can succeed if we try to take another url
|
|
if (this.snippetProcess.rankingProcess.feedingIsFinished() && this.snippetProcess.rankingProcess.rwiQueueSize() == 0 && this.snippetProcess.nodeStack.sizeAvailable() == 0) {
|
|
Log.logWarning("SnippetProcess", "rankingProcess.feedingIsFinished() && rankingProcess.sizeQueue() == 0");
|
|
break;
|
|
}
|
|
|
|
// get next entry
|
|
page = this.snippetProcess.takeURL(true, Math.min(500, Math.max(20, this.timeout - System.currentTimeMillis())));
|
|
//if (page != null) Log.logInfo("SnippetProcess", "got one page: " + page.metadata().url().toNormalform(true, false));
|
|
//if (page == null) page = rankedCache.takeURL(false, this.timeout - System.currentTimeMillis());
|
|
if (page == null) {
|
|
//Log.logWarning("SnippetProcess", "page == null");
|
|
break; // no more available
|
|
}
|
|
|
|
this.setName(page.url().toNormalform(true)); // to support debugging
|
|
if (this.snippetProcess.query.filterfailurls && this.snippetProcess.workTables.failURLsContains(page.hash())) {
|
|
continue;
|
|
}
|
|
|
|
resultEntry = fetchSnippet(page, this.cacheStrategy); // does not fetch snippets if snippetMode == 0
|
|
if (resultEntry == null) {
|
|
continue; // the entry had some problems, cannot be used
|
|
}
|
|
|
|
//if (result.contains(resultEntry)) continue;
|
|
this.snippetProcess.snippetComputationAllTime += resultEntry.snippetComputationTime;
|
|
|
|
// place the result to the result vector
|
|
// apply post-ranking
|
|
long ranking = resultEntry.word() == null ? 0 : Long.valueOf(this.snippetProcess.rankingProcess.order.cardinal(resultEntry.word()));
|
|
ranking += postRanking(resultEntry, new ConcurrentScoreMap<String>() /*this.snippetProcess.rankingProcess.getTopicNavigator(10)*/);
|
|
resultEntry.ranking = ranking;
|
|
this.snippetProcess.result.put(new ReverseElement<ResultEntry>(resultEntry, ranking)); // remove smallest in case of overflow
|
|
this.snippetProcess.rankingProcess.addTopics(resultEntry);
|
|
}
|
|
if (System.currentTimeMillis() >= this.timeout) {
|
|
Log.logWarning("SnippetProcess", "worker ended with timeout");
|
|
}
|
|
//System.out.println("FINISHED WORKER " + id + " FOR " + this.neededResults + " RESULTS, loops = " + loops);
|
|
} catch (final Exception e) { Log.logException(e); }
|
|
//Log.logInfo("SEARCH", "resultWorker thread " + this.id + " terminated");
|
|
}
|
|
|
|
protected void pleaseStop() {
|
|
this.shallrun = false;
|
|
}
|
|
|
|
/**
|
|
* calculate the time since the worker has had the latest activity
|
|
* @return time in milliseconds lasted since latest activity
|
|
*/
|
|
protected long busytime() {
|
|
return System.currentTimeMillis() - this.lastLifeSign;
|
|
}
|
|
|
|
private long postRanking(
|
|
final ResultEntry rentry,
|
|
final ScoreMap<String> topwords) {
|
|
|
|
long r = 0;
|
|
|
|
// for media search: prefer pages with many links
|
|
r += rentry.limage() << this.snippetProcess.query.ranking.coeff_cathasimage;
|
|
r += rentry.laudio() << this.snippetProcess.query.ranking.coeff_cathasaudio;
|
|
r += rentry.lvideo() << this.snippetProcess.query.ranking.coeff_cathasvideo;
|
|
r += rentry.lapp() << this.snippetProcess.query.ranking.coeff_cathasapp;
|
|
|
|
// apply citation count
|
|
//System.out.println("POSTRANKING CITATION: references = " + rentry.referencesCount() + ", inbound = " + rentry.llocal() + ", outbound = " + rentry.lother());
|
|
r += (128 * rentry.referencesCount() / (1 + 2 * rentry.llocal() + rentry.lother())) << this.snippetProcess.query.ranking.coeff_citation;
|
|
|
|
// prefer hit with 'prefer' pattern
|
|
if (this.snippetProcess.query.prefer.matcher(rentry.url().toNormalform(true)).matches()) {
|
|
r += 256 << this.snippetProcess.query.ranking.coeff_prefer;
|
|
}
|
|
if (this.snippetProcess.query.prefer.matcher(rentry.title()).matches()) {
|
|
r += 256 << this.snippetProcess.query.ranking.coeff_prefer;
|
|
}
|
|
|
|
// apply 'common-sense' heuristic using references
|
|
final String urlstring = rentry.url().toNormalform(true);
|
|
final String[] urlcomps = MultiProtocolURI.urlComps(urlstring);
|
|
final String[] descrcomps = MultiProtocolURI.splitpattern.split(rentry.title().toLowerCase());
|
|
int tc;
|
|
for (final String urlcomp : urlcomps) {
|
|
tc = topwords.get(urlcomp);
|
|
if (tc > 0) {
|
|
r += Math.max(1, tc) << this.snippetProcess.query.ranking.coeff_urlcompintoplist;
|
|
}
|
|
}
|
|
for (final String descrcomp : descrcomps) {
|
|
tc = topwords.get(descrcomp);
|
|
if (tc > 0) {
|
|
r += Math.max(1, tc) << this.snippetProcess.query.ranking.coeff_descrcompintoplist;
|
|
}
|
|
}
|
|
|
|
// apply query-in-result matching
|
|
final HandleSet urlcomph = Word.words2hashesHandles(urlcomps);
|
|
final HandleSet descrcomph = Word.words2hashesHandles(descrcomps);
|
|
final Iterator<byte[]> shi = this.snippetProcess.query.getQueryGoal().getIncludeHashes().iterator();
|
|
byte[] queryhash;
|
|
while (shi.hasNext()) {
|
|
queryhash = shi.next();
|
|
if (urlcomph.has(queryhash)) {
|
|
r += 256 << this.snippetProcess.query.ranking.coeff_appurl;
|
|
}
|
|
if (descrcomph.has(queryhash)) {
|
|
r += 256 << this.snippetProcess.query.ranking.coeff_app_dc_title;
|
|
}
|
|
}
|
|
|
|
return r;
|
|
}
|
|
|
|
private ResultEntry fetchSnippet(final URIMetadataNode page, final CacheStrategy cacheStrategy) {
|
|
// Snippet Fetching can has 3 modes:
|
|
// 0 - do not fetch snippets
|
|
// 1 - fetch snippets offline only
|
|
// 2 - online snippet fetch
|
|
|
|
// load only urls if there was not yet a root url of that hash
|
|
// find the url entry
|
|
|
|
String solrsnippet = this.snippetProcess.snippets.get(ASCII.String(page.hash()));
|
|
if (solrsnippet != null && solrsnippet.length() > 0) {
|
|
final TextSnippet snippet = new TextSnippet(page.hash(), solrsnippet, true, ResultClass.SOURCE_CACHE, "");
|
|
return new ResultEntry(page, this.snippetProcess.query.getSegment(), this.snippetProcess.peers, snippet, null, 0);
|
|
}
|
|
|
|
if (cacheStrategy == null) {
|
|
final TextSnippet snippet = new TextSnippet(
|
|
null,
|
|
page,
|
|
this.snippetProcess.snippetFetchWordHashes,
|
|
//this.query.queryString,
|
|
null,
|
|
((this.snippetProcess.query.constraint != null) && (this.snippetProcess.query.constraint.get(Condenser.flag_cat_indexof))),
|
|
SearchEvent.SNIPPET_MAX_LENGTH,
|
|
!this.snippetProcess.query.isLocal());
|
|
return new ResultEntry(page, this.snippetProcess.query.getSegment(), this.snippetProcess.peers, snippet, null, 0); // result without snippet
|
|
}
|
|
|
|
// load snippet
|
|
if (page.url().getContentDomain() == Classification.ContentDomain.TEXT || page.url().getContentDomain() == Classification.ContentDomain.ALL) {
|
|
// attach text snippet
|
|
long startTime = System.currentTimeMillis();
|
|
final TextSnippet snippet = new TextSnippet(
|
|
this.snippetProcess.loader,
|
|
page,
|
|
this.snippetProcess.snippetFetchWordHashes,
|
|
cacheStrategy,
|
|
((this.snippetProcess.query.constraint != null) && (this.snippetProcess.query.constraint.get(Condenser.flag_cat_indexof))),
|
|
180,
|
|
!this.snippetProcess.query.isLocal());
|
|
final long snippetComputationTime = System.currentTimeMillis() - startTime;
|
|
SearchEvent.log.logInfo("text snippet load time for " + page.url() + ": " + snippetComputationTime + ", " + (!snippet.getErrorCode().fail() ? "snippet found" : ("no snippet found (" + snippet.getError() + ")")));
|
|
|
|
if (!snippet.getErrorCode().fail()) {
|
|
// we loaded the file and found the snippet
|
|
return new ResultEntry(page, this.snippetProcess.query.getSegment(), this.snippetProcess.peers, snippet, null, snippetComputationTime); // result with snippet attached
|
|
} else if (cacheStrategy.mustBeOffline()) {
|
|
// we did not demand online loading, therefore a failure does not mean that the missing snippet causes a rejection of this result
|
|
// this may happen during a remote search, because snippet loading is omitted to retrieve results faster
|
|
return new ResultEntry(page, this.snippetProcess.query.getSegment(), this.snippetProcess.peers, null, null, snippetComputationTime); // result without snippet
|
|
} else {
|
|
// problems with snippet fetch
|
|
if (this.snippetProcess.snippetFetchWordHashes.has(Segment.catchallHash)) {
|
|
// we accept that because the word cannot be on the page
|
|
return new ResultEntry(page, this.snippetProcess.query.getSegment(), this.snippetProcess.peers, null, null, 0);
|
|
}
|
|
final String reason = "no text snippet; errorCode = " + snippet.getErrorCode();
|
|
if (this.snippetProcess.deleteIfSnippetFail) {
|
|
this.snippetProcess.workTables.failURLsRegisterMissingWord(this.snippetProcess.query.getSegment().termIndex(), page.url(), this.snippetProcess.query.getQueryGoal().getIncludeHashes(), reason);
|
|
}
|
|
SearchEvent.log.logInfo("sorted out url " + page.url().toNormalform(true) + " during search: " + reason);
|
|
return null;
|
|
}
|
|
}
|
|
return new ResultEntry(page, this.snippetProcess.query.getSegment(), this.snippetProcess.peers, null, null, 0); // result without snippet
|
|
}
|
|
}
|