Michael Peter Christen d6b82840f8 added a feature to find similarities in documents.
This uses an enhanced version of the Nutch/Solr TextProfileSignature.
As a result, a signature of the document is written to the solr search
index. Additionally, each time a signature is written, it is
checked whether the signature already exists in the index. If the signature
does not exist, the document is marked as unique. The unique attribute
can now be used to sort document lists and bring duplicates to the end
of a result list.
To enable this, a large portion of the search api to Solr had to be
changed. This affected mainly caching of 'exists' searches to enhance
the check for existing signatures and do this without actually doing a
solr query.
Because this is the first time a long number is used as a value in the Solr
store, the value naming in the YaCySchema also had to be adapted and
normalized. As a result, many files had to be changed.
2012-11-21 18:46:49 +01:00

264 lines
14 KiB

* SnippetWorker
* Copyright 2012 by Michael Peter Christen, mc@yacy.net, Frankfurt a. M., Germany
* First released 01.11.2012 at http://yacy.net
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
package net.yacy.search.query;
import java.util.Iterator;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.analysis.Classification;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.sorting.ConcurrentScoreMap;
import net.yacy.cora.sorting.ScoreMap;
import net.yacy.cora.sorting.WeakPriorityBlockingQueue.ReverseElement;
import net.yacy.cora.storage.HandleSet;
import net.yacy.document.Condenser;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.MemoryControl;
import net.yacy.search.index.Segment;
import net.yacy.search.snippet.ResultEntry;
import net.yacy.search.snippet.TextSnippet;
import net.yacy.search.snippet.TextSnippet.ResultClass;
/**
 * Worker thread of a search event: its run() loop takes ranked URLs from the
 * search event, fetches a snippet for each of them, applies a post-ranking and
 * places the resulting entries into the search event's result queue.
 *
 * The worker is controlled from outside its own thread through pleaseStop()
 * and busytime(); the two fields those methods touch are therefore volatile.
 */
public class SnippetWorker extends Thread {

    /** the search event this worker takes URLs from and delivers results to */
    private final SearchEvent snippetProcess;

    /** the date (epoch milliseconds) until this thread should try to work */
    private final long timeout;

    /**
     * when the last time the run()-loop was executed (epoch milliseconds).
     * Written by the worker thread and read through busytime() by other
     * threads; volatile guarantees visibility and an atomic 64-bit access.
     */
    private volatile long lastLifeSign;

    /** controls snippet loading; null is handled as a separate case by fetchSnippet */
    private final CacheStrategy cacheStrategy;

    /** number of available results that the run()-loop compares the result queue size against */
    private final int neededResults;

    /**
     * loop guard of run(); cleared by pleaseStop() from another thread, so it
     * must be volatile for the stop request to become visible to the worker.
     */
    private volatile boolean shallrun;
protected SnippetWorker(final SearchEvent snippetProcess, final long maxlifetime, final CacheStrategy cacheStrategy, final int neededResults) {
this.snippetProcess = snippetProcess;
this.cacheStrategy = cacheStrategy;
this.lastLifeSign = System.currentTimeMillis();
this.timeout = System.currentTimeMillis() + Math.max(1000, maxlifetime);
this.neededResults = neededResults;
this.shallrun = true;
/**
 * Worker loop: while the shallrun flag is set and the timeout deadline has not
 * passed, takes the next URL from the search event, fetches a snippet for it,
 * computes its ranking (word ranking plus postRanking) and puts the entry into
 * the search event's result queue. Any exception thrown inside the loop is
 * logged and ends the loop.
 *
 * NOTE(review): in this view of the file the closing braces and apparently some
 * short statements inside the guard ifs (presumably break/continue) are not
 * visible - confirm the exact control flow against the complete source.
 */
public void run() {
// start fetching urls and snippets
URIMetadataNode page;
ResultEntry resultEntry;
try {
while (this.shallrun && System.currentTimeMillis() < this.timeout) {
// record activity; busytime() measures the time elapsed since this stamp
this.lastLifeSign = System.currentTimeMillis();
// memory guard: a warning is logged when MemoryControl reports a short status
// NOTE(review): the statement that reacts to this condition is not visible here
if (MemoryControl.shortStatus()) {
Log.logWarning("SnippetProcess", "shortStatus");
// check if we have enough; we stop only if we can fetch online; otherwise its better to run this to get better navigation
if ((this.cacheStrategy == null || this.cacheStrategy.isAllowedToFetchOnline()) && this.snippetProcess.result.sizeAvailable() >= this.neededResults) {
Log.logWarning("SnippetProcess", this.snippetProcess.result.sizeAvailable() + " = result.sizeAvailable() >= this.neededResults = " + this.neededResults);
// check if we can succeed if we try to take another url
if (this.snippetProcess.rankingProcess.feedingIsFinished() && this.snippetProcess.rankingProcess.rwiQueueSize() == 0 && this.snippetProcess.nodeStack.sizeAvailable() == 0) {
Log.logWarning("SnippetProcess", "rankingProcess.feedingIsFinished() && rankingProcess.sizeQueue() == 0");
// get next entry
// the second argument is the remaining lifetime clamped to the range 20..500
// (presumably a wait time in milliseconds - confirm against takeURL)
page = this.snippetProcess.takeURL(true, Math.min(500, Math.max(20, this.timeout - System.currentTimeMillis())));
//if (page != null) Log.logInfo("SnippetProcess", "got one page: " + page.metadata().url().toNormalform(true, false));
//if (page == null) page = rankedCache.takeURL(false, this.timeout - System.currentTimeMillis());
if (page == null) {
//Log.logWarning("SnippetProcess", "page == null");
break; // no more available
this.setName(page.url().toNormalform(true)); // to support debugging
// pages registered as failed URLs are detected here when the query asks for that filter
// NOTE(review): the statement executed for such pages is not visible here (presumably a skip)
if (this.snippetProcess.query.filterfailurls && this.snippetProcess.workTables.failURLsContains(page.hash())) {
resultEntry = fetchSnippet(page, this.cacheStrategy); // does not fetch snippets if snippetMode == 0
if (resultEntry == null) {
continue; // the entry had some problems, cannot be used
//if (result.contains(resultEntry)) continue;
// accumulate the snippet computation time of this entry in the search event
this.snippetProcess.snippetComputationAllTime += resultEntry.snippetComputationTime;
// place the result to the result vector
// apply post-ranking
// base ranking: cardinal of the entry's word according to the ranking order, or 0 if the entry has no word
long ranking = resultEntry.word() == null ? 0 : Long.valueOf(this.snippetProcess.rankingProcess.order.cardinal(resultEntry.word()));
// postRanking receives a newly created, empty score map as topwords
ranking += postRanking(resultEntry, new ConcurrentScoreMap<String>() /*this.snippetProcess.rankingProcess.getTopicNavigator(10)*/);
resultEntry.ranking = ranking;
this.snippetProcess.result.put(new ReverseElement<ResultEntry>(resultEntry, ranking)); // remove smallest in case of overflow
// report when the deadline has been reached
if (System.currentTimeMillis() >= this.timeout) {
Log.logWarning("SnippetProcess", "worker ended with timeout");
//System.out.println("FINISHED WORKER " + id + " FOR " + this.neededResults + " RESULTS, loops = " + loops);
} catch (final Exception e) { Log.logException(e); }
//Log.logInfo("SEARCH", "resultWorker thread " + this.id + " terminated");
protected void pleaseStop() {
this.shallrun = false;
* calculate the time since the worker has had the latest activity
* @return time in milliseconds lasted since latest activity
protected long busytime() {
return System.currentTimeMillis() - this.lastLifeSign;
private long postRanking(
final ResultEntry rentry,
final ScoreMap<String> topwords) {
long r = 0;
// for media search: prefer pages with many links
r += rentry.limage() << this.snippetProcess.query.ranking.coeff_cathasimage;
r += rentry.laudio() << this.snippetProcess.query.ranking.coeff_cathasaudio;
r += rentry.lvideo() << this.snippetProcess.query.ranking.coeff_cathasvideo;
r += rentry.lapp() << this.snippetProcess.query.ranking.coeff_cathasapp;
// apply citation count
//System.out.println("POSTRANKING CITATION: references = " + rentry.referencesCount() + ", inbound = " + rentry.llocal() + ", outbound = " + rentry.lother());
r += (128 * rentry.referencesCount() / (1 + 2 * rentry.llocal() + rentry.lother())) << this.snippetProcess.query.ranking.coeff_citation;
// prefer hit with 'prefer' pattern
if (this.snippetProcess.query.prefer.matcher(rentry.url().toNormalform(true)).matches()) {
r += 256 << this.snippetProcess.query.ranking.coeff_prefer;
if (this.snippetProcess.query.prefer.matcher(rentry.title()).matches()) {
r += 256 << this.snippetProcess.query.ranking.coeff_prefer;
// apply 'common-sense' heuristic using references
final String urlstring = rentry.url().toNormalform(true);
final String[] urlcomps = MultiProtocolURI.urlComps(urlstring);
final String[] descrcomps = MultiProtocolURI.splitpattern.split(rentry.title().toLowerCase());
int tc;
for (final String urlcomp : urlcomps) {
tc = topwords.get(urlcomp);
if (tc > 0) {
r += Math.max(1, tc) << this.snippetProcess.query.ranking.coeff_urlcompintoplist;
for (final String descrcomp : descrcomps) {
tc = topwords.get(descrcomp);
if (tc > 0) {
r += Math.max(1, tc) << this.snippetProcess.query.ranking.coeff_descrcompintoplist;
// apply query-in-result matching
final HandleSet urlcomph = Word.words2hashesHandles(urlcomps);
final HandleSet descrcomph = Word.words2hashesHandles(descrcomps);
final Iterator<byte[]> shi = this.snippetProcess.query.getQueryGoal().getIncludeHashes().iterator();
byte[] queryhash;
while (shi.hasNext()) {
queryhash = shi.next();
if (urlcomph.has(queryhash)) {
r += 256 << this.snippetProcess.query.ranking.coeff_appurl;
if (descrcomph.has(queryhash)) {
r += 256 << this.snippetProcess.query.ranking.coeff_app_dc_title;
return r;
/**
 * Builds the ResultEntry for a page and attaches a text snippet where possible.
 * Steps as far as visible here:
 * 1. if a non-empty snippet is already stored in the search event's snippet map
 *    under the page hash, it is wrapped into a TextSnippet and returned;
 * 2. if cacheStrategy is null, a TextSnippet is constructed and the entry is
 *    returned with a computation time of 0;
 * 3. for pages of content domain TEXT or ALL a TextSnippet is computed and timed;
 *    on success the entry is returned with the snippet, otherwise the entry is
 *    returned without snippet (offline-only strategy, or catch-all word hash)
 *    or the page is sorted out;
 * 4. all other pages get an entry without snippet.
 *
 * NOTE(review): in this view of the file the argument lists of the two
 * multi-line TextSnippet constructor calls and all closing braces are
 * incomplete - confirm against the complete source.
 *
 * @param page the metadata of the page to create a result entry for
 * @param cacheStrategy the cache strategy for snippet loading; may be null
 * @return the result entry, or null if the snippet fetch failed and the page was sorted out
 */
private ResultEntry fetchSnippet(final URIMetadataNode page, final CacheStrategy cacheStrategy) {
// Snippet fetching has 3 modes:
// 0 - do not fetch snippets
// 1 - fetch snippets offline only
// 2 - online snippet fetch
// load only urls if there was not yet a root url of that hash
// find the url entry
// look for a snippet that is already stored for this page hash
// (presumably delivered together with the Solr search result - confirm)
String solrsnippet = this.snippetProcess.snippets.get(ASCII.String(page.hash()));
if (solrsnippet != null && solrsnippet.length() > 0) {
final TextSnippet snippet = new TextSnippet(page.hash(), solrsnippet, true, ResultClass.SOURCE_CACHE, "");
return new ResultEntry(page, this.snippetProcess.query.getSegment(), this.snippetProcess.peers, snippet, null, 0);
// no cache strategy given
if (cacheStrategy == null) {
// NOTE(review): only one argument of this constructor call is visible here
final TextSnippet snippet = new TextSnippet(
((this.snippetProcess.query.constraint != null) && (this.snippetProcess.query.constraint.get(Condenser.flag_cat_indexof))),
return new ResultEntry(page, this.snippetProcess.query.getSegment(), this.snippetProcess.peers, snippet, null, 0); // result without snippet
// load snippet
if (page.url().getContentDomain() == Classification.ContentDomain.TEXT || page.url().getContentDomain() == Classification.ContentDomain.ALL) {
// attach text snippet
// measure how long the snippet computation takes
long startTime = System.currentTimeMillis();
// NOTE(review): only one argument of this constructor call is visible here
final TextSnippet snippet = new TextSnippet(
((this.snippetProcess.query.constraint != null) && (this.snippetProcess.query.constraint.get(Condenser.flag_cat_indexof))),
final long snippetComputationTime = System.currentTimeMillis() - startTime;
SearchEvent.log.logInfo("text snippet load time for " + page.url() + ": " + snippetComputationTime + ", " + (!snippet.getErrorCode().fail() ? "snippet found" : ("no snippet found (" + snippet.getError() + ")")));
if (!snippet.getErrorCode().fail()) {
// we loaded the file and found the snippet
return new ResultEntry(page, this.snippetProcess.query.getSegment(), this.snippetProcess.peers, snippet, null, snippetComputationTime); // result with snippet attached
} else if (cacheStrategy.mustBeOffline()) {
// we did not demand online loading, therefore a failure does not mean that the missing snippet causes a rejection of this result
// this may happen during a remote search, because snippet loading is omitted to retrieve results faster
return new ResultEntry(page, this.snippetProcess.query.getSegment(), this.snippetProcess.peers, null, null, snippetComputationTime); // result without snippet
} else {
// problems with snippet fetch
if (this.snippetProcess.snippetFetchWordHashes.has(Segment.catchallHash)) {
// we accept that because the word cannot be on the page
return new ResultEntry(page, this.snippetProcess.query.getSegment(), this.snippetProcess.peers, null, null, 0);
// the page is rejected: build the reason text used for registration and logging
final String reason = "no text snippet; errorCode = " + snippet.getErrorCode();
// if configured, register the url as failed for the query's include hashes
if (this.snippetProcess.deleteIfSnippetFail) {
this.snippetProcess.workTables.failURLsRegisterMissingWord(this.snippetProcess.query.getSegment().termIndex(), page.url(), this.snippetProcess.query.getQueryGoal().getIncludeHashes(), reason);
SearchEvent.log.logInfo("sorted out url " + page.url().toNormalform(true) + " during search: " + reason);
return null;
// content domains other than TEXT and ALL: no snippet is computed
return new ResultEntry(page, this.snippetProcess.query.getSegment(), this.snippetProcess.peers, null, null, 0); // result without snippet