yacy_search_server/source/net/yacy/search/query/SnippetWorker.java

/**
 *  SnippetWorker
 *  Copyright 2012 by Michael Peter Christen, mc@yacy.net, Frankfurt a. M., Germany
 *  First released 01.11.2012 at http://yacy.net
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program in the file lgpl21.txt
 *  If not, see <http://www.gnu.org/licenses/>.
 */

package net.yacy.search.query;

import java.util.Iterator;

import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.Classification;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.sorting.ConcurrentScoreMap;
import net.yacy.cora.sorting.ScoreMap;
import net.yacy.cora.sorting.WeakPriorityBlockingQueue.ReverseElement;
import net.yacy.cora.storage.HandleSet;
import net.yacy.document.Condenser;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.MemoryControl;
import net.yacy.search.index.Segment;
import net.yacy.search.snippet.ResultEntry;
import net.yacy.search.snippet.TextSnippet;
import net.yacy.search.snippet.TextSnippet.ResultClass;

/**
 * Worker thread of a {@link SearchEvent}: it repeatedly takes the next url entry from
 * the search event, attaches a text snippet to it (see {@link #fetchSnippet}), computes
 * a post-ranking value (see {@link #postRanking}) and puts the resulting
 * {@link ResultEntry} into the result list of the search event.
 *
 * The worker ends when it is asked to stop ({@link #pleaseStop()}), when its life time
 * is over, when memory is short, when enough results are available or when no more
 * url entries can be taken.
 *
 * {@link #pleaseStop()} and {@link #busytime()} are meant to be called from other
 * threads than the worker itself; the fields they access are therefore volatile.
 */
public class SnippetWorker extends Thread {
    private final SearchEvent snippetProcess;
    private final long timeout; // the date until this thread should try to work
    private volatile long lastLifeSign; // when the last time the run()-loop was executed; read by other threads in busytime()
    private final CacheStrategy cacheStrategy;
    private final int neededResults;
    private volatile boolean shallrun; // set to false by other threads in pleaseStop()

    /**
     * @param snippetProcess the search event this worker takes url entries from and delivers results to
     * @param maxlifetime maximum life time of this worker in milliseconds; values below 1000 are raised to 1000
     * @param cacheStrategy the strategy used for snippet loading; if null, no snippet loader is used
     * @param neededResults the number of available results at which the worker may stop
     */
    protected SnippetWorker(final SearchEvent snippetProcess, final long maxlifetime, final CacheStrategy cacheStrategy, final int neededResults) {
        final long now = System.currentTimeMillis();
        this.snippetProcess = snippetProcess;
        this.cacheStrategy = cacheStrategy;
        this.lastLifeSign = now;
        this.timeout = now + Math.max(1000, maxlifetime);
        this.neededResults = neededResults;
        this.shallrun = true;
    }

    /**
     * Main loop: fetch url entries and snippets until one of the termination
     * conditions is met. Any exception ends the worker and is logged.
     */
    @Override
    public void run() {

        // start fetching urls and snippets
        URIMetadataNode page;
        ResultEntry resultEntry;
        try {
            while (this.shallrun && System.currentTimeMillis() < this.timeout) {
                this.lastLifeSign = System.currentTimeMillis();

                if (MemoryControl.shortStatus()) {
                    Log.logWarning("SnippetProcess", "shortStatus");
                    break;
                }

                // check if we have enough; we stop only if we can fetch online; otherwise it is better to run this to get better navigation
                if ((this.cacheStrategy == null || this.cacheStrategy.isAllowedToFetchOnline()) && this.snippetProcess.result.sizeAvailable() >= this.neededResults) {
                    Log.logWarning("SnippetProcess", this.snippetProcess.result.sizeAvailable() + " = result.sizeAvailable() >= this.neededResults = " + this.neededResults);
                    break;
                }

                // check if we can succeed if we try to take another url
                if (this.snippetProcess.rankingProcess.feedingIsFinished() && this.snippetProcess.rankingProcess.rwiQueueSize() == 0 && this.snippetProcess.nodeStack.sizeAvailable() == 0) {
                    Log.logWarning("SnippetProcess", "rankingProcess.feedingIsFinished() && rankingProcess.sizeQueue() == 0");
                    break;
                }

                // get next entry; wait at least 20 and at most 500 milliseconds for it
                page = this.snippetProcess.takeURL(true, Math.min(500, Math.max(20, this.timeout - System.currentTimeMillis())));
                if (page == null) {
                    break; // no more available
                }

                this.setName(page.url().toNormalform(true)); // to support debugging
                if (this.snippetProcess.query.filterfailurls && this.snippetProcess.workTables.failURLsContains(page.hash())) {
                    continue; // this url is registered as failed, skip it
                }

                resultEntry = fetchSnippet(page, this.cacheStrategy);
                if (resultEntry == null) {
                    continue; // the entry had some problems, cannot be used
                }

                // NOTE(review): this is an unsynchronized update of a field of the search event;
                // if several workers run on the same event, updates may get lost - confirm
                this.snippetProcess.snippetComputationAllTime += resultEntry.snippetComputationTime;

                // place the result to the result vector
                // apply post-ranking
                long ranking = resultEntry.word() == null ? 0 : this.snippetProcess.rankingProcess.order.cardinal(resultEntry.word());
                // the topic navigator is not consulted here; a fresh, empty score map is passed as top-word list
                ranking += postRanking(resultEntry, new ConcurrentScoreMap<String>());
                resultEntry.ranking = ranking;
                this.snippetProcess.result.put(new ReverseElement<ResultEntry>(resultEntry, ranking)); // remove smallest in case of overflow
                this.snippetProcess.rankingProcess.addTopics(resultEntry);
            }
            if (System.currentTimeMillis() >= this.timeout) {
                Log.logWarning("SnippetProcess", "worker ended with timeout");
            }
        } catch (final Exception e) { Log.logException(e); }
    }

    /**
     * Ask the worker to end; the request is recognized at the beginning of the next
     * iteration of the run()-loop.
     */
    protected void pleaseStop() {
        this.shallrun = false;
    }

    /**
     * calculate the time since the worker has had the latest activity
     * @return time in milliseconds lasted since latest activity
     */
    protected long busytime() {
        return System.currentTimeMillis() - this.lastLifeSign;
    }

    /**
     * Compute a ranking bonus for a result entry from its link counters, its citation
     * count, the 'prefer' pattern of the query, the given top-word list and the
     * occurrence of query words in url and title. Every contribution is shifted left
     * by the matching coefficient of the ranking profile of the query.
     * All shifts are computed with long values to avoid an int overflow.
     *
     * @param rentry the result entry to be ranked
     * @param topwords scores of top words; components of url and title with a positive score increase the ranking
     * @return the value that shall be added to the ranking of the entry
     */
    private long postRanking(
            final ResultEntry rentry,
            final ScoreMap<String> topwords) {

        long r = 0;

        // for media search: prefer pages with many links
        r += ((long) rentry.limage()) << this.snippetProcess.query.ranking.coeff_cathasimage;
        r += ((long) rentry.laudio()) << this.snippetProcess.query.ranking.coeff_cathasaudio;
        r += ((long) rentry.lvideo()) << this.snippetProcess.query.ranking.coeff_cathasvideo;
        r += ((long) rentry.lapp())   << this.snippetProcess.query.ranking.coeff_cathasapp;

        // apply citation count: references weighted against the number of local (counted twice) and other links
        r += (128L * rentry.referencesCount() / (1L + 2L * rentry.llocal() + rentry.lother())) << this.snippetProcess.query.ranking.coeff_citation;

        // prefer hit with 'prefer' pattern
        final String urlstring = rentry.url().toNormalform(true);
        final String title = rentry.title();
        if (this.snippetProcess.query.prefer.matcher(urlstring).matches()) {
            r += 256L << this.snippetProcess.query.ranking.coeff_prefer;
        }
        if (this.snippetProcess.query.prefer.matcher(title).matches()) {
            r += 256L << this.snippetProcess.query.ranking.coeff_prefer;
        }

        // apply 'common-sense' heuristic using references
        final String[] urlcomps = MultiProtocolURI.urlComps(urlstring);
        final String[] descrcomps = MultiProtocolURI.splitpattern.split(title.toLowerCase());
        int tc;
        for (final String urlcomp : urlcomps) {
            tc = topwords.get(urlcomp);
            if (tc > 0) {
                r += ((long) tc) << this.snippetProcess.query.ranking.coeff_urlcompintoplist;
            }
        }
        for (final String descrcomp : descrcomps) {
            tc = topwords.get(descrcomp);
            if (tc > 0) {
                r += ((long) tc) << this.snippetProcess.query.ranking.coeff_descrcompintoplist;
            }
        }

        // apply query-in-result matching
        final HandleSet urlcomph = Word.words2hashesHandles(urlcomps);
        final HandleSet descrcomph = Word.words2hashesHandles(descrcomps);
        final Iterator<byte[]> shi = this.snippetProcess.query.getQueryGoal().getIncludeHashes().iterator();
        byte[] queryhash;
        while (shi.hasNext()) {
            queryhash = shi.next();
            if (urlcomph.has(queryhash)) {
                r += 256L << this.snippetProcess.query.ranking.coeff_appurl;
            }
            if (descrcomph.has(queryhash)) {
                r += 256L << this.snippetProcess.query.ranking.coeff_app_dc_title;
            }
        }

        return r;
    }

    /**
     * Create a result entry for the given url entry and attach a text snippet if possible.
     *
     * @param page the url entry for which a result shall be created
     * @param cacheStrategy the strategy used for snippet loading; if null, the snippet is created without a loader
     * @return the result entry with or without a snippet, or null if the snippet
     *         could not be computed and the entry is therefore sorted out
     */
    private ResultEntry fetchSnippet(final URIMetadataNode page, final CacheStrategy cacheStrategy) {
        // Snippet fetching has 3 modes:
        // 0 - do not fetch snippets
        // 1 - fetch snippets offline only
        // 2 - online snippet fetch

        // first choice: a snippet that is already stored in the search event for this url hash
        final String solrsnippet = this.snippetProcess.snippets.get(ASCII.String(page.hash()));
        if (solrsnippet != null && solrsnippet.length() > 0) {
            final TextSnippet snippet = new TextSnippet(page.hash(), solrsnippet, true, ResultClass.SOURCE_CACHE, "");
            return new ResultEntry(page, this.snippetProcess.query.getSegment(), this.snippetProcess.peers, snippet, null, 0);
        }

        if (cacheStrategy == null) {
            // no cache strategy given: compute the snippet without loader and without cache strategy
            final TextSnippet snippet = new TextSnippet(
                    null,
                    page,
                    this.snippetProcess.snippetFetchWordHashes,
                    null,
                    ((this.snippetProcess.query.constraint != null) && (this.snippetProcess.query.constraint.get(Condenser.flag_cat_indexof))),
                    SearchEvent.SNIPPET_MAX_LENGTH,
                    !this.snippetProcess.query.isLocal());
            return new ResultEntry(page, this.snippetProcess.query.getSegment(), this.snippetProcess.peers, snippet, null, 0);
        }

        // load snippet; only for urls of the content domains TEXT and ALL
        final Classification.ContentDomain contentDomain = page.url().getContentDomain();
        if (contentDomain == Classification.ContentDomain.TEXT || contentDomain == Classification.ContentDomain.ALL) {
            // attach text snippet
            final long startTime = System.currentTimeMillis();
            // NOTE(review): the length 180 differs from SearchEvent.SNIPPET_MAX_LENGTH which is used above - confirm that this is intended
            final TextSnippet snippet = new TextSnippet(
                    this.snippetProcess.loader,
                    page,
                    this.snippetProcess.snippetFetchWordHashes,
                    cacheStrategy,
                    ((this.snippetProcess.query.constraint != null) && (this.snippetProcess.query.constraint.get(Condenser.flag_cat_indexof))),
                    180,
                    !this.snippetProcess.query.isLocal());
            final long snippetComputationTime = System.currentTimeMillis() - startTime;
            SearchEvent.log.logInfo("text snippet load time for " + page.url() + ": " + snippetComputationTime + ", " + (!snippet.getErrorCode().fail() ? "snippet found" : ("no snippet found (" + snippet.getError() + ")")));

            if (!snippet.getErrorCode().fail()) {
                // we loaded the file and found the snippet
                return new ResultEntry(page, this.snippetProcess.query.getSegment(), this.snippetProcess.peers, snippet, null, snippetComputationTime); // result with snippet attached
            } else if (cacheStrategy.mustBeOffline()) {
                // we did not demand online loading, therefore a failure does not mean that the missing snippet causes a rejection of this result
                // this may happen during a remote search, because snippet loading is omitted to retrieve results faster
                return new ResultEntry(page, this.snippetProcess.query.getSegment(), this.snippetProcess.peers, null, null, snippetComputationTime); // result without snippet
            } else {
                // problems with snippet fetch
                if (this.snippetProcess.snippetFetchWordHashes.has(Segment.catchallHash)) {
                    // we accept that because the word cannot be on the page
                    return new ResultEntry(page, this.snippetProcess.query.getSegment(), this.snippetProcess.peers, null, null, 0);
                }
                final String reason = "no text snippet; errorCode = " + snippet.getErrorCode();
                if (this.snippetProcess.deleteIfSnippetFail) {
                    this.snippetProcess.workTables.failURLsRegisterMissingWord(this.snippetProcess.query.getSegment().termIndex(), page.url(), this.snippetProcess.query.getQueryGoal().getIncludeHashes(), reason);
                }
                SearchEvent.log.logInfo("sorted out url " + page.url().toNormalform(true) + " during search: " + reason);
                return null;
            }
        }
        return new ResultEntry(page, this.snippetProcess.query.getSegment(), this.snippetProcess.peers, null, null, 0); // result without snippet
    }
}