yacy_search_server/source/de/anomic/plasma/plasmaSearchProcessing.java
orbiter daf0f74361 joined anomic.net.URL, plasmaURL and url hash computation:
search profiling showed that a major amount of time is wasted by computing url hashes. The computation does an intranet check, which needs a DNS lookup. This caused each url hash computation to take 100-200 milliseconds, which delayed remote searches by at least one second more than necessary. The solution to this problem is to attach a URL hash to the URL data structure, so that the hash value can be filled in after the URL is retrieved from the database. This redesign of the url/urlhash management required a major redesign of many parts of the software. Some parts that had already been scheduled for removal were dropped during this change to avoid unnecessary maintenance of unused code.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4074 6c8d7289-2bf4-0310-a012-ef5d649a1542
2007-09-05 09:01:35 +00:00
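
A minimal sketch of the caching idea described in the commit message (class and member names are hypothetical; this is not the actual yacy URL class): the hash is stored as a field, computed lazily on first access, and can be set directly when the URL is loaded from the database, so the expensive intranet check / DNS lookup is skipped entirely.

// Hypothetical sketch of a URL object with an attached, lazily computed hash.
public class HashedURL {
    private final String urlString;
    private String hash; // cached url hash, null until known

    public HashedURL(String urlString) {
        this.urlString = urlString;
    }

    // used when the URL is read from the database together with its stored
    // hash, so the hash never has to be recomputed
    public HashedURL(String urlString, String knownHash) {
        this.urlString = urlString;
        this.hash = knownHash;
    }

    public String hash() {
        if (hash == null) hash = computeHash(urlString); // expensive path
        return hash;
    }

    private static String computeHash(String url) {
        // stand-in for the real hash computation (which does a DNS lookup)
        return Integer.toHexString(url.hashCode());
    }
}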


// plasmaSearchProcessing.java
// (C) 2005 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 17.10.2005 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.plasma;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import de.anomic.index.indexContainer;
import de.anomic.index.indexRWIEntry;
import de.anomic.server.serverByteBuffer;
/**
 * This class provides search processes and keeps a timing record of them.
 * It shall be used to initiate a search and to evaluate the actually
 * obtained timings after a search has been performed.
 */
public class plasmaSearchProcessing implements Cloneable {
// collection:
// time = time to get a RWI out of RAM cache, assortments and WORDS files
// count = maximum number of RWI-entries that shall be collected
// join
// time = time to perform the join between all collected RWIs
// count = maximum number of entries that shall be joined
// presort:
// time = time to do a sort of the joined URL-records
// count = maximum number of entries that shall be pre-sorted
// urlfetch:
// time = time to fetch the real URLs from the LURL database
// count = maximum number of urls that shall be fetched
// postsort:
// time = time for final sort of URLs
// count = maximum number of URLs that shall be retrieved during sort
// snippetfetch:
// time = time to fetch snippets for selected URLs
// count = maximum number of snippets to be fetched
public static final String COLLECTION = "collection";
public static final String JOIN = "join";
public static final String PRESORT = "presort";
public static final String URLFETCH = "urlfetch";
private static final long minimumTargetTime = 100;
private long targetTime;
private int targetCount;
private ArrayList yield;
private long timer;
private plasmaSearchProcessing() {
targetTime = minimumTargetTime;
targetCount = 10;
yield = new ArrayList();
timer = 0;
}
public plasmaSearchProcessing(long time, int count) {
this();
this.targetTime = time;
this.targetCount = count;
}
public static class Entry {
public String process;
public int count;
public long time;
public Entry(String process, int count, long time) {
this.process = process;
this.count = count;
this.time = time;
}
}
public int getTargetCount() {
return this.targetCount;
}
public long getTargetTime() {
return this.targetTime;
}
public void startTimer() {
this.timer = System.currentTimeMillis();
}
public void yield(String s, int count) {
long t = System.currentTimeMillis() - this.timer;
Entry e = new Entry(s, count, t);
yield.add(e);
}
public Iterator events() {
// iterates over the recorded Entry-type objects
return yield.iterator();
}
public int size() {
// returns number of events / Entry-Objects in yield array
return yield.size();
}
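// Usage sketch of the timing API above (hypothetical caller code, for
// illustration only):
//   plasmaSearchProcessing profile = new plasmaSearchProcessing(3000, 10);
//   profile.startTimer();
//   ... perform the collection phase ...
//   profile.yield(plasmaSearchProcessing.COLLECTION, collectedCount);
//   profile.startTimer();
//   ... perform the join phase ...
//   profile.yield(plasmaSearchProcessing.JOIN, joinedCount);
// afterwards, events() iterates the recorded Entry objects in order.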
// collection
public Map[] localSearchContainers(
plasmaSearchQuery query,
plasmaWordIndex wordIndex,
Set urlselection) {
// search for the set of hashes and return a map of wordhash:indexContainer containing the search result
// retrieve entities that belong to the hashes
startTimer();
Map inclusionContainers = (query.queryHashes.size() == 0) ? new HashMap() : wordIndex.getContainers(
query.queryHashes,
urlselection,
true,
true);
if ((inclusionContainers.size() != 0) && (inclusionContainers.size() < query.queryHashes.size())) inclusionContainers = new HashMap(); // if any search word is unknown, the conjunction is empty: do not return a partial subset
Map exclusionContainers = ((inclusionContainers == null) || (inclusionContainers.size() == 0)) ? new HashMap() : wordIndex.getContainers(
query.excludeHashes,
urlselection,
true,
true);
yield(plasmaSearchProcessing.COLLECTION, inclusionContainers.size());
return new Map[]{inclusionContainers, exclusionContainers};
}
// join
public indexContainer localSearchJoinExclude(
Collection includeContainers,
Collection excludeContainers,
int maxDistance) {
// join a search result and return the joincount (number of pages after join)
// since this is a conjunction we return an empty entity if any word is not known
if (includeContainers == null) return plasmaWordIndex.emptyContainer(null, 0);
// join the result
startTimer();
indexContainer rcLocal = indexContainer.joinContainers(includeContainers, maxDistance);
if (rcLocal != null) {
indexContainer.excludeContainers(rcLocal, excludeContainers);
}
if (rcLocal == null) rcLocal = plasmaWordIndex.emptyContainer(null, 0);
yield(plasmaSearchProcessing.JOIN, rcLocal.size());
return rcLocal;
}
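// Pipeline sketch (hypothetical caller, variable names are illustrative):
// the two phases above are typically chained; containers[0] holds the
// inclusion containers and containers[1] the exclusion containers.
//   Map[] containers = processing.localSearchContainers(query, wordIndex, null);
//   indexContainer joined = processing.localSearchJoinExclude(
//           containers[0].values(), containers[1].values(), maxDistance);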
public static final serverByteBuffer compressIndex(indexContainer inputContainer, indexContainer excludeContainer, long maxtime) {
// collect references according to domains
long timeout = (maxtime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxtime;
TreeMap doms = new TreeMap();
synchronized (inputContainer) {
Iterator i = inputContainer.entries();
indexRWIEntry iEntry;
String dom, paths;
while (i.hasNext()) {
iEntry = (indexRWIEntry) i.next();
if ((excludeContainer != null) && (excludeContainer.get(iEntry.urlHash()) != null)) continue; // do not include urls that are in excludeContainer
dom = iEntry.urlHash().substring(6);
if ((paths = (String) doms.get(dom)) == null) {
doms.put(dom, iEntry.urlHash().substring(0, 6));
} else {
doms.put(dom, paths + iEntry.urlHash().substring(0, 6));
}
if (System.currentTimeMillis() > timeout)
break;
}
}
// construct a result string
serverByteBuffer bb = new serverByteBuffer(inputContainer.size() * 6);
bb.append('{');
Iterator i = doms.entrySet().iterator();
Map.Entry entry;
while (i.hasNext()) {
entry = (Map.Entry) i.next();
bb.append((String) entry.getKey());
bb.append(':');
bb.append((String) entry.getValue());
if (System.currentTimeMillis() > timeout)
break;
if (i.hasNext())
bb.append(',');
}
bb.append('}');
return bb;
}
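// Format sketch (hash values below are hypothetical): url hashes are 12
// characters long, <path:6 chars><domain:6 chars>. compressIndex groups the
// path prefixes by domain suffix, so the hashes "AAAAAAdom001",
// "BBBBBBdom001" and "CCCCCCdom002" compress to
// "{dom001:AAAAAABBBBBB,dom002:CCCCCC}". decompressIndex below parses this
// string back and records, for each reconstructed url hash, which peer
// delivered it.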
public static final void decompressIndex(TreeMap target, serverByteBuffer ci, String peerhash) {
// target is a mapping from url-hashes to a string of peer-hashes
if ((ci.byteAt(0) == '{') && (ci.byteAt(ci.length() - 1) == '}')) {
//System.out.println("DEBUG-DECOMPRESS: input is " + ci.toString());
ci = ci.trim(1, ci.length() - 2);
String dom, url, peers;
while ((ci.length() >= 13) && (ci.byteAt(6) == ':')) { // parse one <domain-hash>:<path-prefixes> group per iteration
assert ci.length() >= 6 : "ci.length() = " + ci.length();
dom = ci.toString(0, 6);
ci.trim(7);
while ((ci.length() > 0) && (ci.byteAt(0) != ',')) { // consume 6-character path prefixes until the next group starts
assert ci.length() >= 6 : "ci.length() = " + ci.length();
url = ci.toString(0, 6) + dom;
ci.trim(6);
peers = (String) target.get(url);
if (peers == null) {
target.put(url, peerhash);
} else {
target.put(url, peers + peerhash);
}
//System.out.println("DEBUG-DECOMPRESS: " + url + ":" + target.get(url));
}
if (ci.byteAt(0) == ',') ci.trim(1); // skip the separator between groups
}
}
}
}