// plasmaSearchProcessing.java
// (C) 2005 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 17.10.2005 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

package de.anomic.plasma;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;

import de.anomic.index.indexContainer;
import de.anomic.index.indexRWIEntry;
import de.anomic.server.serverByteBuffer;

/**
 * This class provides search processes and keeps a timing record of them.
 * It shall be used to initiate a search and also to evaluate the timings
 * that were actually obtained after a search has been performed.
 */
public class plasmaSearchProcessing implements Cloneable {

    // collection:
    // time  = time to get a RWI out of RAM cache, assortments and WORDS files
    // count = maximum number of RWI-entries that shall be collected
    // join:
    // time  = time to perform the join between all collected RWIs
    // count = maximum number of entries that shall be joined
    // presort:
    // time  = time to do a sort of the joined URL-records
    // count = maximum number of entries that shall be pre-sorted
    // urlfetch:
    // time  = time to fetch the real URLs from the LURL database
    // count = maximum number of urls that shall be fetched
    // postsort:
    // time  = time for final sort of URLs
    // count = maximum number of URLs that shall be retrieved during sort
    // snippetfetch:
    // time  = time to fetch snippets for selected URLs
    // count = maximum number of snippets to be fetched

    public static final String COLLECTION = "collection";
    public static final String JOIN       = "join";
    public static final String PRESORT    = "presort";
    public static final String URLFETCH   = "urlfetch";

    private static final long minimumTargetTime = 100;

    private long targetTime;
    private int targetCount;
    private ArrayList yield;
    private long timer;

    private plasmaSearchProcessing() {
        targetTime = minimumTargetTime;
        targetCount = 10;
        yield = new ArrayList();
        timer = 0;
    }

    public plasmaSearchProcessing(long time, int count) {
        this();
        this.targetTime = time;
        this.targetCount = count;
    }

    public static class Entry {
        public String process;
        public int count;
        public long time;

        public Entry(String process, int count, long time) {
            this.process = process;
            this.count = count;
            this.time = time;
        }
    }

    public int getTargetCount() {
        return this.targetCount;
    }

    public long getTargetTime() {
        return this.targetTime;
    }

    public void startTimer() {
        this.timer = System.currentTimeMillis();
    }

    public void yield(String s, int count) {
        long t = System.currentTimeMillis() - this.timer;
        Entry e = new Entry(s, count, t);
        yield.add(e);
    }
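    // Illustrative usage sketch, not part of the original class: a caller wraps
    // each search phase with startTimer()/yield() and later inspects the recorded
    // Entry objects via events(). The phase work and the counts (42, 17) are
    // invented placeholders; in YaCy the real phases are the container
    // collection and join steps implemented below.
    public static void timingExample() {
        plasmaSearchProcessing profiling = new plasmaSearchProcessing(3000, 100);
        profiling.startTimer();
        // ... the collection phase would run here ...
        profiling.yield(plasmaSearchProcessing.COLLECTION, 42); // 42 = hypothetical RWI count
        profiling.startTimer();
        // ... the join phase would run here ...
        profiling.yield(plasmaSearchProcessing.JOIN, 17); // 17 = hypothetical join count
        // print the timing record in phase order
        Iterator i = profiling.events();
        while (i.hasNext()) {
            Entry e = (Entry) i.next();
            System.out.println(e.process + ": " + e.count + " entries after " + e.time + " ms");
        }
    }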
    public Iterator events() {
        // iterates over Entry-type objects
        return yield.iterator();
    }

    public int size() {
        // returns the number of events / Entry objects in the yield array
        return yield.size();
    }

    public Map[] localSearchContainers(
            plasmaSearchQuery query,
            plasmaWordIndex wordIndex,
            Set urlselection) {
        // search for the set of hashes and return a map of wordhash:indexContainer containing the search result

        // retrieve entities that belong to the hashes
        startTimer();
        Map inclusionContainers = (query.queryHashes.size() == 0) ?
            new HashMap() :
            wordIndex.getContainers(
                query.queryHashes,
                urlselection,
                true,
                true);
        if ((inclusionContainers.size() != 0) && (inclusionContainers.size() < query.queryHashes.size()))
            inclusionContainers = new HashMap(); // prevent that only a subset is returned
        Map exclusionContainers = ((inclusionContainers == null) || (inclusionContainers.size() == 0)) ?
            new HashMap() :
            wordIndex.getContainers(
                query.excludeHashes,
                urlselection,
                true,
                true);
        yield(plasmaSearchProcessing.COLLECTION, inclusionContainers.size());
        return new Map[]{inclusionContainers, exclusionContainers};
    }

    public indexContainer localSearchJoinExclude(
            Collection includeContainers,
            Collection excludeContainers,
            int maxDistance) {
        // join a search result and return the joincount (number of pages after join)

        // since this is a conjunction we return an empty entity if any word is not known
        if (includeContainers == null) return plasmaWordIndex.emptyContainer(null, 0);

        // join the result
        startTimer();
        indexContainer rcLocal = indexContainer.joinContainers(includeContainers, maxDistance);
        if (rcLocal != null) {
            indexContainer.excludeContainers(rcLocal, excludeContainers);
        }
        if (rcLocal == null) rcLocal = plasmaWordIndex.emptyContainer(null, 0);
        yield(plasmaSearchProcessing.JOIN, rcLocal.size());
        return rcLocal;
    }
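    // Illustrative sketch, not from the original source: how the two local search
    // steps above are meant to be chained. The query, wordIndex and maxDistance
    // arguments are assumed to be supplied by the caller; passing null as the
    // urlselection means no URL pre-selection is applied.
    public indexContainer localSearchExample(plasmaSearchQuery query, plasmaWordIndex wordIndex, int maxDistance) {
        // step 1: collect one indexContainer per query word hash (and per exclude hash)
        Map[] containers = localSearchContainers(query, wordIndex, null);
        // step 2: join the inclusion containers and subtract the exclusion containers
        return localSearchJoinExclude(
                containers[0].values(), // inclusion containers
                containers[1].values(), // exclusion containers
                maxDistance);
    }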
    public static final serverByteBuffer compressIndex(indexContainer inputContainer, indexContainer excludeContainer, long maxtime) {
        // collect references according to domains
        long timeout = (maxtime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxtime;
        TreeMap doms = new TreeMap();
        synchronized (inputContainer) {
            Iterator i = inputContainer.entries();
            indexRWIEntry iEntry;
            String dom, paths;
            while (i.hasNext()) {
                iEntry = (indexRWIEntry) i.next();
                if ((excludeContainer != null) && (excludeContainer.get(iEntry.urlHash()) != null)) continue; // do not include urls that are in excludeContainer
                dom = iEntry.urlHash().substring(6);
                if ((paths = (String) doms.get(dom)) == null) {
                    doms.put(dom, iEntry.urlHash().substring(0, 6));
                } else {
                    doms.put(dom, paths + iEntry.urlHash().substring(0, 6));
                }
                if (System.currentTimeMillis() > timeout) break;
            }
        }
        // construct a result string
        serverByteBuffer bb = new serverByteBuffer(inputContainer.size() * 6);
        bb.append('{');
        Iterator i = doms.entrySet().iterator();
        Map.Entry entry;
        while (i.hasNext()) {
            entry = (Map.Entry) i.next();
            bb.append((String) entry.getKey());
            bb.append(':');
            bb.append((String) entry.getValue());
            if (System.currentTimeMillis() > timeout) break;
            if (i.hasNext()) bb.append(',');
        }
        bb.append('}');
        return bb;
    }

    public static final void decompressIndex(TreeMap target, serverByteBuffer ci, String peerhash) {
        // target is a mapping from url-hashes to a string of peer-hashes
        if ((ci.byteAt(0) == '{') && (ci.byteAt(ci.length() - 1) == '}')) {
            //System.out.println("DEBUG-DECOMPRESS: input is " + ci.toString());
            ci = ci.trim(1, ci.length() - 2);
            String dom, url, peers;
            while ((ci.length() >= 13) && (ci.byteAt(6) == ':')) {
                assert ci.length() >= 6 : "ci.length() = " + ci.length();
                dom = ci.toString(0, 6);
                ci.trim(7);
                while ((ci.length() > 0) && (ci.byteAt(0) != ',')) {
                    assert ci.length() >= 6 : "ci.length() = " + ci.length();
                    url = ci.toString(0, 6) + dom;
                    ci.trim(6);
                    peers = (String) target.get(url);
                    if (peers == null) {
                        target.put(url, peerhash);
                    } else {
                        target.put(url, peers + peerhash);
                    }
                    //System.out.println("DEBUG-DECOMPRESS: " + url + ":" + target.get(url));
                }
                if (ci.byteAt(0) == ',') ci.trim(1);
            }
        }
    }
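    // Illustrative round-trip sketch, not part of the original class. As the code
    // above shows, compressIndex produces a buffer of the form
    // {domhash:pathpath...,domhash:...}: each url hash is split into its
    // 6-character path prefix (substring(0, 6)) and its 6-character domain suffix
    // (substring(6)), and paths sharing a domain are concatenated. decompressIndex
    // reverses this and records, per reassembled url hash, which peers reported it.
    // The container argument and the peer hash "peer01" are assumed placeholders.
    public static void compressionExample(indexContainer someContainer) {
        serverByteBuffer compressed = compressIndex(someContainer, null, 1000); // no exclusions, 1 second time budget
        TreeMap urlToPeers = new TreeMap(); // url-hash -> concatenated peer-hashes
        decompressIndex(urlToPeers, compressed, "peer01");
        System.out.println("decompressed " + urlToPeers.size() + " url references");
    }
}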