// plasmaSearchProcessing.java
// (C) 2005 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 17.10.2005 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.plasma;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;

import de.anomic.index.indexContainer;
import de.anomic.index.indexRWIEntry;
import de.anomic.server.serverByteBuffer;

/**
 * This class provides search processes and keeps a timing record of them.
 * It shall be used to initiate a search and also to evaluate
 * the timings actually obtained after a search has been performed.
 */
public class plasmaSearchProcessing implements Cloneable {
    // collection:
    //   time  = time to get a RWI out of RAM cache, assortments and WORDS files
    //   count = maximum number of RWI-entries that shall be collected
    // join:
    //   time  = time to perform the join between all collected RWIs
    //   count = maximum number of entries that shall be joined
    // presort:
    //   time  = time to do a sort of the joined URL-records
    //   count = maximum number of entries that shall be pre-sorted
    // urlfetch:
    //   time  = time to fetch the real URLs from the LURL database
    //   count = maximum number of URLs that shall be fetched
    // postsort:
    //   time  = time for the final sort of URLs
    //   count = maximum number of URLs that shall be retrieved during the sort
    // snippetfetch:
    //   time  = time to fetch snippets for the selected URLs
    //   count = maximum number of snippets to be fetched
    public static final String COLLECTION = "collection";
    public static final String JOIN = "join";
    public static final String PRESORT = "presort";
    public static final String URLFETCH = "urlfetch";

    private static final long minimumTargetTime = 100;

    private long targetTime;
    private int targetCount;
    private ArrayList yield;
    private long timer;
    private plasmaSearchProcessing() {
        targetTime = minimumTargetTime;
        targetCount = 10;
        yield = new ArrayList();
        timer = 0;
    }

    public plasmaSearchProcessing(long time, int count) {
        this();
        this.targetTime = time;
        this.targetCount = count;
    }
    public static class Entry {
        public String process;
        public int count;
        public long time;

        public Entry(String process, int count, long time) {
            this.process = process;
            this.count = count;
            this.time = time;
        }
    }
    public int getTargetCount() {
        return this.targetCount;
    }

    public long getTargetTime() {
        return this.targetTime;
    }

    public void startTimer() {
        this.timer = System.currentTimeMillis();
    }

    public void yield(String s, int count) {
        long t = System.currentTimeMillis() - this.timer;
        Entry e = new Entry(s, count, t);
        yield.add(e);
    }
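
    // A usage sketch (illustrative only; the profile values and counts below
    // are invented): each processing phase is timed by calling startTimer()
    // before the work and yield(<phase constant>, <result count>) after it.
    //
    //     plasmaSearchProcessing profile = new plasmaSearchProcessing(4000, 100);
    //     profile.startTimer();
    //     // ... collect RWI entries ...
    //     profile.yield(plasmaSearchProcessing.COLLECTION, collectedCount);
    //     profile.startTimer();
    //     // ... join the collected containers ...
    //     profile.yield(plasmaSearchProcessing.JOIN, joined.size());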
    public Iterator events() {
        // iterates over Entry-type objects
        return yield.iterator();
    }

    public int size() {
        // returns the number of events / Entry objects in the yield array
        return yield.size();
    }
    public Map[] localSearchContainers(
            plasmaSearchQuery query,
            plasmaWordIndex wordIndex,
            Set urlselection) {
        // search for the set of hashes and return a map of wordhash:indexContainer pairs containing the search result
        // retrieve entities that belong to the hashes
        startTimer();
        Map inclusionContainers = (query.queryHashes.size() == 0) ? new HashMap() : wordIndex.getContainers(
                query.queryHashes,
                urlselection,
                true,
                true);
        if (inclusionContainers == null) inclusionContainers = new HashMap(); // guard against a null result before calling size()
        if ((inclusionContainers.size() != 0) && (inclusionContainers.size() < query.queryHashes.size())) inclusionContainers = new HashMap(); // prevent that only a subset is returned: a conjunction with an unknown word must yield an empty result
        Map exclusionContainers = (inclusionContainers.size() == 0) ? new HashMap() : wordIndex.getContainers(
                query.excludeHashes,
                urlselection,
                true,
                true);
        yield(plasmaSearchProcessing.COLLECTION, inclusionContainers.size());
        return new Map[]{inclusionContainers, exclusionContainers};
    }
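
    // The array returned by localSearchContainers always has two elements:
    // index 0 holds the containers of the included (required) words, index 1
    // those of the excluded words. A call could look like this (variable
    // names are illustrative):
    //
    //     Map[] containers = profile.localSearchContainers(query, wordIndex, urlselection);
    //     Map inclusion = containers[0]; // wordhash -> indexContainer
    //     Map exclusion = containers[1]; // wordhash -> indexContainer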
    public indexContainer localSearchJoinExclude(
            Collection includeContainers,
            Collection excludeContainers,
            int maxDistance) {
        // join the include-containers and remove the exclude-containers;
        // the size of the returned container is the join count (number of pages after the join).
        // since this is a conjunction, we return an empty entity if any word is not known
        if (includeContainers == null) return plasmaWordIndex.emptyContainer(null, 0);
        // join the result
        startTimer();
        indexContainer rcLocal = indexContainer.joinContainers(includeContainers, maxDistance);
        if (rcLocal != null) {
            indexContainer.excludeContainers(rcLocal, excludeContainers);
        }
        if (rcLocal == null) rcLocal = plasmaWordIndex.emptyContainer(null, 0);
        yield(plasmaSearchProcessing.JOIN, rcLocal.size());
        return rcLocal;
    }
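
    // The compressed index is a string of the form
    //     {<dom1>:<path1><path2>...,<dom2>:<path1>...}
    // where each 12-character url-hash is split into its first 6 characters
    // (the "path" part) and its last 6 characters (the "domain" part), so all
    // url-hashes sharing a domain are grouped behind one domain key. For
    // example, with invented hashes, the url-hashes "AAAAAAdddddd" and
    // "BBBBBBdddddd" compress to "{dddddd:AAAAAABBBBBB}".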
    public static final serverByteBuffer compressIndex(indexContainer inputContainer, indexContainer excludeContainer, long maxtime) {
        // collect references according to domains
        long timeout = (maxtime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxtime;
        TreeMap doms = new TreeMap();
        synchronized (inputContainer) {
            Iterator i = inputContainer.entries();
            indexRWIEntry iEntry;
            String dom, paths;
            while (i.hasNext()) {
                iEntry = (indexRWIEntry) i.next();
                if ((excludeContainer != null) && (excludeContainer.get(iEntry.urlHash()) != null)) continue; // do not include urls that are in excludeContainer
                dom = iEntry.urlHash().substring(6);
                if ((paths = (String) doms.get(dom)) == null) {
                    doms.put(dom, iEntry.urlHash().substring(0, 6));
                } else {
                    doms.put(dom, paths + iEntry.urlHash().substring(0, 6));
                }
                if (System.currentTimeMillis() > timeout)
                    break;
            }
        }
        // construct a result string
        serverByteBuffer bb = new serverByteBuffer(inputContainer.size() * 6);
        bb.append('{');
        Iterator i = doms.entrySet().iterator();
        Map.Entry entry;
        while (i.hasNext()) {
            entry = (Map.Entry) i.next();
            bb.append((String) entry.getKey());
            bb.append(':');
            bb.append((String) entry.getValue());
            if (System.currentTimeMillis() > timeout)
                break;
            if (i.hasNext())
                bb.append(',');
        }
        bb.append('}');
        return bb;
    }
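
    // Decompression reverses the scheme above, rebuilding each url-hash as
    // <path> + <dom> and merging it into a shared map. A round trip could
    // look like this (names are illustrative):
    //
    //     serverByteBuffer ci = compressIndex(container, null, 1000);
    //     TreeMap urlToPeers = new TreeMap();
    //     decompressIndex(urlToPeers, ci, remotePeerHash);
    //     // urlToPeers now maps each url-hash to a string of concatenated
    //     // peer-hashes of the peers that reported it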
    public static final void decompressIndex(TreeMap target, serverByteBuffer ci, String peerhash) {
        // target is a mapping from url-hashes to a string of peer-hashes
        if ((ci.length() >= 2) && (ci.byteAt(0) == '{') && (ci.byteAt(ci.length() - 1) == '}')) {
            //System.out.println("DEBUG-DECOMPRESS: input is " + ci.toString());
            ci = ci.trim(1, ci.length() - 2);
            String dom, url, peers;
            while ((ci.length() >= 13) && (ci.byteAt(6) == ':')) {
                assert ci.length() >= 6 : "ci.length() = " + ci.length();
                dom = ci.toString(0, 6);
                ci.trim(7);
                while ((ci.length() > 0) && (ci.byteAt(0) != ',')) {
                    assert ci.length() >= 6 : "ci.length() = " + ci.length();
                    url = ci.toString(0, 6) + dom;
                    ci.trim(6);
                    peers = (String) target.get(url);
                    if (peers == null) {
                        target.put(url, peerhash);
                    } else {
                        target.put(url, peers + peerhash);
                    }
                    //System.out.println("DEBUG-DECOMPRESS: " + url + ":" + target.get(url));
                }
                if ((ci.length() > 0) && (ci.byteAt(0) == ',')) ci.trim(1); // skip the domain separator; guard against an exhausted buffer
            }
        }
    }
}