// File: yacy_search_server/source/de/anomic/plasma/plasmaSearchProcessing.java
// plasmaSearchProcessing.java
// (C) 2005 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 17.10.2005 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.plasma;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import de.anomic.index.indexContainer;
/**
 * This class provides the search process steps and keeps a timing record of
 * each step. It shall be used to initiate a search and also to evaluate the
 * timings actually obtained after a search has been performed.
 */
public class plasmaSearchProcessing implements Cloneable {

    // Every search process step is identified by a single character and has an
    // associated time budget (milliseconds) and count limit:
    // collection:
    //   time  = time to get a RWI out of RAM cache, assortments and WORDS files
    //   count = maximum number of RWI-entries that shall be collected
    // join:
    //   time  = time to perform the join between all collected RWIs
    //   count = maximum number of entries that shall be joined
    // presort:
    //   time  = time to do a sort of the joined URL-records
    //   count = maximum number of entries that shall be pre-sorted
    // urlfetch:
    //   time  = time to fetch the real URLs from the LURL database
    //   count = maximum number of urls that shall be fetched
    // postsort:
    //   time  = time for final sort of URLs
    //   count = maximum number of URLs that shall be retrieved during sort
    // snippetfetch:
    //   time  = time to fetch snippets for selected URLs
    //   count = maximum number of snippets to be fetched

    public static final char PROCESS_COLLECTION   = 'c';
    public static final char PROCESS_JOIN         = 'j';
    public static final char PROCESS_PRESORT      = 'r';
    public static final char PROCESS_URLFETCH     = 'u';
    public static final char PROCESS_POSTSORT     = 'o';
    public static final char PROCESS_FILTER       = 'f';
    public static final char PROCESS_SNIPPETFETCH = 's';

    // lower bound returned by getTargetTime() so that a process step never
    // receives a zero or negative time budget
    private static final long minimumTargetTime = 100;

    // execution order of the process steps; also defines the serialization
    // order used by toString(HashMap, HashMap) and intoMap(...)
    public static char[] sequence = new char[]{
        PROCESS_COLLECTION,
        PROCESS_JOIN,
        PROCESS_PRESORT,
        PROCESS_URLFETCH,
        PROCESS_POSTSORT,
        PROCESS_FILTER,
        PROCESS_SNIPPETFETCH
    };

    // all four maps are keyed by Character (the process id from 'sequence'):
    private HashMap targetTime;   // Character -> Long: planned milliseconds per step
    private HashMap targetCount;  // Character -> Integer: planned element limit per step
    private HashMap yieldTime;    // Character -> Long: milliseconds actually used per step
    private HashMap yieldCount;   // Character -> Integer: elements actually produced per step
    private long timer;           // start timestamp set by startTimer()

    // internal constructor: creates an empty profile; the public constructors
    // fill in the target values afterwards
    private plasmaSearchProcessing() {
        targetTime = new HashMap();
        targetCount = new HashMap();
        yieldTime = new HashMap();
        yieldCount = new HashMap();
        timer = 0;
    }

    /**
     * Creates a profile by distributing a total time budget and a base count
     * over the process steps using fixed weights; the time fractions sum up
     * to 12/12 of the given total time.
     *
     * @param time  total time budget in milliseconds for the whole search
     * @param count base count used to derive the per-step count limits
     */
    public plasmaSearchProcessing(long time, int count) {
        this(
            3 * time / 12, 10 * count,
            1 * time / 12, 10 * count,
            1 * time / 12, 10 * count,
            2 * time / 12, 5 * count,
            3 * time / 12, count,
            1 * time / 12, count,
            1 * time / 12, 1
        );
    }

    /**
     * Creates a profile with an explicit time budget and count limit for
     * every process step.
     */
    public plasmaSearchProcessing(
            long time_collection, int count_collection,
            long time_join, int count_join,
            long time_presort, int count_presort,
            long time_urlfetch, int count_urlfetch,
            long time_postsort, int count_postsort,
            long time_filter, int count_filter,
            long time_snippetfetch, int count_snippetfetch) {
        this();
        targetTime.put(new Character(PROCESS_COLLECTION), new Long(time_collection));
        targetTime.put(new Character(PROCESS_JOIN), new Long(time_join));
        targetTime.put(new Character(PROCESS_PRESORT), new Long(time_presort));
        targetTime.put(new Character(PROCESS_URLFETCH), new Long(time_urlfetch));
        targetTime.put(new Character(PROCESS_POSTSORT), new Long(time_postsort));
        targetTime.put(new Character(PROCESS_FILTER), new Long(time_filter));
        targetTime.put(new Character(PROCESS_SNIPPETFETCH), new Long(time_snippetfetch));
        targetCount.put(new Character(PROCESS_COLLECTION), new Integer(count_collection));
        targetCount.put(new Character(PROCESS_JOIN), new Integer(count_join));
        targetCount.put(new Character(PROCESS_PRESORT), new Integer(count_presort));
        targetCount.put(new Character(PROCESS_URLFETCH), new Integer(count_urlfetch));
        targetCount.put(new Character(PROCESS_POSTSORT), new Integer(count_postsort));
        targetCount.put(new Character(PROCESS_FILTER), new Integer(count_filter));
        targetCount.put(new Character(PROCESS_SNIPPETFETCH), new Integer(count_snippetfetch));
    }

    /**
     * Creates a profile from its serialized form (see targetToString()).
     */
    public plasmaSearchProcessing(String s) {
        this(); // bugfix: reuse the common initialization instead of duplicating it
        intoMap(s, targetTime, targetCount);
    }

    /**
     * Returns a copy of this profile; target and yield maps are cloned
     * shallowly (safe, since keys and values are immutable wrappers).
     */
    public Object clone() {
        plasmaSearchProcessing p = new plasmaSearchProcessing();
        p.targetTime = (HashMap) this.targetTime.clone();
        p.targetCount = (HashMap) this.targetCount.clone();
        p.yieldTime = (HashMap) this.yieldTime.clone();
        p.yieldCount = (HashMap) this.yieldCount.clone();
        p.timer = this.timer; // bugfix: the timer value was not copied before
        return p;
    }

    /**
     * Returns the old duetime value as the sum of all target waiting times.
     */
    public long duetime() {
        long d = 0;
        for (int i = 0; i < sequence.length; i++) {
            d += ((Long) targetTime.get(new Character(sequence[i]))).longValue();
        }
        return d;
    }

    /** Parses a serialized yield record and stores it in the yield maps. */
    public void putYield(String s) {
        intoMap(s, yieldTime, yieldCount);
    }

    /** Serializes the yield maps (see toString(HashMap, HashMap)). */
    public String yieldToString() {
        return toString(yieldTime, yieldCount);
    }

    /** Serializes the target maps (see toString(HashMap, HashMap)). */
    public String targetToString() {
        return toString(targetTime, targetCount);
    }

    /**
     * Computes the time budget still available for the given process step:
     * all target times up to and including the step are summed, and the
     * yield times of the steps already completed are subtracted.
     *
     * @return the remaining budget, never less than minimumTargetTime
     */
    public long getTargetTime(char type) {
        long sum = 0;
        Long t;
        Character element;
        for (int i = 0; i < sequence.length; i++) {
            element = new Character(sequence[i]);
            t = (Long) targetTime.get(element);
            if (t != null) sum += t.longValue();
            if (type == sequence[i]) return (sum < 0) ? minimumTargetTime : sum;
            t = (Long) yieldTime.get(element);
            if (t != null) sum -= t.longValue();
        }
        // unknown process id: fall back to the minimum budget
        return minimumTargetTime;
    }

    /** @return the target count for the step, or -1 if not set */
    public int getTargetCount(char type) {
        Integer i = (Integer) targetCount.get(new Character(type));
        if (i == null) return -1; else return i.intValue();
    }

    /** @return the measured time for the step, or -1 if not set */
    public long getYieldTime(char type) {
        Long l = (Long) yieldTime.get(new Character(type));
        if (l == null) return -1; else return l.longValue();
    }

    /** @return the measured count for the step, or -1 if not set */
    public int getYieldCount(char type) {
        Integer i = (Integer) yieldCount.get(new Character(type));
        if (i == null) return -1; else return i.intValue();
    }

    /** Starts the stopwatch used by setYieldTime(char). */
    public void startTimer() {
        this.timer = System.currentTimeMillis();
    }

    /** Records the elapsed time since startTimer() as yield of the step. */
    public void setYieldTime(char type) {
        long t = System.currentTimeMillis() - this.timer;
        yieldTime.put(new Character(type), new Long(t));
    }

    /** Records the number of elements produced by the step. */
    public void setYieldCount(char type, int count) {
        yieldCount.put(new Character(type), new Integer(count));
    }

    /** Human-readable summary of both target and yield values. */
    public String reportToString() {
        return "target=" + toString(targetTime, targetCount) + "; yield=" + toString(yieldTime, yieldCount);
    }

    /**
     * Serializes the given time/count maps into a compact format that can be
     * sent in an http header or post argument, i.e. without '=' or spaces.
     * Each step produces two records "t&lt;id&gt;&lt;millis&gt;|" and
     * "c&lt;id&gt;&lt;count&gt;|"; missing entries are written as 0.
     */
    public static String toString(HashMap time, HashMap count) {
        StringBuffer sb = new StringBuffer(sequence.length * 10);
        Character element;
        Integer xi;
        Long xl;
        for (int i = 0; i < sequence.length; i++) {
            element = new Character(sequence[i]);
            sb.append("t");
            sb.append(element);
            xl = (Long) time.get(element);
            sb.append((xl == null) ? "0" : xl.toString());
            sb.append("|");
            sb.append("c");
            sb.append(element);
            xi = (Integer) count.get(element);
            sb.append((xi == null) ? "0" : xi.toString());
            sb.append("|");
        }
        return sb.toString();
    }

    /**
     * Reverse of toString(HashMap, HashMap): parses a serialized profile and
     * fills the given time and count maps.
     * Bugfixes against the previous version: the parse position is now
     * advanced after every record (the old code never moved past the first
     * record and looped forever), and the maps are keyed by Character as
     * everywhere else in this class (the old code used String keys, so
     * parsed values could never be found again by the getters).
     */
    public static void intoMap(String s, HashMap time, HashMap count) {
        int p = 0;
        int p1;
        while ((p + 2 < s.length()) && ((p1 = s.indexOf('|', p)) > 0)) {
            char ct = s.charAt(p);                              // record type: 't' = time, 'c' = count
            Character element = new Character(s.charAt(p + 1)); // process id
            String v = s.substring(p + 2, p1);                  // numeric value
            if (ct == 't') {
                time.put(element, new Long(Long.parseLong(v)));
            } else {
                count.put(element, new Integer(Integer.parseInt(v)));
            }
            p = p1 + 1; // bugfix: continue after the '|' separator
        }
    }

    // the processes

    /**
     * Collection process: searches for the set of query hashes and returns a
     * two-element array {inclusionContainers, exclusionContainers}, each a
     * map of wordhash:indexContainer with the search result. The time budget
     * for the collection step is split proportionally between inclusion and
     * exclusion hashes; exclusion retrieval is skipped when no time remains.
     */
    public Map[] localSearchContainers(
            plasmaSearchQuery query,
            plasmaWordIndex wordIndex,
            Set urlselection) {
        startTimer();
        long start = System.currentTimeMillis();
        Map inclusionContainers = (query.queryHashes.size() == 0) ? new HashMap() : wordIndex.getContainers(
            query.queryHashes,
            urlselection,
            true,
            true,
            getTargetTime(plasmaSearchProcessing.PROCESS_COLLECTION) * query.queryHashes.size() / (query.queryHashes.size() + query.excludeHashes.size()));
        // prevent that only a subset is returned: this is a conjunction, so a
        // partial result would be wrong
        if ((inclusionContainers.size() != 0) && (inclusionContainers.size() < query.queryHashes.size())) inclusionContainers = new HashMap();
        long remaintime = getTargetTime(plasmaSearchProcessing.PROCESS_COLLECTION) - System.currentTimeMillis() + start;
        Map exclusionContainers = ((inclusionContainers == null) || (inclusionContainers.size() == 0) || (remaintime <= 0)) ? new HashMap() : wordIndex.getContainers(
            query.excludeHashes,
            urlselection,
            true,
            true,
            remaintime);
        setYieldTime(plasmaSearchProcessing.PROCESS_COLLECTION);
        setYieldCount(plasmaSearchProcessing.PROCESS_COLLECTION, inclusionContainers.size());
        return new Map[]{inclusionContainers, exclusionContainers};
    }

    /**
     * Join process: joins a search result and returns the joined container
     * (its size is the number of pages after the join). Since this is a
     * conjunction, an empty container is returned if any word is not known.
     * Exclusion containers are subtracted only if time remains in the
     * join-step budget.
     */
    public indexContainer localSearchJoinExclude(
            Collection includeContainers,
            Collection excludeContainers,
            long time, int maxDistance) {
        if (includeContainers == null) return plasmaWordIndex.emptyContainer(null, 0);
        startTimer();
        long start = System.currentTimeMillis();
        indexContainer rcLocal = indexContainer.joinContainers(includeContainers, time, maxDistance);
        long remaining = getTargetTime(plasmaSearchProcessing.PROCESS_JOIN) - System.currentTimeMillis() + start;
        if ((rcLocal != null) && (remaining > 0)) {
            indexContainer.excludeContainers(rcLocal, excludeContainers, remaining);
        }
        if (rcLocal == null) rcLocal = plasmaWordIndex.emptyContainer(null, 0);
        setYieldTime(plasmaSearchProcessing.PROCESS_JOIN);
        setYieldCount(plasmaSearchProcessing.PROCESS_JOIN, rcLocal.size());
        return rcLocal;
    }
}