- fixed bug with new indexAbstract generation

- added partial evaluation of indexAbstracts during remote searches

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2544 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
orbiter 2006-09-11 10:39:25 +00:00
parent fded1f4a5d
commit 82a6054275
9 changed files with 134 additions and 76 deletions

View File

@ -55,6 +55,7 @@ import java.util.Set;
import de.anomic.http.httpHeader;
import de.anomic.index.indexContainer;
import de.anomic.index.indexEntryAttribute;
import de.anomic.index.indexURL;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaSearchEvent;
import de.anomic.plasma.plasmaSearchRankingProfile;
@ -158,8 +159,8 @@ public final class search {
if ((maxcounthash == null) || (urls.length() != 0)) {
prop.put("indexabstract","");
} else {
String indexabstract = "indexabstract." + maxcounthash + "=" + ((indexContainer) containers.get(maxcounthash)).compressedIndex(1000);
yacyCore.log.logFine("DEBUG HASH SEARCH: " + indexabstract);
String indexabstract = "indexabstract." + maxcounthash + "=" + indexURL.compressIndex(((indexContainer) containers.get(maxcounthash)), 1000).toString();
//yacyCore.log.logFine("DEBUG HASH SEARCH: " + indexabstract);
prop.put("indexabstract", indexabstract);
}

View File

@ -32,7 +32,6 @@ import java.util.Iterator;
import java.util.Set;
import de.anomic.kelondro.kelondroOrder;
import de.anomic.server.serverByteBuffer;
public interface indexContainer {
@ -44,7 +43,6 @@ public interface indexContainer {
public void setWordHash(String newWordHash);
public String getWordHash();
public serverByteBuffer compressedIndex(long maxtime);
public void select(Set urlselection);
public void setOrdering(kelondroOrder newOrder, int newColumn);

View File

@ -31,7 +31,6 @@ import java.util.Collection;
import java.util.ConcurrentModificationException;
import java.util.Iterator;
import java.util.Set;
import java.util.Map;
import java.util.TreeMap;
import de.anomic.kelondro.kelondroBase64Order;
@ -39,7 +38,6 @@ import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.kelondro.kelondroOrder;
import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroRowSet;
import de.anomic.server.serverByteBuffer;
public class indexRowSetContainer extends kelondroRowSet implements indexContainer {
@ -67,43 +65,6 @@ public class indexRowSetContainer extends kelondroRowSet implements indexContain
return newContainer;
}
/**
 * Serializes the URL hashes held by this container into the compact text form
 * <code>{key:heads,key:heads,...}</code>.
 *
 * Every entry's URL hash is split at index 6. The tail (index 6 onwards) is the
 * grouping key (the "domain" the references are collected by) and the 6-character
 * head is appended to the string stored for that key. Keys are written in the
 * natural ordering of the TreeMap used for grouping.
 *
 * @param maxtime time budget in milliseconds; a negative value means no limit.
 *                Once the budget is exceeded, collecting and/or writing stops
 *                early, so the result may cover only part of the container.
 * @return the buffer holding the serialized form
 */
public serverByteBuffer compressedIndex(long maxtime) {
    // a negative budget disables the deadline
    final long deadline;
    if (maxtime < 0) {
        deadline = Long.MAX_VALUE;
    } else {
        deadline = System.currentTimeMillis() + maxtime;
    }

    // phase 1: group the hash heads by their hash tail
    final TreeMap headsByTail = new TreeMap();
    synchronized (this) {
        for (Iterator entryIt = entries(); entryIt.hasNext();) {
            final String urlHash = ((indexEntry) entryIt.next()).urlHash();
            final String tail = urlHash.substring(6);
            final String head = urlHash.substring(0, 6);
            final String collected = (String) headsByTail.get(tail);
            headsByTail.put(tail, (collected == null) ? head : collected + head);
            if (System.currentTimeMillis() > deadline) break;
        }
    }

    // phase 2: write the groups as '{tail:heads,tail:heads}'
    final serverByteBuffer out = new serverByteBuffer(this.size() * indexURLEntry.urlEntryRow.width(0) / 2);
    out.append('{');
    for (Iterator groupIt = headsByTail.entrySet().iterator(); groupIt.hasNext();) {
        final Map.Entry group = (Map.Entry) groupIt.next();
        out.append((String) group.getKey());
        out.append(':');
        out.append((String) group.getValue());
        // on timeout the current group is kept and the closing brace is still written
        if (System.currentTimeMillis() > deadline) break;
        if (groupIt.hasNext()) out.append(',');
    }
    out.append('}');
    out.trim();
    return out;
}
public void setWordHash(String newWordHash) {
this.wordHash = newWordHash;
}

View File

@ -33,12 +33,15 @@ import java.net.MalformedURLException;
import java.text.SimpleDateFormat;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroIndex;
import de.anomic.kelondro.kelondroRAMIndex;
import de.anomic.kelondro.kelondroTree;
import de.anomic.kelondro.kelondroRow;
import de.anomic.server.serverByteBuffer;
import de.anomic.server.serverCodings;
import de.anomic.yacy.yacySeedDB;
@ -586,7 +589,7 @@ public class indexURL {
private static String[] testTLDs = new String[] {"com", "net", "org", "uk", "fr", "de", "es", "it"};
public static final URL probablyWordURL(String urlHash, String word) {
if (word == null) return null;
if ((word == null) || (word.length() == 0)) return null;
String pattern = urlHash.substring(6, 11);
for (int i = 0; i < testTLDs.length; i++) {
if (pattern.equals(protocolHostPort("http", "www." + word.toLowerCase() + "." + testTLDs[i], 80)))
@ -635,4 +638,65 @@ public class indexURL {
return hash;
}
/**
 * Serializes the URL hashes of a container into the compact text form
 * <code>{key:heads,key:heads,...}</code>.
 *
 * Every entry's URL hash is split at index 6. The tail (index 6 onwards) is the
 * grouping key (the "domain" the references are collected by) and the 6-character
 * head is appended to the string stored for that key. Keys are written in the
 * natural ordering of the TreeMap used for grouping.
 *
 * @param container the container whose entries are read; it is locked while
 *                  its entries are iterated
 * @param maxtime   time budget in milliseconds; a negative value means no limit.
 *                  Once the budget is exceeded, collecting and/or writing stops
 *                  early, so the result may cover only part of the container.
 * @return the buffer holding the serialized form
 */
public static final serverByteBuffer compressIndex(indexContainer container, long maxtime) {
    // a negative budget disables the deadline
    final long deadline;
    if (maxtime < 0) {
        deadline = Long.MAX_VALUE;
    } else {
        deadline = System.currentTimeMillis() + maxtime;
    }

    // phase 1: group the hash heads by their hash tail
    final TreeMap headsByTail = new TreeMap();
    synchronized (container) {
        for (Iterator entryIt = container.entries(); entryIt.hasNext();) {
            final String urlHash = ((indexEntry) entryIt.next()).urlHash();
            final String tail = urlHash.substring(6);
            final String head = urlHash.substring(0, 6);
            final String collected = (String) headsByTail.get(tail);
            if (collected == null) {
                headsByTail.put(tail, head);
            } else {
                headsByTail.put(tail, collected + head);
            }
            if (System.currentTimeMillis() > deadline) break;
        }
    }

    // phase 2: write the groups as '{tail:heads,tail:heads}'
    final serverByteBuffer out = new serverByteBuffer(container.size() * 6);
    out.append('{');
    final Iterator groupIt = headsByTail.entrySet().iterator();
    while (groupIt.hasNext()) {
        final Map.Entry group = (Map.Entry) groupIt.next();
        out.append((String) group.getKey());
        out.append(':');
        out.append((String) group.getValue());
        // on timeout the current group is kept and the closing brace is still written
        if (System.currentTimeMillis() > deadline) break;
        if (groupIt.hasNext()) out.append(',');
    }
    out.append('}');
    out.trim();
    return out;
}
/**
 * Parses a compressed index of the form <code>{dom:heads,dom:heads,...}</code>
 * (6-character key, ':', then a run of 6-character hash heads per group) and
 * records, for every reconstructed URL hash (head + key), that the given peer
 * references it.
 *
 * Fix over the previous version: the inner loop used to stop as soon as a ','
 * followed the current 6-character head, i.e. before that head had been
 * consumed. The last head of every non-final group was therefore lost and, since
 * the buffer then did not start with ',', parsing ended after the first group.
 * The separator is now looked for at the front of the remaining input, and the
 * buffer length is checked before its first byte is read.
 *
 * @param target   mapping from URL hash to a concatenation of peer hashes; it is
 *                 updated in place
 * @param ci       the compressed index; it is consumed (trimmed) while parsing.
 *                 A null, too short or unbraced buffer is ignored.
 * @param peerhash the peer hash to append for every URL hash found
 */
public static final void decompressIndex(TreeMap target, serverByteBuffer ci, String peerhash) {
    // target is a mapping from url-hashes to a string of peer-hashes
    if ((ci == null) || (ci.length() < 2)) return;
    if ((ci.byteAt(0) != '{') || (ci.byteAt(ci.length() - 1) != '}')) return;

    // strip the surrounding braces
    ci = ci.trim(1, ci.length() - 1);
    String dom, url, peers;
    // a group needs at least key (6) + ':' (1) + one head (6) = 13 bytes
    while ((ci.length() >= 13) && (ci.byteAt(6) == ':')) {
        dom = ci.toString(0, 6);
        ci.trim(7);
        // consume every 6-character head up to the next group separator or the end
        while ((ci.length() >= 6) && (ci.byteAt(0) != ',')) {
            url = ci.toString(0, 6) + dom;
            ci.trim(6);
            peers = (String) target.get(url);
            if (peers == null) {
                target.put(url, peerhash);
            } else {
                target.put(url, peers + peerhash);
            }
        }
        // step over the separator; never read the first byte of an exhausted buffer
        if ((ci.length() > 0) && (ci.byteAt(0) == ',')) ci.trim(1);
    }
}
}

View File

@ -47,6 +47,7 @@ import java.util.Iterator;
import java.util.Map;
import java.util.HashSet;
import java.util.Set;
import java.util.TreeMap;
import de.anomic.kelondro.kelondroException;
import de.anomic.server.logging.serverLog;
@ -67,8 +68,9 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
private plasmaWordIndex wordIndex;
private plasmaCrawlLURL urlStore;
private plasmaSnippetCache snippetCache;
private indexContainer rcGlobal; // cache for results
private int rcGlobalCount;
private indexContainer rcContainers; // cache for results
private int rcContainerCount;
private Map rcAbstracts; // cache for index abstracts
private plasmaSearchTimingProfile profileLocal, profileGlobal;
private boolean postsort;
private yacySearch[] searchThreads;
@ -88,8 +90,9 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
this.ranking = ranking;
this.urlStore = urlStore;
this.snippetCache = snippetCache;
this.rcGlobal = new indexRowSetContainer(null);
this.rcGlobalCount = 0;
this.rcContainers = new indexRowSetContainer(null);
this.rcContainerCount = 0;
this.rcAbstracts = new TreeMap();
this.profileLocal = localTiming;
this.profileGlobal = remoteTiming;
this.postsort = postsort;
@ -130,7 +133,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
log.logFine("STARTING " + fetchpeers + " THREADS TO CATCH EACH " + profileGlobal.getTargetCount(plasmaSearchTimingProfile.PROCESS_POSTSORT) + " URLs WITHIN " + (profileGlobal.duetime() / 1000) + " SECONDS");
long timeout = System.currentTimeMillis() + profileGlobal.duetime();
searchThreads = yacySearch.searchHashes(query.queryHashes, query.prefer, query.urlMask, query.maxDistance, urlStore, rcGlobal, fetchpeers, plasmaSwitchboard.urlBlacklist, snippetCache, profileGlobal, ranking);
searchThreads = yacySearch.searchHashes(query.queryHashes, query.prefer, query.urlMask, query.maxDistance, urlStore, rcContainers, rcAbstracts, fetchpeers, plasmaSwitchboard.urlBlacklist, snippetCache, profileGlobal, ranking);
// meanwhile do a local search
indexContainer rcLocal = localSearchJoin(localSearchContainers(null).values());
@ -145,7 +148,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
// wait a little time ..
try {Thread.sleep(100);} catch (InterruptedException e) {}
}
int globalContributions = rcGlobal.size();
int globalContributions = rcContainers.size();
// finished searching
log.logFine("SEARCH TIME AFTER GLOBAL-TRIGGER TO " + fetchpeers + " PEERS: " + ((System.currentTimeMillis() - start) / 1000) + " seconds");
@ -222,7 +225,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
profileLocal.startTimer();
long pst = System.currentTimeMillis();
searchResult.add(rcLocal, preorderTime);
searchResult.add(rcGlobal, preorderTime);
searchResult.add(rcContainers, preorderTime);
preorderTime = preorderTime - (System.currentTimeMillis() - pst);
if (preorderTime < 0) preorderTime = 200;
plasmaSearchPreOrder preorder = new plasmaSearchPreOrder(query, ranking, searchResult, preorderTime);
@ -352,10 +355,10 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
//log.logFine("FINISHED FLUSH RESULTS PROCESS for query " + query.hashes(","));
}
serverLog.logFine("PLASMA", "FINISHED FLUSHING " + rcGlobalCount + " GLOBAL SEARCH RESULTS FOR SEARCH " + query.queryWords);
serverLog.logFine("PLASMA", "FINISHED FLUSHING " + rcContainerCount + " GLOBAL SEARCH RESULTS FOR SEARCH " + query.queryWords);
// finally delete the temporary index
rcGlobal = null;
rcContainers = null;
flushThreads.remove(this);
}
@ -364,24 +367,24 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
// flush whatever rcContainers holds so far
// this must be called some time after the search results have been computed
int count = 0;
if ((rcGlobal != null) && (rcGlobal.size() > 0)) {
synchronized (rcGlobal) {
if ((rcContainers != null) && (rcContainers.size() > 0)) {
synchronized (rcContainers) {
String wordHash;
Iterator hashi = query.queryHashes.iterator();
boolean dhtCache = false;
while (hashi.hasNext()) {
wordHash = (String) hashi.next();
rcGlobal.setWordHash(wordHash);
rcContainers.setWordHash(wordHash);
dhtCache = dhtCache | wordIndex.busyCacheFlush;
wordIndex.addEntries(rcGlobal, System.currentTimeMillis(), dhtCache);
log.logFine("FLUSHED " + wordHash + ": " + rcGlobal.size() + " url entries to " + ((dhtCache) ? "DHT cache" : "word cache"));
wordIndex.addEntries(rcContainers, System.currentTimeMillis(), dhtCache);
log.logFine("FLUSHED " + wordHash + ": " + rcContainers.size() + " url entries to " + ((dhtCache) ? "DHT cache" : "word cache"));
}
// rcContainers was flushed, empty it
count += rcGlobal.size();
rcGlobal.clear();
count += rcContainers.size();
rcContainers.clear();
}
}
rcGlobalCount += count;
rcContainerCount += count;
}
}

View File

@ -137,6 +137,7 @@ public final class plasmaSearchQuery {
}
public String words(String separator) {
if (queryWords == null) return "";
StringBuffer result = new StringBuffer(8 * queryWords.size());
Iterator i = queryWords.iterator();
if (i.hasNext()) result.append((String) i.next());

View File

@ -274,16 +274,15 @@ public final class serverByteBuffer extends OutputStream {
return tmp;
}
/*
private serverByteBuffer trim(int start) {
/**
 * Drops the first <code>start</code> bytes from this buffer by advancing the
 * offset and shrinking the length; no bytes are copied.
 *
 * @param start number of leading bytes to drop; must be within 0..length
 * @return this buffer
 * @throws IndexOutOfBoundsException if start is negative or greater than length
 */
public serverByteBuffer trim(int start) {
    // a negative start would move the offset backwards and enlarge the length,
    // which must not be possible now that this method is public
    if (start < 0) throw new IndexOutOfBoundsException("trim: start < 0");
    if (start > length) throw new IndexOutOfBoundsException("trim: start > length");
    offset = offset + start;
    length = length - start;
    return this;
}
*/
private serverByteBuffer trim(int start, int end) {
public serverByteBuffer trim(int start, int end) {
// the end value is outside (+1) of the wanted target array
if (start > length) throw new IndexOutOfBoundsException("trim: start > length");
if (end > length) throw new IndexOutOfBoundsException("trim: end > length");
@ -347,6 +346,10 @@ public final class serverByteBuffer extends OutputStream {
return new String(buffer, offset, length);
}
/**
 * Returns the bytes of this buffer from position <code>left</code> (inclusive)
 * to <code>rightbound</code> (exclusive) as a String. Positions are relative to
 * the buffer's current offset, and the bytes are decoded with the platform
 * default charset.
 *
 * @param left       first position to include
 * @param rightbound position just behind the last byte to include
 * @return the decoded slice
 */
public String toString(int left, int rightbound) {
    final int from = offset + left;
    final int count = rightbound - left;
    return new String(buffer, from, count);
}
public Properties propParser() {
// extract a=b or a="b" - relations from the buffer
int pos = offset;

View File

@ -48,6 +48,8 @@ import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.TreeMap;
import java.util.Map;
import java.util.Iterator;
import de.anomic.htmlFilter.htmlFilterContentScraper;
@ -56,6 +58,7 @@ import de.anomic.index.indexContainer;
import de.anomic.index.indexEntry;
import de.anomic.index.indexEntryAttribute;
import de.anomic.index.indexRowSetContainer;
import de.anomic.index.indexURL;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.net.URL;
@ -65,6 +68,7 @@ import de.anomic.plasma.plasmaSearchTimingProfile;
import de.anomic.plasma.plasmaSnippetCache;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.urlPattern.plasmaURLPattern;
import de.anomic.server.serverByteBuffer;
import de.anomic.server.serverCodings;
import de.anomic.server.serverCore;
import de.anomic.server.serverObjects;
@ -370,6 +374,7 @@ public final class yacyClient {
yacySeed targetPeer,
plasmaCrawlLURL urlManager,
indexContainer containerCache,
Map abstractCache,
plasmaURLPattern blacklist,
plasmaSnippetCache snippets,
plasmaSearchTimingProfile timingProfile,
@ -524,9 +529,25 @@ public final class yacyClient {
}
}
// finally insert the containers to the index
// insert the containers to the index
for (int m = 0; m < words; m++) { containerCache.add(container[m], -1); }
// read index abstract
Iterator i = result.entrySet().iterator();
Map.Entry entry;
TreeMap singleAbstract;
String wordhash;
while (i.hasNext()) {
entry = (Map.Entry) i.next();
if (((String) entry.getKey()).startsWith("indexabstract.")) {
wordhash = ((String) entry.getKey()).substring(14);
singleAbstract = (TreeMap) abstractCache.get(wordhash);
if (singleAbstract == null) singleAbstract = new TreeMap();
indexURL.decompressIndex(singleAbstract, new serverByteBuffer(((String) entry.getValue()).getBytes()), targetPeer.hash);
abstractCache.put(wordhash, singleAbstract);
}
}
// generate statistics
long searchtime;
try {

View File

@ -46,6 +46,7 @@ package de.anomic.yacy;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import de.anomic.index.indexContainer;
@ -63,6 +64,7 @@ public class yacySearch extends Thread {
final private boolean global;
final private plasmaCrawlLURL urlManager;
final private indexContainer containerCache;
final private Map abstractCache;
final private plasmaURLPattern blacklist;
final private plasmaSnippetCache snippetCache;
final private yacySeed targetPeer;
@ -72,8 +74,10 @@ public class yacySearch extends Thread {
final private plasmaSearchRankingProfile rankingProfile;
final private String prefer, filter;
public yacySearch(Set wordhashes, String prefer, String filter, int maxDistance, boolean global, yacySeed targetPeer,
plasmaCrawlLURL urlManager, indexContainer containerCache, plasmaURLPattern blacklist, plasmaSnippetCache snippetCache,
public yacySearch(Set wordhashes, String prefer, String filter, int maxDistance,
boolean global, yacySeed targetPeer, plasmaCrawlLURL urlManager,
indexContainer containerCache, Map abstractCache,
plasmaURLPattern blacklist, plasmaSnippetCache snippetCache,
plasmaSearchTimingProfile timingProfile, plasmaSearchRankingProfile rankingProfile) {
super("yacySearch_" + targetPeer.getName());
this.wordhashes = wordhashes;
@ -82,6 +86,7 @@ public class yacySearch extends Thread {
this.global = global;
this.urlManager = urlManager;
this.containerCache = containerCache;
this.abstractCache = abstractCache;
this.blacklist = blacklist;
this.snippetCache = snippetCache;
this.targetPeer = targetPeer;
@ -92,7 +97,7 @@ public class yacySearch extends Thread {
}
public void run() {
this.links = yacyClient.search(set2string(wordhashes), prefer, filter, maxDistance, global, targetPeer, urlManager, containerCache, blacklist, snippetCache, timingProfile, rankingProfile);
this.links = yacyClient.search(set2string(wordhashes), prefer, filter, maxDistance, global, targetPeer, urlManager, containerCache, abstractCache, blacklist, snippetCache, timingProfile, rankingProfile);
if (links != 0) {
//yacyCore.log.logInfo("REMOTE SEARCH - remote peer " + targetPeer.hash + ":" + targetPeer.getName() + " contributed " + links + " links for word hash " + wordhashes);
yacyCore.seedDB.mySeed.incRI(links);
@ -181,7 +186,8 @@ public class yacySearch extends Thread {
return result;
}
public static yacySearch[] searchHashes(Set wordhashes, String prefer, String filter, int maxDist, plasmaCrawlLURL urlManager, indexContainer containerCache,
public static yacySearch[] searchHashes(Set wordhashes, String prefer, String filter, int maxDist, plasmaCrawlLURL urlManager,
indexContainer containerCache, Map abstractCache,
int targets, plasmaURLPattern blacklist, plasmaSnippetCache snippetCache,
plasmaSearchTimingProfile timingProfile, plasmaSearchRankingProfile rankingProfile) {
// check own peer status
@ -196,7 +202,7 @@ public class yacySearch extends Thread {
yacySearch[] searchThreads = new yacySearch[targets];
for (int i = 0; i < targets; i++) {
searchThreads[i]= new yacySearch(wordhashes, prefer, filter, maxDist, true, targetPeers[i],
urlManager, containerCache, blacklist, snippetCache, timingProfile, rankingProfile);
urlManager, containerCache, abstractCache, blacklist, snippetCache, timingProfile, rankingProfile);
searchThreads[i].start();
//try {Thread.sleep(20);} catch (InterruptedException e) {}
}