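- better data structures in secondary search: url-hash and peer-hash lists are now held as Set<String> instead of StringBuilder strings of concatenated 12-character hashes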

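- fixed a big memory leak in secondary search: the SecondarySearchSuperviser thread now clears its abstracts cache and checked-peers set when it terminates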
This commit is contained in:
Michael Peter Christen 2012-07-03 07:12:20 +02:00
parent de903a53a0
commit 613b45f604
3 changed files with 68 additions and 79 deletions

View File

@ -28,8 +28,10 @@ package net.yacy.kelondro.data.word;
import java.io.Serializable; import java.io.Serializable;
import java.util.Collections; import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator; import java.util.Iterator;
import java.util.Map; import java.util.Map;
import java.util.Set;
import java.util.SortedMap; import java.util.SortedMap;
import java.util.TreeMap; import java.util.TreeMap;
@ -116,16 +118,16 @@ public class WordReferenceFactory implements ReferenceFactory<WordReference>, Se
* decompress an index abstract that was generated from a word index and transmitted over a network connection * decompress an index abstract that was generated from a word index and transmitted over a network connection
* @param ci * @param ci
* @param peerhash * @param peerhash
* @return * @return a urlhash -> peerlist map: this shows in which peers an url is stored
*/ */
public static final SortedMap<String, StringBuilder> decompressIndex(ByteBuffer ci, final String peerhash) { public static final SortedMap<String, Set<String>> decompressIndex(ByteBuffer ci, final String peerhash) {
SortedMap<String, StringBuilder> target = Collections.synchronizedSortedMap(new TreeMap<String, StringBuilder>()); SortedMap<String, Set<String>> target = Collections.synchronizedSortedMap(new TreeMap<String, Set<String>>());
// target is a mapping from url-hashes to a string of peer-hashes // target is a mapping from url-hashes to a string of peer-hashes
if (ci.byteAt(0) != '{' || ci.byteAt(ci.length() - 1) != '}') return target; if (ci.byteAt(0) != '{' || ci.byteAt(ci.length() - 1) != '}') return target;
//System.out.println("DEBUG-DECOMPRESS: input is " + ci.toString()); //System.out.println("DEBUG-DECOMPRESS: input is " + ci.toString());
ci = ci.trim(1, ci.length() - 2); ci = ci.trim(1, ci.length() - 2);
String dom, url; String dom, url;
StringBuilder peers; Set<String> peers;
StringBuilder urlsb; StringBuilder urlsb;
while ((ci.length() >= 13) && (ci.byteAt(6) == ':')) { while ((ci.length() >= 13) && (ci.byteAt(6) == ':')) {
assert ci.length() >= 6 : "ci.length() = " + ci.length(); assert ci.length() >= 6 : "ci.length() = " + ci.length();
@ -140,16 +142,15 @@ public class WordReferenceFactory implements ReferenceFactory<WordReference>, Se
peers = target.get(url); peers = target.get(url);
if (peers == null) { if (peers == null) {
peers = new StringBuilder(24); peers = new HashSet<String>();
peers.append(peerhash);
target.put(url, peers); target.put(url, peers);
} else {
peers.append(peerhash);
} }
peers.add(peerhash);
//System.out.println("DEBUG-DECOMPRESS: " + url + ":" + target.get(url)); //System.out.println("DEBUG-DECOMPRESS: " + url + ":" + target.get(url));
} }
if (ci.byteAt(0) == ',') ci.trim(1); if (ci.byteAt(0) == ',') ci.trim(1);
} }
//System.out.println("DEBUG-DECOMPRESS: " + target);
return target; return target;
} }
} }

View File

@ -26,6 +26,7 @@ package net.yacy.peers;
import java.util.Iterator; import java.util.Iterator;
import java.util.List; import java.util.List;
import java.util.Set;
import java.util.SortedMap; import java.util.SortedMap;
import java.util.regex.Pattern; import java.util.regex.Pattern;
@ -215,7 +216,7 @@ public class RemoteSearch extends Thread {
} }
public static RemoteSearch secondaryRemoteSearch( public static RemoteSearch secondaryRemoteSearch(
final String wordhashes, final String urlhashes, final Set<String> wordhashes, final String urlhashes,
final long time, final long time,
final Segment indexSegment, final Segment indexSegment,
final SeedDB peers, final SeedDB peers,
@ -223,7 +224,6 @@ public class RemoteSearch extends Thread {
final String targethash, final Blacklist blacklist, final String targethash, final Blacklist blacklist,
final RankingProfile rankingProfile, final RankingProfile rankingProfile,
final Bitfield constraint, final SortedMap<byte[], String> clusterselection) { final Bitfield constraint, final SortedMap<byte[], String> clusterselection) {
assert wordhashes.length() >= 12 : "wordhashes = " + wordhashes;
// check own peer status // check own peer status
if (peers.mySeed() == null || peers.mySeed().getPublicAddress() == null) { return null; } if (peers.mySeed() == null || peers.mySeed().getPublicAddress() == null) { return null; }
@ -234,8 +234,10 @@ public class RemoteSearch extends Thread {
final Seed targetPeer = peers.getConnected(targethash); final Seed targetPeer = peers.getConnected(targethash);
if (targetPeer == null || targetPeer.hash == null) return null; if (targetPeer == null || targetPeer.hash == null) return null;
if (clusterselection != null) targetPeer.setAlternativeAddress(clusterselection.get(ASCII.getBytes(targetPeer.hash))); if (clusterselection != null) targetPeer.setAlternativeAddress(clusterselection.get(ASCII.getBytes(targetPeer.hash)));
StringBuilder whs = new StringBuilder(24);
for (String s: wordhashes) whs.append(s);
final RemoteSearch searchThread = new RemoteSearch( final RemoteSearch searchThread = new RemoteSearch(
wordhashes, "", urlhashes, QueryParams.matchnothing_pattern, QueryParams.catchall_pattern, QueryParams.catchall_pattern, new QueryParams.Modifier(""), "", "", "", "all", 20, time, 9999, true, 0, targetPeer, whs.toString(), "", urlhashes, QueryParams.matchnothing_pattern, QueryParams.catchall_pattern, QueryParams.catchall_pattern, new QueryParams.Modifier(""), "", "", "", "all", 20, time, 9999, true, 0, targetPeer,
indexSegment, peers, containerCache, null, blacklist, rankingProfile, constraint); indexSegment, peers, containerCache, null, blacklist, rankingProfile, constraint);
searchThread.start(); searchThread.start();
return searchThread; return searchThread;

View File

@ -28,9 +28,11 @@ package net.yacy.search.query;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections; import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator; import java.util.Iterator;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Set;
import java.util.SortedMap; import java.util.SortedMap;
import java.util.SortedSet; import java.util.SortedSet;
import java.util.TreeMap; import java.util.TreeMap;
@ -521,17 +523,16 @@ public final class SearchEvent
} }
} }
public class SecondarySearchSuperviser extends Thread public class SecondarySearchSuperviser extends Thread {
{
// cache for index abstracts; word:TreeMap mapping where the embedded TreeMap is a urlhash:peerlist relation // cache for index abstracts; word:TreeMap mapping where the embedded TreeMap is a urlhash:peerlist relation
// this relation contains the information where specific urls can be found in specific peers // this relation contains the information where specific urls can be found in specific peers
private final SortedMap<String, SortedMap<String, StringBuilder>> abstractsCache; private final SortedMap<String, SortedMap<String, Set<String>>> abstractsCache;
private final SortedSet<String> checkedPeers; private final SortedSet<String> checkedPeers;
private final Semaphore trigger; private final Semaphore trigger;
public SecondarySearchSuperviser() { public SecondarySearchSuperviser() {
this.abstractsCache = Collections.synchronizedSortedMap(new TreeMap<String, SortedMap<String, StringBuilder>>()); this.abstractsCache = Collections.synchronizedSortedMap(new TreeMap<String, SortedMap<String, Set<String>>>());
this.checkedPeers = Collections.synchronizedSortedSet(new TreeSet<String>()); this.checkedPeers = Collections.synchronizedSortedSet(new TreeSet<String>());
this.trigger = new Semaphore(0); this.trigger = new Semaphore(0);
} }
@ -542,25 +543,24 @@ public final class SearchEvent
* @param wordhash * @param wordhash
* @param singleAbstract // a mapping from url-hashes to a string of peer-hashes * @param singleAbstract // a mapping from url-hashes to a string of peer-hashes
*/ */
public void addAbstract(final String wordhash, final SortedMap<String, StringBuilder> singleAbstract) { public void addAbstract(final String wordhash, final SortedMap<String, Set<String>> singleAbstract) {
final SortedMap<String, StringBuilder> oldAbstract; final SortedMap<String, Set<String>> oldAbstract = this.abstractsCache.get(wordhash);
oldAbstract = this.abstractsCache.get(wordhash); if ( oldAbstract == null ) {
if ( oldAbstract == null ) { // new abstracts in the cache
// new abstracts in the cache this.abstractsCache.put(wordhash, singleAbstract);
this.abstractsCache.put(wordhash, singleAbstract); return;
return; }
}
// extend the abstracts in the cache: join the single abstracts // extend the abstracts in the cache: join the single abstracts
new Thread() { new Thread() {
@Override @Override
public void run() { public void run() {
Thread.currentThread().setName("SearchEvent.paddAbstract:" + wordhash); Thread.currentThread().setName("SearchEvent.addAbstract:" + wordhash);
for ( final Map.Entry<String, StringBuilder> oneref : singleAbstract.entrySet() ) { for ( final Map.Entry<String, Set<String>> oneref : singleAbstract.entrySet() ) {
final String urlhash = oneref.getKey(); final String urlhash = oneref.getKey();
final StringBuilder peerlistNew = oneref.getValue(); final Set<String> peerlistNew = oneref.getValue();
final StringBuilder peerlistOld = oldAbstract.put(urlhash, peerlistNew); final Set<String> peerlistOld = oldAbstract.put(urlhash, peerlistNew);
if ( peerlistOld != null ) { if ( peerlistOld != null ) {
peerlistOld.append(peerlistNew); peerlistOld.addAll(peerlistNew);
} }
} }
} }
@ -572,32 +572,21 @@ public final class SearchEvent
this.trigger.release(); this.trigger.release();
} }
private String wordsFromPeer(final String peerhash, final StringBuilder urls) { private Set<String> wordsFromPeer(final String peerhash, final Set<String> urls) {
Map.Entry<String, SortedMap<String, StringBuilder>> entry; Set<String> wordlist = new HashSet<String>();
String word, url, wordlist = ""; String word;
StringBuilder peerlist; Set<String> peerlist;
SortedMap<String, StringBuilder> urlPeerlist; SortedMap<String, Set<String>> urlPeerlist; // urlhash:peerlist
int p; for ( Map.Entry<String, SortedMap<String, Set<String>>> entry: this.abstractsCache.entrySet()) {
boolean hasURL;
final Iterator<Map.Entry<String, SortedMap<String, StringBuilder>>> i =
this.abstractsCache.entrySet().iterator();
while ( i.hasNext() ) {
entry = i.next();
word = entry.getKey(); word = entry.getKey();
urlPeerlist = entry.getValue(); urlPeerlist = entry.getValue();
hasURL = true; for (String url: urls) {
for ( int j = 0; j < urls.length(); j = j + 12 ) {
url = urls.substring(j, j + 12);
peerlist = urlPeerlist.get(url); peerlist = urlPeerlist.get(url);
p = (peerlist == null) ? -1 : peerlist.indexOf(peerhash); if (peerlist != null && peerlist.contains(peerhash)) {
if ( (p < 0) || (p % 12 != 0) ) { wordlist.add(word);
hasURL = false;
break; break;
} }
} }
if ( hasURL ) {
wordlist += word;
}
} }
return wordlist; return wordlist;
} }
@ -605,36 +594,36 @@ public final class SearchEvent
@Override @Override
public void run() { public void run() {
try { try {
int t = 0; boolean aquired;
while ( this.trigger.tryAcquire(10000, TimeUnit.MILLISECONDS) ) { while ( aquired = this.trigger.tryAcquire(3000, TimeUnit.MILLISECONDS) ) {
// a trigger was released if ( !aquired || MemoryControl.shortStatus()) {
prepareSecondarySearch();
t++;
if ( t > 10 ) {
break; break;
} }
// a trigger was released
prepareSecondarySearch();
} }
} catch ( final InterruptedException e ) { } catch ( final InterruptedException e ) {
// the thread was interrupted // the thread was interrupted
// do nothing // do nothing
} }
// the time-out was reached // the time-out was reached:
// as we will never again prepare another secondary search, we can flush all cached data
this.abstractsCache.clear();
this.checkedPeers.clear();
} }
private void prepareSecondarySearch() { private void prepareSecondarySearch() {
if ( this.abstractsCache == null if ( this.abstractsCache == null || this.abstractsCache.size() != SearchEvent.this.query.queryHashes.size() ) {
|| this.abstractsCache.size() != SearchEvent.this.query.queryHashes.size() ) {
return; // secondary search not possible (yet) return; // secondary search not possible (yet)
} }
// catch up index abstracts and join them; then call peers again to submit their urls // catch up index abstracts and join them; then call peers again to submit their urls
/* /*
System.out.println("DEBUG-INDEXABSTRACT: " + this.abstractsCache.size() + " word references caught, " + SearchEvent.this.query.queryHashes.size() + " needed"); System.out.println("DEBUG-INDEXABSTRACT: " + this.abstractsCache.size() + " word references caught, " + SearchEvent.this.query.queryHashes.size() + " needed");
for (final Map.Entry<String, SortedMap<String, StringBuilder>> entry: this.abstractsCache.entrySet()) { for (final Map.Entry<String, SortedMap<String, Set<String>>> entry: this.abstractsCache.entrySet()) {
System.out.println("DEBUG-INDEXABSTRACT: hash " + entry.getKey() + ": " + ((SearchEvent.this.query.queryHashes.has(entry.getKey().getBytes()) ? "NEEDED" : "NOT NEEDED") + "; " + entry.getValue().size() + " entries")); System.out.println("DEBUG-INDEXABSTRACT: hash " + entry.getKey() + ": " + ((SearchEvent.this.query.queryHashes.has(entry.getKey().getBytes()) ? "NEEDED" : "NOT NEEDED") + "; " + entry.getValue().size() + " entries"));
} }
*/ */
// find out if there are enough references for all words that are searched // find out if there are enough references for all words that are searched
if ( this.abstractsCache.size() != SearchEvent.this.query.queryHashes.size() ) { if ( this.abstractsCache.size() != SearchEvent.this.query.queryHashes.size() ) {
@ -642,38 +631,37 @@ public final class SearchEvent
} }
// join all the urlhash:peerlist relations: the resulting map has values with a combined peer-list list // join all the urlhash:peerlist relations: the resulting map has values with a combined peer-list list
final SortedMap<String, StringBuilder> abstractJoin = final SortedMap<String, Set<String>> abstractJoin = SetTools.joinConstructive(this.abstractsCache.values(), true);
SetTools.joinConstructive(this.abstractsCache.values(), true);
if ( abstractJoin.isEmpty() ) { if ( abstractJoin.isEmpty() ) {
return; return;
// the join result is now a urlhash: peer-list relation // the join result is now a urlhash: peer-list relation
} }
// generate a list of peers that have the urls for the joined search result // generate a list of peers that have the urls for the joined search result
final SortedMap<String, StringBuilder> secondarySearchURLs = new TreeMap<String, StringBuilder>(); // a (peerhash:urlhash-liststring) mapping final SortedMap<String, Set<String>> secondarySearchURLs = new TreeMap<String, Set<String>>(); // a (peerhash:urlhash-liststring) mapping
String url, peer; String url;
StringBuilder urls, peerlist; Set<String> urls;
Set<String> peerlist;
final String mypeerhash = SearchEvent.this.peers.mySeed().hash; final String mypeerhash = SearchEvent.this.peers.mySeed().hash;
boolean mypeerinvolved = false; boolean mypeerinvolved = false;
int mypeercount; int mypeercount;
for ( final Map.Entry<String, StringBuilder> entry : abstractJoin.entrySet() ) { for ( final Map.Entry<String, Set<String>> entry : abstractJoin.entrySet() ) {
url = entry.getKey(); url = entry.getKey();
peerlist = entry.getValue(); peerlist = entry.getValue();
//System.out.println("DEBUG-INDEXABSTRACT: url " + url + ": from peers " + peerlist); //System.out.println("DEBUG-INDEXABSTRACT: url " + url + ": from peers " + peerlist);
mypeercount = 0; mypeercount = 0;
for ( int j = 0; j < peerlist.length(); j += 12 ) { for (String peer: peerlist) {
peer = peerlist.substring(j, j + 12);
if ( (peer.equals(mypeerhash)) && (mypeercount++ > 1) ) { if ( (peer.equals(mypeerhash)) && (mypeercount++ > 1) ) {
continue; continue;
} }
//if (peers.indexOf(peer) < j) continue; // avoid doubles that may appear in the abstractJoin //if (peers.indexOf(peer) < j) continue; // avoid doubles that may appear in the abstractJoin
urls = secondarySearchURLs.get(peer); urls = secondarySearchURLs.get(peer);
if ( urls == null ) { if ( urls == null ) {
urls = new StringBuilder(24); urls = new HashSet<String>();
urls.append(url); urls.add(url);
secondarySearchURLs.put(peer, urls); secondarySearchURLs.put(peer, urls);
} else { } else {
urls.append(url); urls.add(url);
} }
secondarySearchURLs.put(peer, urls); secondarySearchURLs.put(peer, urls);
} }
@ -683,13 +671,12 @@ public final class SearchEvent
} }
// compute words for secondary search and start the secondary searches // compute words for secondary search and start the secondary searches
String words; Set<String> words;
SearchEvent.this.secondarySearchThreads = SearchEvent.this.secondarySearchThreads =
new RemoteSearch[(mypeerinvolved) ? secondarySearchURLs.size() - 1 : secondarySearchURLs new RemoteSearch[(mypeerinvolved) ? secondarySearchURLs.size() - 1 : secondarySearchURLs.size()];
.size()];
int c = 0; int c = 0;
for ( final Map.Entry<String, StringBuilder> entry : secondarySearchURLs.entrySet() ) { for ( final Map.Entry<String, Set<String>> entry : secondarySearchURLs.entrySet() ) {
peer = entry.getKey(); String peer = entry.getKey();
if ( peer.equals(mypeerhash) ) { if ( peer.equals(mypeerhash) ) {
continue; // we don't need to ask ourself continue; // we don't need to ask ourself
} }
@ -698,11 +685,10 @@ public final class SearchEvent
} }
urls = entry.getValue(); urls = entry.getValue();
words = wordsFromPeer(peer, urls); words = wordsFromPeer(peer, urls);
if ( words.length() == 0 ) { if ( words.size() == 0 ) {
continue; // ??? continue; // ???
} }
assert words.length() >= 12 : "words = " + words; Log.logInfo("SearchEvent.SecondarySearchSuperviser", "asking peer " + peer + " for urls: " + urls + " from words: " + words);
//System.out.println("DEBUG-INDEXABSTRACT ***: peer " + peer + " has urls: " + urls + " from words: " + words);
this.checkedPeers.add(peer); this.checkedPeers.add(peer);
SearchEvent.this.secondarySearchThreads[c++] = SearchEvent.this.secondarySearchThreads[c++] =
RemoteSearch.secondaryRemoteSearch( RemoteSearch.secondaryRemoteSearch(