changes towards better join-search

- added generation of a compressed index within remote peers during global search
- added selection of specific urls within remote peers during secondary global search


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2539 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
orbiter 2006-09-10 22:36:47 +00:00
parent 4a494464af
commit 74d1dea30b
14 changed files with 156 additions and 53 deletions

View File

@ -149,7 +149,7 @@ public class IndexControl_p {
if (delurl || delurlref) {
// generate an urlx array
indexContainer index = null;
index = switchboard.wordIndex.getContainer(keyhash, true, -1);
index = switchboard.wordIndex.getContainer(keyhash, null, true, -1);
Iterator en = index.entries();
int i = 0;
urlx = new String[index.size()];
@ -252,7 +252,7 @@ public class IndexControl_p {
indexContainer index;
String result;
long starttime = System.currentTimeMillis();
index = switchboard.wordIndex.getContainer(keyhash, true, -1);
index = switchboard.wordIndex.getContainer(keyhash, null, true, -1);
// built urlCache
Iterator urlIter = index.entries();
HashMap knownURLs = new HashMap();
@ -424,7 +424,7 @@ public class IndexControl_p {
// search for a word hash and generate a list of url links
indexContainer index = null;
try {
index = switchboard.wordIndex.getContainer(keyhash, true, -1);
index = switchboard.wordIndex.getContainer(keyhash, null, true, -1);
final StringBuffer result = new StringBuffer(1024);
if (index.size() == 0) {

View File

@ -8,4 +8,5 @@ references=#[references]#
joincount=#[joincount]#
count=#[linkcount]#
#[links]#
#[indexcount]#
#[indexcount]#
#[indexabstract]#

View File

@ -49,6 +49,7 @@
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import de.anomic.http.httpHeader;
@ -81,7 +82,8 @@ public final class search {
final String oseed = post.get("myseed", ""); // complete seed of the requesting peer
// final String youare = post.get("youare", ""); // seed hash of the target peer, used for testing network stability
final String key = post.get("key", ""); // transmission key for response
final String query = post.get("query", ""); // a string of word hashes
final String query = post.get("query", ""); // a string of word hashes that shall be searched and combined
final String urls = post.get("urls", ""); // a string of url hashes that are preselected for the search: no other may be returned
// final String fwdep = post.get("fwdep", ""); // forward depth. if "0" then peer may NOT ask another peer for more results
// final String fwden = post.get("fwden", ""); // forward deny, a list of seed hashes. They may NOT be target of forward hopping
final long duetime= post.getLong("duetime", 3000);
@ -117,34 +119,64 @@ public final class search {
yacyCore.log.logInfo("INIT HASH SEARCH: " + squery.queryHashes + " - " + squery.wantedResults + " links");
long timestamp1 = System.currentTimeMillis();
// prepare a search profile
plasmaSearchRankingProfile rankingProfile = new plasmaSearchRankingProfile(new String[]{plasmaSearchRankingProfile.ORDER_YBR, plasmaSearchRankingProfile.ORDER_DATE, plasmaSearchRankingProfile.ORDER_QUALITY});
plasmaSearchTimingProfile localTiming = new plasmaSearchTimingProfile(squery.maximumTime, squery.wantedResults);
plasmaSearchTimingProfile remoteTiming = null;
plasmaSearchEvent theSearch = new plasmaSearchEvent(squery, rankingProfile, localTiming, remoteTiming, true, yacyCore.log, sb.wordIndex, sb.urlPool.loadedURL, sb.snippetCache);
Set containers = theSearch.localSearchContainers();
indexContainer localResults = theSearch.localSearchJoin(containers);
int joincount = localResults.size();
plasmaSearchResult acc = theSearch.order(localResults);
// set statistic details of search result
prop.put("joincount", Integer.toString(joincount));
// retrieve index containers from search request
plasmaSearchEvent theSearch = new plasmaSearchEvent(squery, rankingProfile, localTiming, remoteTiming, true, yacyCore.log, sb.wordIndex, sb.urlPool.loadedURL, sb.snippetCache);
// split the "urls" parameter into url hashes of 12 characters each;
// urlselection stays null (meaning: no restriction) if the parameter is empty or malformed
Set urlselection = null;
if ((urls.length() > 0) && (urls.length() % 12 == 0)) {
    // the set must exist before hashes can be added to it
    urlselection = new HashSet();
    // the i-th hash occupies the characters [i * 12, (i + 1) * 12)
    for (int i = 0; i < (urls.length() / 12); i++) urlselection.add(urls.substring(i * 12, (i + 1) * 12));
}
Map containers = theSearch.localSearchContainers(urlselection);
// set statistic details of search result and find best result index set
String maxcounthash = null;
if (containers == null) {
prop.put("indexcount", "");
} else {
Iterator ci = containers.iterator();
Iterator ci = containers.entrySet().iterator();
StringBuffer indexcount = new StringBuffer();
Map.Entry entry;
String wordhash;
int maxcount = -1;
while (ci.hasNext()) {
indexContainer container = (indexContainer) ci.next();
entry = (Map.Entry) ci.next();
wordhash = (String) entry.getKey();
indexContainer container = (indexContainer) entry.getValue();
// remember the word hash of the largest container seen so far;
// maxcount must be updated together with maxcounthash, otherwise every
// container beats the initial -1 and simply the last one iterated is chosen
if (container.size() > maxcount) {
    maxcount = container.size();
    maxcounthash = wordhash;
}
indexcount.append("indexcount.").append(container.getWordHash()).append('=').append(Integer.toString(container.size())).append(serverCore.crlfString);
}
prop.put("indexcount", new String(indexcount));
}
// generate compressed index for maxcounthash
// this is not needed if the search is restricted to specific urls, because it is a re-search
if ((maxcounthash == null) || (urls.length() != 0)) {
prop.put("indexabstract","");
} else {
String indexabstract = "indexabstract." + maxcounthash + "=" + ((indexContainer) containers.get(maxcounthash)).compressedIndex(1000);
yacyCore.log.logFine("DEBUG HASH SEARCH: " + indexabstract);
prop.put("indexabstract", indexabstract);
}
// join and order the result
indexContainer localResults = theSearch.localSearchJoin(containers.values());
int joincount = localResults.size();
prop.put("joincount", Integer.toString(joincount));
plasmaSearchResult acc = theSearch.order(localResults);
// prepare result
if ((joincount == 0) || (acc == null)) {
// no results
prop.put("links", "");
prop.put("linkcount", "0");
prop.put("references", "");
} else {
// result is a List of urlEntry elements

View File

@ -36,7 +36,7 @@ public abstract class indexAbstractRI implements indexRI {
}
public long getUpdateTime(String wordHash) {
indexContainer entries = getContainer(wordHash, false, -1);
indexContainer entries = getContainer(wordHash, null, false, -1);
if (entries == null) return 0;
return entries.updated();
}

View File

@ -108,10 +108,11 @@ public class indexCollectionRI extends indexAbstractRI implements indexRI {
}
public indexContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxtime) {
public indexContainer getContainer(String wordHash, Set urlselection, boolean deleteIfEmpty, long maxtime) {
try {
kelondroRowSet collection = collectionIndex.get(wordHash.getBytes(), deleteIfEmpty);
if (collection == null) return null;
// a missing collection must be detected before select() is invoked on it
if (collection == null) return null;
// restrict the collection to the preselected urls
collection.select(urlselection);
// nothing left after the restriction: report the word as not found
if (collection.size() == 0) return null;
return new indexRowSetContainer(wordHash, collection);
} catch (IOException e) {
return null;

View File

@ -32,6 +32,7 @@ import java.util.Iterator;
import java.util.Set;
import de.anomic.kelondro.kelondroOrder;
import de.anomic.server.serverByteBuffer;
public interface indexContainer {
@ -43,7 +44,9 @@ public interface indexContainer {
public void setWordHash(String newWordHash);
public String getWordHash();
public serverByteBuffer compressedIndex(long maxtime);
public void select(Set urlselection);
public void setOrdering(kelondroOrder newOrder, int newColumn);
public kelondroOrder order();
public int orderColumn();

View File

@ -386,8 +386,14 @@ public final class indexRAMCacheRI extends indexAbstractRI implements indexRI {
return (((long) intTime) * (long) 1000) + initTime;
}
public indexContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxtime_dummy) {
return (indexContainer) wCache.get(wordHash);
/**
 * Looks up the container for a word hash in the RAM cache.
 *
 * @param wordHash      the word hash to look up in wCache
 * @param urlselection  if not null, the result is restricted to this selection via select()
 * @param deleteIfEmpty not used by this implementation
 * @param maxtime_dummy not used by this implementation
 * @return the cached container (unrestricted case), a restricted clone of it,
 *         or null if the cache holds no container for the word hash
 */
public indexContainer getContainer(String wordHash, Set urlselection, boolean deleteIfEmpty, long maxtime_dummy) {
    indexContainer cached = (indexContainer) wCache.get(wordHash);
    // unknown word or no url restriction: hand back the cache lookup result as it is
    // (calling topLevelClone() on a missing container would throw a NullPointerException)
    if ((cached == null) || (urlselection == null)) return cached;
    // select() is applied to a clone, presumably so that the cached container keeps all its entries
    indexContainer ic = cached.topLevelClone();
    ic.select(urlselection);
    return ic;
}
public indexContainer deleteContainer(String wordHash) {

View File

@ -53,7 +53,7 @@ public interface indexRI {
public long getUpdateTime(String wordHash);
public indexContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxtime);
public indexContainer getContainer(String wordHash, Set urlselection, boolean deleteIfEmpty, long maxtime);
public indexContainer deleteContainer(String wordHash);
public boolean removeEntry(String wordHash, String urlHash, boolean deleteComplete);

View File

@ -27,9 +27,11 @@
package de.anomic.index;
import java.lang.reflect.Method;
import java.util.Collection;
import java.util.ConcurrentModificationException;
import java.util.Iterator;
import java.util.Set;
import java.util.Map;
import java.util.TreeMap;
import de.anomic.kelondro.kelondroBase64Order;
@ -37,6 +39,7 @@ import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.kelondro.kelondroOrder;
import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroRowSet;
import de.anomic.server.serverByteBuffer;
public class indexRowSetContainer extends kelondroRowSet implements indexContainer {
@ -64,6 +67,43 @@ public class indexRowSetContainer extends kelondroRowSet implements indexContain
return newContainer;
}
/**
 * Builds a compact abstract of the url hashes stored in this container.
 * Every url hash is split at position 6: the tail (from index 6 on) is used as
 * grouping key, the head (first 6 characters) is appended to the list of that key.
 * The result has the form {key:head1head2...,key:head...} with keys in sorted order.
 *
 * @param maxtime time limit in milliseconds; a negative value means no limit.
 *                When the limit is exceeded, collecting and/or writing stops early
 *                and the abstract built so far is returned (still closed with '}').
 * @return a buffer containing the abstract
 */
public serverByteBuffer compressedIndex(long maxtime) {
    // collect references according to domains
    long timeout = (maxtime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxtime;
    TreeMap doms = new TreeMap();
    synchronized(this) {
        Iterator i = entries();
        indexEntry iEntry;
        String urlHash, dom;
        StringBuffer paths;
        while (i.hasNext()) {
            iEntry = (indexEntry) i.next();
            urlHash = iEntry.urlHash();
            dom = urlHash.substring(6);
            // one growing buffer per key avoids re-copying the whole path list on every hit
            paths = (StringBuffer) doms.get(dom);
            if (paths == null) {
                paths = new StringBuffer();
                doms.put(dom, paths);
            }
            paths.append(urlHash.substring(0, 6));
            if (System.currentTimeMillis() > timeout) break;
        }
    }
    // construct a result string
    serverByteBuffer bb = new serverByteBuffer(this.size() * indexURLEntry.urlEntryRow.width(0) / 2);
    bb.append('{');
    Iterator i = doms.entrySet().iterator();
    Map.Entry entry;
    while (i.hasNext()) {
        entry = (Map.Entry) i.next();
        bb.append((String) entry.getKey());
        bb.append(':');
        bb.append(((StringBuffer) entry.getValue()).toString());
        // check the time limit before writing the separator so that no dangling ',' is left behind
        if (System.currentTimeMillis() > timeout) break;
        if (i.hasNext()) bb.append(',');
    }
    bb.append('}');
    bb.trim();
    return bb;
}
public void setWordHash(String newWordHash) {
this.wordHash = newWordHash;
}
@ -94,15 +134,18 @@ public class indexRowSetContainer extends kelondroRowSet implements indexContain
public int add(indexContainer c, long maxTime) {
// returns the number of new elements
long startTime = System.currentTimeMillis();
long timeout = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime;
if (c == null) return 0;
int x = 0;
synchronized (c) {
Iterator i = c.entries();
while ((i.hasNext()) && ((maxTime < 0) || ((startTime + maxTime) > System.currentTimeMillis()))) {
while (i.hasNext()) {
try {
if (addi((indexEntry) i.next())) x++;
} catch (ConcurrentModificationException e) {}
} catch (ConcurrentModificationException e) {
e.printStackTrace();
}
if (System.currentTimeMillis() > timeout) break;
}
}
this.lastTimeWrote = java.lang.Math.max(this.lastTimeWrote, c.updated());
@ -202,7 +245,7 @@ public class indexRowSetContainer extends kelondroRowSet implements indexContain
return c;
}
public static indexContainer joinContainer(Set containers, long time, int maxDistance) {
public static indexContainer joinContainer(Collection containers, long time, int maxDistance) {
long stamp = System.currentTimeMillis();

View File

@ -25,6 +25,7 @@
package de.anomic.kelondro;
import java.util.Iterator;
import java.util.Set;
public class kelondroRowCollection {
@ -293,6 +294,18 @@ public class kelondroRowCollection {
}
}
public void select(Set keys) {
    // keeps only those rows whose column-0 string is contained in keys and removes all others;
    // a null key set leaves the collection untouched
    if (keys == null) return;
    synchronized (this) {
        for (Iterator rowIter = rows(); rowIter.hasNext();) {
            kelondroRow.Entry current = (kelondroRow.Entry) rowIter.next();
            if (keys.contains(current.getColString(0, null))) continue;
            rowIter.remove();
        }
    }
}
protected final void sort(kelondroOrder newOrder, int newColumn) {
if ((this.sortOrder == null) ||

View File

@ -42,9 +42,11 @@
package de.anomic.plasma;
import java.util.Collection;
import java.util.Iterator;
import java.util.Set;
import java.util.Map;
import java.util.HashSet;
import java.util.Set;
import de.anomic.kelondro.kelondroException;
import de.anomic.server.logging.serverLog;
@ -131,7 +133,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
searchThreads = yacySearch.searchHashes(query.queryHashes, query.prefer, query.urlMask, query.maxDistance, urlStore, rcGlobal, fetchpeers, plasmaSwitchboard.urlBlacklist, snippetCache, profileGlobal, ranking);
// meanwhile do a local search
indexContainer rcLocal = localSearchJoin(localSearchContainers());
indexContainer rcLocal = localSearchJoin(localSearchContainers(null).values());
plasmaSearchResult localResult = orderLocal(rcLocal, timeout);
// catch up global results:
@ -161,7 +163,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
lastEvent = this;
return result;
} else {
indexContainer rcLocal = localSearchJoin(localSearchContainers());
indexContainer rcLocal = localSearchJoin(localSearchContainers(null).values());
plasmaSearchResult result = order(rcLocal);
result.localContributions = rcLocal.size();
@ -173,13 +175,14 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
}
}
public Set localSearchContainers() {
public Map localSearchContainers(Set urlselection) {
// search for the set of hashes and return the set of containers containing the search result
// retrieve entities that belong to the hashes
profileLocal.startTimer();
Set containers = wordIndex.getContainers(
Map containers = wordIndex.getContainers(
query.queryHashes,
urlselection,
true,
true,
profileLocal.getTargetTime(plasmaSearchTimingProfile.PROCESS_COLLECTION));
@ -190,7 +193,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
return containers;
}
public indexContainer localSearchJoin(Set containers) {
public indexContainer localSearchJoin(Collection containers) {
// join a search result and return the joincount (number of pages after join)
// since this is a conjunction we return an empty entity if any word is not known

View File

@ -49,6 +49,7 @@ package de.anomic.plasma;
import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.HashSet;
@ -321,11 +322,11 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI {
return condenser.RESULT_SIMI_WORDS;
}
public indexContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxTime) {
public indexContainer getContainer(String wordHash, Set urlselection, boolean deleteIfEmpty, long maxTime) {
long start = System.currentTimeMillis();
// get from cache
indexContainer container = ramCache.getContainer(wordHash, true, -1);
indexContainer container = ramCache.getContainer(wordHash, urlselection, true, -1);
// We must not use the container from cache to store everything we find,
// as that container remains linked to in the cache and might be changed later
@ -336,18 +337,18 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI {
// get from collection index
if (useCollectionIndex) {
if (container == null) {
container = collections.getContainer(wordHash, true, (maxTime < 0) ? -1 : maxTime);
container = collections.getContainer(wordHash, urlselection, true, (maxTime < 0) ? -1 : maxTime);
} else {
container.add(collections.getContainer(wordHash, true, (maxTime < 0) ? -1 : maxTime), -1);
container.add(collections.getContainer(wordHash, urlselection, true, (maxTime < 0) ? -1 : maxTime), -1);
}
}
// get from assortments
if (container == null) {
container = assortmentCluster.getContainer(wordHash, true, (maxTime < 0) ? -1 : maxTime);
container = assortmentCluster.getContainer(wordHash, urlselection, true, (maxTime < 0) ? -1 : maxTime);
} else {
// add containers from assortment cluster
container.add(assortmentCluster.getContainer(wordHash, true, (maxTime < 0) ? -1 : maxTime), -1);
container.add(assortmentCluster.getContainer(wordHash, urlselection, true, (maxTime < 0) ? -1 : maxTime), -1);
}
// get from backend
@ -355,14 +356,14 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI {
maxTime = maxTime - (System.currentTimeMillis() - start);
if (maxTime < 0) maxTime = 100;
}
container.add(backend.getContainer(wordHash, deleteIfEmpty, (maxTime < 0) ? -1 : maxTime), -1);
container.add(backend.getContainer(wordHash, urlselection, deleteIfEmpty, (maxTime < 0) ? -1 : maxTime), -1);
return container;
}
public Set getContainers(Set wordHashes, boolean deleteIfEmpty, boolean interruptIfEmpty, long maxTime) {
public Map getContainers(Set wordHashes, Set urlselection, boolean deleteIfEmpty, boolean interruptIfEmpty, long maxTime) {
// retrieve entities that belong to the hashes
HashSet containers = new HashSet();
HashMap containers = new HashMap();
String singleHash;
indexContainer singleContainer;
Iterator i = wordHashes.iterator();
@ -378,12 +379,12 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI {
singleHash = (String) i.next();
// retrieve index
singleContainer = getContainer(singleHash, deleteIfEmpty, (maxTime < 0) ? -1 : remaining / (wordHashes.size() - containers.size()));
singleContainer = getContainer(singleHash, urlselection, deleteIfEmpty, (maxTime < 0) ? -1 : remaining / (wordHashes.size() - containers.size()));
// check result
if (((singleContainer == null) || (singleContainer.size() == 0)) && (interruptIfEmpty)) return new HashSet();
if (((singleContainer == null) || (singleContainer.size() == 0)) && (interruptIfEmpty)) return new HashMap();
containers.add(singleContainer);
containers.put(singleHash, singleContainer);
}
return containers;
}

View File

@ -295,17 +295,17 @@ public final class plasmaWordIndexAssortmentCluster extends indexAbstractRI impl
return initialSize - urlHashes.size();
}
public indexContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxTime) {
public indexContainer getContainer(String wordHash, Set urlselection, boolean deleteIfEmpty, long maxTime) {
// collect all records from all the assortments and return them
indexContainer buffer, record = new indexRowSetContainer(wordHash);
long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime;
long remainingTime;
long timeout = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime;
for (int i = 0; i < clusterCount; i++) {
buffer = assortments[i].get(wordHash);
remainingTime = limitTime - System.currentTimeMillis();
if (0 > remainingTime) break;
if (buffer != null) record.add(buffer, remainingTime);
if (buffer != null) {
buffer.select(urlselection);
record.add(buffer, -1);
}
if (System.currentTimeMillis() > timeout) break;
}
return record;
}

View File

@ -99,7 +99,7 @@ public class plasmaWordIndexFileCluster extends indexAbstractRI implements index
}
public Object next() {
return getContainer((String) wordIterator.next(), true, 100);
return getContainer((String) wordIterator.next(), null, true, 100);
}
public void remove() {
@ -225,7 +225,7 @@ public class plasmaWordIndexFileCluster extends indexAbstractRI implements index
}
}
public synchronized indexContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxTime) {
public synchronized indexContainer getContainer(String wordHash, Set urlselection, boolean deleteIfEmpty, long maxTime) {
long start = System.currentTimeMillis();
if ((maxTime < 0) || (maxTime > 60000)) maxTime=60000; // maximum is one minute
if (plasmaWordIndexFile.wordHash2path(databaseRoot, wordHash).exists()) {
@ -235,7 +235,7 @@ public class plasmaWordIndexFileCluster extends indexAbstractRI implements index
Iterator i = entity.elements(true);
while ((i.hasNext()) && (System.currentTimeMillis() < (start + maxTime))) {
entry = (indexEntry) i.next();
container.add(entry);
if ((urlselection == null) || (urlselection.contains(entry.urlHash()))) container.add(entry);
}
return container;
} else {