changes towards better join-search

- added generation of a compressed index within remote peers during global search
- added selection of specific urls within remote peers during secondary global search

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2539 6c8d7289-2bf4-0310-a012-ef5d649a1542
2024-09-19 00:01:41 +02:00 · 2006-09-10 22:36:47 +00:00 · 2006-09-10 22:36:47 +00:00 · 74d1dea30b
commit 74d1dea30b
parent 4a494464af
14 changed files with 156 additions and 53 deletions
--- a/htroot/IndexControl_p.java
+++ b/htroot/IndexControl_p.java
@ -149,7 +149,7 @@ public class IndexControl_p {
            if (delurl || delurlref) {
                // generate an urlx array
                indexContainer index = null;
-                index = switchboard.wordIndex.getContainer(keyhash, true, -1);
+                index = switchboard.wordIndex.getContainer(keyhash, null, true, -1);
                Iterator en = index.entries();
                int i = 0;
                urlx = new String[index.size()];
@ -252,7 +252,7 @@ public class IndexControl_p {
            indexContainer index;
            String result;
            long starttime = System.currentTimeMillis();
-            index = switchboard.wordIndex.getContainer(keyhash, true, -1);
+            index = switchboard.wordIndex.getContainer(keyhash, null, true, -1);
            // built urlCache
            Iterator urlIter = index.entries();
            HashMap knownURLs = new HashMap();
@ -424,7 +424,7 @@ public class IndexControl_p {
        // search for a word hash and generate a list of url links
        indexContainer index = null;
        try {
-            index = switchboard.wordIndex.getContainer(keyhash, true, -1);
+            index = switchboard.wordIndex.getContainer(keyhash, null, true, -1);

            final StringBuffer result = new StringBuffer(1024);
            if (index.size() == 0) {
--- a/htroot/yacy/search.html
+++ b/htroot/yacy/search.html
@ -8,4 +8,5 @@ references=#[references]#
 joincount=#[joincount]#
 count=#[linkcount]#
 #[links]#
-#[indexcount]#
+#[indexcount]#
+#[indexabstract]#
--- a/htroot/yacy/search.java
+++ b/htroot/yacy/search.java
@ -49,6 +49,7 @@

 import java.util.HashSet;
 import java.util.Iterator;
+import java.util.Map;
 import java.util.Set;

 import de.anomic.http.httpHeader;
@ -81,7 +82,8 @@ public final class search {
        final String  oseed  = post.get("myseed", ""); // complete seed of the requesting peer
 //      final String  youare = post.get("youare", ""); // seed hash of the target peer, used for testing network stability
        final String  key    = post.get("key", "");    // transmission key for response
-        final String  query  = post.get("query", "");  // a string of word hashes
+        final String  query  = post.get("query", "");  // a string of word hashes that shall be searched and combined
+        final String  urls   = post.get("urls", "");   // a string of url hashes that are preselected for the search: no other may be returned
 //      final String  fwdep  = post.get("fwdep", "");  // forward depth. if "0" then peer may NOT ask another peer for more results
 //      final String  fwden  = post.get("fwden", "");  // forward deny, a list of seed hashes. They may NOT be target of forward hopping
        final long    duetime= post.getLong("duetime", 3000);
@ -117,34 +119,64 @@ public final class search {

        yacyCore.log.logInfo("INIT HASH SEARCH: " + squery.queryHashes + " - " + squery.wantedResults + " links");
        long timestamp1 = System.currentTimeMillis();
+        
+        // prepare a search profile
        plasmaSearchRankingProfile rankingProfile = new plasmaSearchRankingProfile(new String[]{plasmaSearchRankingProfile.ORDER_YBR, plasmaSearchRankingProfile.ORDER_DATE, plasmaSearchRankingProfile.ORDER_QUALITY});
        plasmaSearchTimingProfile localTiming  = new plasmaSearchTimingProfile(squery.maximumTime, squery.wantedResults);
        plasmaSearchTimingProfile remoteTiming = null;
-        plasmaSearchEvent theSearch = new plasmaSearchEvent(squery, rankingProfile, localTiming, remoteTiming, true, yacyCore.log, sb.wordIndex, sb.urlPool.loadedURL, sb.snippetCache);
-        Set containers = theSearch.localSearchContainers();
-        indexContainer localResults = theSearch.localSearchJoin(containers);
-        int joincount = localResults.size();
-        plasmaSearchResult acc = theSearch.order(localResults);

-        // set statistic details of search result
-        prop.put("joincount", Integer.toString(joincount));
+        // retrieve index containers from search request
+        plasmaSearchEvent theSearch = new plasmaSearchEvent(squery, rankingProfile, localTiming, remoteTiming, true, yacyCore.log, sb.wordIndex, sb.urlPool.loadedURL, sb.snippetCache);
+        // parse the optional "urls" parameter: a concatenation of 12-character url hashes.
+        // urlselection stays null (= no restriction) if the parameter is empty or malformed.
+        Set urlselection = null;
+        if ((urls.length() > 0) && (urls.length() % 12 == 0)) {
+            // the set must be created before hashes can be added to it
+            urlselection = new HashSet();
+            // step through the string in slices of 12 characters: [i, i + 12)
+            for (int i = 0; i < urls.length(); i += 12) urlselection.add(urls.substring(i, i + 12));
+        }
+        Map containers = theSearch.localSearchContainers(urlselection);
+        
+        // set statistic details of search result and find best result index set
+        String maxcounthash = null;
        if (containers == null) {
            prop.put("indexcount", "");
        } else {
-            Iterator ci = containers.iterator();
+            Iterator ci = containers.entrySet().iterator();
            StringBuffer indexcount = new StringBuffer();
+            Map.Entry entry;
+            String wordhash;
+            int maxcount = -1;
            while (ci.hasNext()) {
-                indexContainer container = (indexContainer) ci.next();
+                entry = (Map.Entry) ci.next();
+                wordhash = (String) entry.getKey();
+                indexContainer container = (indexContainer) entry.getValue();
+                if (container.size() > maxcount) {
+                    // keep the running maximum up to date; without this every container
+                    // would beat the initial -1 and the last one iterated would be chosen
+                    maxcount = container.size();
+                    maxcounthash = wordhash;
+                }
                indexcount.append("indexcount.").append(container.getWordHash()).append('=').append(Integer.toString(container.size())).append(serverCore.crlfString);
            }
            prop.put("indexcount", new String(indexcount));
        }
        
+        // generate compressed index for maxcounthash
+        // this is not needed if the search is restricted to specific urls, because it is a re-search
+        if ((maxcounthash == null) || (urls.length() != 0)) {
+            prop.put("indexabstract","");
+        } else {
+            String indexabstract = "indexabstract." + maxcounthash + "=" + ((indexContainer) containers.get(maxcounthash)).compressedIndex(1000);
+            yacyCore.log.logFine("DEBUG HASH SEARCH: " + indexabstract);
+            prop.put("indexabstract", indexabstract);
+        }
        
+        // join and order the result
+        indexContainer localResults = theSearch.localSearchJoin(containers.values());
+        int joincount = localResults.size();
+        prop.put("joincount", Integer.toString(joincount));
+        plasmaSearchResult acc = theSearch.order(localResults);
+
+        // prepare result
        if ((joincount == 0) || (acc == null)) {
+            
+            // no results
            prop.put("links", "");
            prop.put("linkcount", "0");
            prop.put("references", "");
+
        } else {
            
            // result is a List of urlEntry elements
--- a/source/de/anomic/index/indexAbstractRI.java
+++ b/source/de/anomic/index/indexAbstractRI.java
@ -36,7 +36,7 @@ public abstract class indexAbstractRI implements indexRI {
    }
    
    public long getUpdateTime(String wordHash) {
-        indexContainer entries = getContainer(wordHash, false, -1);
+        indexContainer entries = getContainer(wordHash, null, false, -1);
        if (entries == null) return 0;
        return entries.updated();
    }
--- a/source/de/anomic/index/indexCollectionRI.java
+++ b/source/de/anomic/index/indexCollectionRI.java
@ -108,10 +108,11 @@ public class indexCollectionRI extends indexAbstractRI implements indexRI {

    }
     
-    public indexContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxtime) {
+    public indexContainer getContainer(String wordHash, Set urlselection, boolean deleteIfEmpty, long maxtime) {
        try {
            kelondroRowSet collection = collectionIndex.get(wordHash.getBytes(), deleteIfEmpty);
-            if (collection == null) return null;
+            if (collection != null) collection.select(urlselection); // no collection for this word: nothing to restrict
+            if ((collection == null) || (collection.size() == 0)) return null;
            return new indexRowSetContainer(wordHash, collection);
        } catch (IOException e) {
            return null;
--- a/source/de/anomic/index/indexContainer.java
+++ b/source/de/anomic/index/indexContainer.java
@ -32,6 +32,7 @@ import java.util.Iterator;
 import java.util.Set;

 import de.anomic.kelondro.kelondroOrder;
+import de.anomic.server.serverByteBuffer;

 public interface indexContainer {
    
@ -43,7 +44,9 @@ public interface indexContainer {
    
    public void setWordHash(String newWordHash);
    public String getWordHash();
-
+    public serverByteBuffer compressedIndex(long maxtime);
+    public void select(Set urlselection);
+    
    public void setOrdering(kelondroOrder newOrder, int newColumn);
    public kelondroOrder order();
    public int orderColumn();
--- a/source/de/anomic/index/indexRAMCacheRI.java
+++ b/source/de/anomic/index/indexRAMCacheRI.java
@ -386,8 +386,14 @@ public final class indexRAMCacheRI extends indexAbstractRI implements indexRI {
        return (((long) intTime) * (long) 1000) + initTime;
    }
    
-    public indexContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxtime_dummy) {
-        return (indexContainer) wCache.get(wordHash);
+    public indexContainer getContainer(String wordHash, Set urlselection, boolean deleteIfEmpty, long maxtime_dummy) {
+        indexContainer cached = (indexContainer) wCache.get(wordHash);
+        // word not in the cache, or no url restriction requested: return the lookup result as it is (may be null)
+        if ((cached == null) || (urlselection == null)) return cached;
+        // apply the restriction to a clone instead of the container object that is held in the cache
+        indexContainer ic = cached.topLevelClone();
+        ic.select(urlselection);
+        return ic;
    }

    public indexContainer deleteContainer(String wordHash) {
--- a/source/de/anomic/index/indexRI.java
+++ b/source/de/anomic/index/indexRI.java
@ -53,7 +53,7 @@ public interface indexRI {
        
    public long getUpdateTime(String wordHash);
    
-    public indexContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxtime);
+    public indexContainer getContainer(String wordHash, Set urlselection, boolean deleteIfEmpty, long maxtime);
    public indexContainer deleteContainer(String wordHash);
    
    public boolean removeEntry(String wordHash, String urlHash, boolean deleteComplete);
--- a/source/de/anomic/index/indexRowSetContainer.java
+++ b/source/de/anomic/index/indexRowSetContainer.java
@ -27,9 +27,11 @@
 package de.anomic.index;

 import java.lang.reflect.Method;
+import java.util.Collection;
 import java.util.ConcurrentModificationException;
 import java.util.Iterator;
 import java.util.Set;
+import java.util.Map;
 import java.util.TreeMap;

 import de.anomic.kelondro.kelondroBase64Order;
@ -37,6 +39,7 @@ import de.anomic.kelondro.kelondroNaturalOrder;
 import de.anomic.kelondro.kelondroOrder;
 import de.anomic.kelondro.kelondroRow;
 import de.anomic.kelondro.kelondroRowSet;
+import de.anomic.server.serverByteBuffer;

 public class indexRowSetContainer extends kelondroRowSet implements indexContainer {

@ -64,6 +67,43 @@ public class indexRowSetContainer extends kelondroRowSet implements indexContain
        return newContainer;
    }
    
+    public serverByteBuffer compressedIndex(long maxtime) {
+        // Produces a compact listing of the url hashes of this container in the form
+        // {<key>:<parts>,<key>:<parts>,...}
+        // <key> is a url hash from position 6 on (used here as the domain key), <parts> is the
+        // concatenation of the first 6 characters of all url hashes that share this key.
+        // maxtime is a time limit in milliseconds, a negative value means no limit. When the
+        // limit is exceeded, collecting or writing stops early and the listing is incomplete.
+        final long deadline = (maxtime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxtime;
+        // step 1: group the leading hash parts by their domain key
+        final TreeMap partsByDom = new TreeMap();
+        synchronized(this) {
+            for (Iterator refs = entries(); refs.hasNext();) {
+                final String urlHash = ((indexEntry) refs.next()).urlHash();
+                final String domKey = urlHash.substring(6);
+                final String part = urlHash.substring(0, 6);
+                final String collected = (String) partsByDom.get(domKey);
+                partsByDom.put(domKey, (collected == null) ? part : collected + part);
+                if (System.currentTimeMillis() > deadline) break;
+            }
+        }
+        // step 2: write the groups into the result buffer, in the key order of the TreeMap
+        final serverByteBuffer out = new serverByteBuffer(this.size() * indexURLEntry.urlEntryRow.width(0) / 2);
+        out.append('{');
+        for (Iterator groups = partsByDom.entrySet().iterator(); groups.hasNext();) {
+            final Map.Entry group = (Map.Entry) groups.next();
+            out.append((String) group.getKey());
+            out.append(':');
+            out.append((String) group.getValue());
+            // on timeout the listing is closed right after the current group
+            if (System.currentTimeMillis() > deadline) break;
+            // separator only between groups, not after the last one
+            if (groups.hasNext()) out.append(',');
+        }
+        out.append('}');
+        out.trim();
+        return out;
+    }
+    
    public void setWordHash(String newWordHash) {
        this.wordHash = newWordHash;
    }
@ -94,15 +134,18 @@ public class indexRowSetContainer extends kelondroRowSet implements indexContain

    public int add(indexContainer c, long maxTime) {
        // returns the number of new elements
-        long startTime = System.currentTimeMillis();
+        long timeout = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime;
        if (c == null) return 0;
        int x = 0;
        synchronized (c) {
            Iterator i = c.entries();
-            while ((i.hasNext()) && ((maxTime < 0) || ((startTime + maxTime) > System.currentTimeMillis()))) {
+            while (i.hasNext()) {
                try {
                    if (addi((indexEntry) i.next())) x++;
-                } catch (ConcurrentModificationException e) {}
+                } catch (ConcurrentModificationException e) {
+                    e.printStackTrace();
+                }
+                if (System.currentTimeMillis() > timeout) break;
            }
        }
        this.lastTimeWrote = java.lang.Math.max(this.lastTimeWrote, c.updated());
@ -202,7 +245,7 @@ public class indexRowSetContainer extends kelondroRowSet implements indexContain
        return c;
    }
    
-    public static indexContainer joinContainer(Set containers, long time, int maxDistance) {
+    public static indexContainer joinContainer(Collection containers, long time, int maxDistance) {
        
        long stamp = System.currentTimeMillis();
        
--- a/source/de/anomic/kelondro/kelondroRowCollection.java
+++ b/source/de/anomic/kelondro/kelondroRowCollection.java
@ -25,6 +25,7 @@
 package de.anomic.kelondro;

 import java.util.Iterator;
+import java.util.Set;

 public class kelondroRowCollection {

@ -293,6 +294,18 @@ public class kelondroRowCollection {
        }
    }
    
+    public void select(Set keys) {
+        // keeps only those rows whose column 0 (read as string) is contained in keys and
+        // removes all others; if keys is null, the collection is left untouched
+        if (keys != null) {
+            synchronized (this) {
+                for (Iterator rowIter = rows(); rowIter.hasNext();) {
+                    final kelondroRow.Entry candidate = (kelondroRow.Entry) rowIter.next();
+                    final boolean wanted = keys.contains(candidate.getColString(0, null));
+                    if (!wanted) rowIter.remove();
+                }
+            }
+        }
+    }
    
    protected final void sort(kelondroOrder newOrder, int newColumn) {
        if ((this.sortOrder == null) ||
--- a/source/de/anomic/plasma/plasmaSearchEvent.java
+++ b/source/de/anomic/plasma/plasmaSearchEvent.java
@ -42,9 +42,11 @@

 package de.anomic.plasma;

+import java.util.Collection;
 import java.util.Iterator;
-import java.util.Set;
+import java.util.Map;
 import java.util.HashSet;
+import java.util.Set;

 import de.anomic.kelondro.kelondroException;
 import de.anomic.server.logging.serverLog;
@ -131,7 +133,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
                searchThreads = yacySearch.searchHashes(query.queryHashes, query.prefer, query.urlMask, query.maxDistance, urlStore, rcGlobal, fetchpeers, plasmaSwitchboard.urlBlacklist, snippetCache, profileGlobal, ranking);

                // meanwhile do a local search
-                indexContainer rcLocal = localSearchJoin(localSearchContainers());
+                indexContainer rcLocal = localSearchJoin(localSearchContainers(null).values());
                plasmaSearchResult localResult = orderLocal(rcLocal, timeout);
                
                // catch up global results:
@ -161,7 +163,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
                lastEvent = this;
                return result;
            } else {
-                indexContainer rcLocal = localSearchJoin(localSearchContainers());
+                indexContainer rcLocal = localSearchJoin(localSearchContainers(null).values());
                plasmaSearchResult result = order(rcLocal);
                result.localContributions = rcLocal.size();

@ -173,13 +175,14 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
        }
    }

-    public Set localSearchContainers() {
+    public Map localSearchContainers(Set urlselection) {
        // search for the set of hashes and return the set of containers containing the seach result

        // retrieve entities that belong to the hashes
        profileLocal.startTimer();
-        Set containers = wordIndex.getContainers(
+        Map containers = wordIndex.getContainers(
                        query.queryHashes,
+                        urlselection,
                        true,
                        true,
                        profileLocal.getTargetTime(plasmaSearchTimingProfile.PROCESS_COLLECTION));
@ -190,7 +193,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
        return containers;
    }
    
-    public indexContainer localSearchJoin(Set containers) {
+    public indexContainer localSearchJoin(Collection containers) {
        // join a search result and return the joincount (number of pages after join)

        // since this is a conjunction we return an empty entity if any word is not known
--- a/source/de/anomic/plasma/plasmaWordIndex.java
+++ b/source/de/anomic/plasma/plasmaWordIndex.java
@ -49,6 +49,7 @@ package de.anomic.plasma;

 import java.io.File;
 import java.io.IOException;
+import java.util.HashMap;
 import java.util.Iterator;
 import java.util.Map;
 import java.util.HashSet;
@ -321,11 +322,11 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI {
        return condenser.RESULT_SIMI_WORDS;
    }

-    public indexContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxTime) {
+    public indexContainer getContainer(String wordHash, Set urlselection, boolean deleteIfEmpty, long maxTime) {
        long start = System.currentTimeMillis();

            // get from cache
-            indexContainer container = ramCache.getContainer(wordHash, true, -1);
+            indexContainer container = ramCache.getContainer(wordHash, urlselection, true, -1);

            // We must not use the container from cache to store everything we find,
            // as that container remains linked to in the cache and might be changed later
@ -336,18 +337,18 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI {
            // get from collection index
            if (useCollectionIndex) {
                if (container == null) {
-                    container = collections.getContainer(wordHash, true, (maxTime < 0) ? -1 : maxTime);
+                    container = collections.getContainer(wordHash, urlselection, true, (maxTime < 0) ? -1 : maxTime);
                } else {
-                    container.add(collections.getContainer(wordHash, true, (maxTime < 0) ? -1 : maxTime), -1);
+                    container.add(collections.getContainer(wordHash, urlselection, true, (maxTime < 0) ? -1 : maxTime), -1);
                }
            }
        
            // get from assortments
            if (container == null) {
-                container = assortmentCluster.getContainer(wordHash, true, (maxTime < 0) ? -1 : maxTime);
+                container = assortmentCluster.getContainer(wordHash, urlselection, true, (maxTime < 0) ? -1 : maxTime);
            } else {
                // add containers from assortment cluster
-                container.add(assortmentCluster.getContainer(wordHash, true, (maxTime < 0) ? -1 : maxTime), -1);
+                container.add(assortmentCluster.getContainer(wordHash, urlselection, true, (maxTime < 0) ? -1 : maxTime), -1);
            }
        
            // get from backend
@ -355,14 +356,14 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI {
                maxTime = maxTime - (System.currentTimeMillis() - start);
                if (maxTime < 0) maxTime = 100;
            }
-            container.add(backend.getContainer(wordHash, deleteIfEmpty, (maxTime < 0) ? -1 : maxTime), -1);
+            container.add(backend.getContainer(wordHash, urlselection, deleteIfEmpty, (maxTime < 0) ? -1 : maxTime), -1);
            return container;
    }

-    public Set getContainers(Set wordHashes, boolean deleteIfEmpty, boolean interruptIfEmpty, long maxTime) {
+    public Map getContainers(Set wordHashes, Set urlselection, boolean deleteIfEmpty, boolean interruptIfEmpty, long maxTime) {
        
        // retrieve entities that belong to the hashes
-        HashSet containers = new HashSet();
+        HashMap containers = new HashMap();
        String singleHash;
        indexContainer singleContainer;
            Iterator i = wordHashes.iterator();
@ -378,12 +379,12 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI {
                singleHash = (String) i.next();
            
                // retrieve index
-                singleContainer = getContainer(singleHash, deleteIfEmpty, (maxTime < 0) ? -1 : remaining / (wordHashes.size() - containers.size()));
+                singleContainer = getContainer(singleHash, urlselection, deleteIfEmpty, (maxTime < 0) ? -1 : remaining / (wordHashes.size() - containers.size()));
            
                // check result
-                if (((singleContainer == null) || (singleContainer.size() == 0)) && (interruptIfEmpty)) return new HashSet();
+                if (((singleContainer == null) || (singleContainer.size() == 0)) && (interruptIfEmpty)) return new HashMap();
            
-                containers.add(singleContainer);
+                containers.put(singleHash, singleContainer);
            }
        return containers;
    }
--- a/source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java
+++ b/source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java
@ -295,17 +295,17 @@ public final class plasmaWordIndexAssortmentCluster extends indexAbstractRI impl
        return initialSize - urlHashes.size();
    }

-    public indexContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxTime) {
+    public indexContainer getContainer(String wordHash, Set urlselection, boolean deleteIfEmpty, long maxTime) {
        // collect all records from all the assortments and return them
        indexContainer buffer, record = new indexRowSetContainer(wordHash);
-        long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime;
-        long remainingTime;
+        long timeout = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime;
        for (int i = 0; i < clusterCount; i++) {
            buffer = assortments[i].get(wordHash);
-            remainingTime = limitTime - System.currentTimeMillis();
-            if (0 > remainingTime) break;
-            if (buffer != null) record.add(buffer, remainingTime);
-            
+            if (buffer != null) {
+                buffer.select(urlselection);
+                record.add(buffer, -1);
+            }
+            if (System.currentTimeMillis() > timeout) break;
        }
        return record;
    }
--- a/source/de/anomic/plasma/plasmaWordIndexFileCluster.java
+++ b/source/de/anomic/plasma/plasmaWordIndexFileCluster.java
@ -99,7 +99,7 @@ public class plasmaWordIndexFileCluster extends indexAbstractRI implements index
        }

        public Object next() {
-            return getContainer((String) wordIterator.next(), true, 100);
+            return getContainer((String) wordIterator.next(), null, true, 100);
        }

        public void remove() {
@ -225,7 +225,7 @@ public class plasmaWordIndexFileCluster extends indexAbstractRI implements index
        }
    }

-    public synchronized indexContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxTime) {
+    public synchronized indexContainer getContainer(String wordHash, Set urlselection, boolean deleteIfEmpty, long maxTime) {
        long start = System.currentTimeMillis();
        if ((maxTime < 0) || (maxTime > 60000)) maxTime=60000; // maximum is one minute
        if (plasmaWordIndexFile.wordHash2path(databaseRoot, wordHash).exists()) {
@ -235,7 +235,7 @@ public class plasmaWordIndexFileCluster extends indexAbstractRI implements index
            Iterator i = entity.elements(true);
            while ((i.hasNext()) && (System.currentTimeMillis() < (start + maxTime))) {
                entry = (indexEntry) i.next();
-                container.add(entry);
+                if ((urlselection == null) || (urlselection.contains(entry.urlHash()))) container.add(entry);
            }
            return container;
        } else {