Replaced the RWI term counting method with one that computes the maximum size of the blobs that contribute to the RWI. Summing the blob sizes is incorrect and does not reflect the real size, because entries that occur in several blobs are counted once per blob. Truncating the size computation to the maximum of all blob sizes is also inexact, but far less wrong than the sum, which double-counts many RWI entries.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6064 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 2009-06-13 22:59:54 +00:00
parent 303ccda69f
commit 945777aa80
15 changed files with 49 additions and 34 deletions
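The reasoning behind the change can be shown with a small standalone sketch (plain Java, not YaCy code; the term sets are invented). When the same term occurs in several blobs, summing the per-blob counts tallies it once per blob, while the maximum of the per-blob counts never double-counts, at the price of underestimating the true number of distinct terms:

import java.util.*;

public class RwiCountSketch {
    public static void main(String[] args) {
        // invented per-blob term sets; in YaCy one term hash may occur in many blobs
        List<Set<String>> blobs = Arrays.<Set<String>>asList(
                new HashSet<String>(Arrays.asList("java", "index", "blob")),
                new HashSet<String>(Arrays.asList("java", "index")),
                new HashSet<String>(Arrays.asList("java", "search")));
        int sum = 0, max = 0;
        Set<String> distinct = new HashSet<String>();
        for (Set<String> blob : blobs) {
            sum += blob.size();               // double-counts "java" and "index"
            max = Math.max(max, blob.size()); // never double-counts, but underestimates
            distinct.addAll(blob);            // exact, but too expensive for disk-backed blobs
        }
        System.out.println(sum + " / " + max + " / " + distinct.size()); // prints 7 / 3 / 4
    }
}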

View File

@@ -94,7 +94,7 @@ public class IndexCleaner_p {
prop.put("rwidb_threadAlive", indexCleanerThread.isAlive() + "");
prop.put("rwidb_threadToString", indexCleanerThread.toString());
prop.putNum("rwidb_RWIcountstart", indexCleanerThread.rwiCountAtStart);
prop.putNum("rwidb_RWIcountnow", sb.indexSegment.termIndex().size());
prop.putNum("rwidb_RWIcountnow", sb.indexSegment.termIndex().sizesMax());
prop.put("rwidb_wordHashNow", (indexCleanerThread.wordHashNow == null) ? "NULL" : new String(indexCleanerThread.wordHashNow));
prop.put("rwidb_lastWordHash", (indexCleanerThread.lastWordHash == null) ? "null" : new String(indexCleanerThread.lastWordHash));
prop.putNum("rwidb_lastDeletionCounter", indexCleanerThread.lastDeletionCounter);

View File

@@ -343,7 +343,7 @@ public class IndexControlRWIs_p {
// insert constants
prop.putNum("wcount", sb.indexSegment.termIndex().size());
prop.putNum("wcount", sb.indexSegment.termIndex().sizesMax());
// return rewrite properties
return prop;
}

View File

@@ -181,7 +181,7 @@ public class IndexControlURLs_p {
// generate list
if (post.containsKey("urlhashsimilar")) {
try {
final Iterator<URLMetadataRow> entryIt = new RotateIterator<URLMetadataRow>(sb.indexSegment.urlMetadata().entries(true, urlhash), new String(Base64Order.zero((urlhash == null ? 0 : urlhash.length()))), sb.indexSegment.termIndex().size());
final Iterator<URLMetadataRow> entryIt = new RotateIterator<URLMetadataRow>(sb.indexSegment.urlMetadata().entries(true, urlhash), new String(Base64Order.zero((urlhash == null ? 0 : urlhash.length()))), sb.indexSegment.termIndex().sizesMax());
final StringBuilder result = new StringBuilder("Sequential List of URL-Hashes:<br />");
URLMetadataRow entry;
int i = 0;

View File

@@ -106,7 +106,7 @@ public final class IndexImport_p {
}
}
prop.putNum("wcount", switchboard.indexSegment.termIndex().size());
prop.putNum("wcount", switchboard.indexSegment.termIndex().sizesMax());
prop.putNum("ucount", switchboard.indexSegment.urlMetadata().size());
/*

View File

@@ -55,7 +55,7 @@ public class IndexShare_p {
prop.put("wordfreq", switchboard.getConfigLong("defaultWordReceiveFrequency",10));
prop.put("dtable", "");
prop.put("rtable", "");
prop.putNum("wcount", switchboard.indexSegment.termIndex().size());
prop.putNum("wcount", switchboard.indexSegment.termIndex().sizesMax());
prop.putNum("ucount", switchboard.indexSegment.urlMetadata().size());
return prop; // be save
}
@@ -68,7 +68,7 @@ public class IndexShare_p {
}
// insert constants
prop.putNum("wcount", switchboard.indexSegment.termIndex().size());
prop.putNum("wcount", switchboard.indexSegment.termIndex().sizesMax());
prop.putNum("ucount", switchboard.indexSegment.urlMetadata().size());
// return rewrite properties

View File

@@ -41,7 +41,7 @@ public class PerformanceGraph {
final int width = post.getInt("width", 660);
final int height = post.getInt("height", 240);
return plasmaProfiling.performanceGraph(width, height, sb.indexSegment.urlMetadata().size() + " URLS / " + sb.indexSegment.termIndex().getBackendSize() + " WORDS IN INDEX / " + sb.indexSegment.termIndex().getBufferSize() + " WORDS IN CACHE");
return plasmaProfiling.performanceGraph(width, height, sb.indexSegment.urlMetadata().size() + " URLS / " + sb.indexSegment.termIndex().sizesMax() + " WORDS IN INDEX / " + sb.indexSegment.termIndex().getBufferSize() + " WORDS IN CACHE");
}
}

View File

@@ -42,7 +42,7 @@ public class queues_p {
prop.putNum("indexingSize", sb.getThread(plasmaSwitchboardConstants.INDEXER).getJobCount() + sb.crawler.queuePreStack.getActiveQueueSize());
prop.putNum("indexingMax", (int) sb.getConfigLong(plasmaSwitchboardConstants.INDEXER_SLOTS, 30));
prop.putNum("urlpublictextSize", sb.indexSegment.urlMetadata().size());
prop.putNum("rwipublictextSize", sb.indexSegment.termIndex().size());
prop.putNum("rwipublictextSize", sb.indexSegment.termIndex().sizesMax());
if ((sb.crawler.queuePreStack.size() == 0) && (sb.crawler.queuePreStack.getActiveQueueSize() == 0)) {
prop.put("list", "0"); //is empty
} else {

View File

@@ -92,7 +92,7 @@ public final class query {
if (obj.equals("rwicount")) {
// return the total number of available word indexes
prop.put("response", sb.indexSegment.termIndex().size());
prop.put("response", sb.indexSegment.termIndex().sizesMax());
return prop;
}

View File

@@ -414,6 +414,17 @@ public class BLOBArray implements BLOB {
return s;
}
/**
* ask for the number of blob entries in each blob of the blob array
* @return the number of entries in each blob
*/
public synchronized int[] sizes() {
int[] s = new int[blobs.size()];
int c = 0;
for (blobItem bi: blobs) s[c++] = bi.blob.size();
return s;
}
/**
* iterator over all keys
* @param up
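A caller-side sketch of the new sizes() method (the counts are invented): it returns one entry count per blob in the array, so the caller decides how to aggregate, e.g. taking the maximum instead of the double-counting sum:

// hypothetical per-blob entry counts as returned by array.sizes()
int[] sizes = new int[] {120000, 45000, 45000};
int sum = 0, max = 0;
for (int s : sizes) {
    sum += s;               // 210000: terms present in several blobs counted repeatedly
    max = Math.max(max, s); // 120000: lower bound on the distinct term count
}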

View File

@@ -87,12 +87,6 @@ public interface BufferedIndex<ReferenceType extends Reference> extends Index<Re
*/
public long getBufferSizeBytes();
/**
* get the size of the buffer backend
* @return number of word references
*/
public int getBackendSize();
/**
* get the size of the buffer content
* @return number of word references

View File

@@ -260,7 +260,22 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
}
public int size() {
return this.ram.size() + this.array.size();
throw new UnsupportedOperationException("an accumulated size of index entries would not reflect the real number of words, which cannot be computed easily");
}
public int[] sizes() {
int[] as = this.array.sizes();
int[] asr = new int[as.length + 1];
System.arraycopy(as, 0, asr, 0, as.length);
asr[as.length] = this.ram.size();
return asr;
}
public int sizesMax() {
int m = 0;
int[] s = sizes();
for (int i = 0; i < s.length; i++) if (s[i] > m) m = s[i];
return m;
}
public int minMem() {
@@ -312,10 +327,6 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
this.array.mountBLOBFile(blobFile);
}
public int getBackendSize() {
return this.array.size();
}
public long getBufferMaxAge() {
return System.currentTimeMillis();
}
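In effect, sizesMax() picks the largest single contribution among the array blobs and the RAM buffer; a minimal sketch with invented counts:

// sizes() returns array.sizes() with ram.size() appended, e.g.:
int[] s = new int[] {120000, 45000, 3000};
int m = 0;
for (int i = 0; i < s.length; i++) if (s[i] > m) m = s[i];
// m == 120000: the value now reported wherever termIndex().size() was used before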

View File

@@ -81,8 +81,8 @@ public final class ReferenceContainerArray<ReferenceType extends Reference> {
this.array.clear();
}
public int size() {
return (this.array == null) ? 0 : this.array.size();
public int[] sizes() {
return (this.array == null) ? new int[0] : this.array.sizes();
}
public ByteOrder ordering() {
@@ -330,7 +330,6 @@ public final class ReferenceContainerArray<ReferenceType extends Reference> {
// write a log
if (System.currentTimeMillis() - lastlog > 30000) {
System.out.println("CELL REFERENCE COLLECTION scanned " + count + " RWI index entries. ");
//Log.logInfo("COLLECTION INDEX REFERENCE COLLECTION", "scanned " + count + " RWI index entries. " + (((System.currentTimeMillis() - start) * (array.size() + array.free() - count) / count) / 60000) + " minutes remaining for this array");
lastlog = System.currentTimeMillis();
}
}

View File

@@ -333,7 +333,7 @@ public final class Segment {
public ReferenceCleaner(final byte[] startHash) {
this.startHash = startHash;
this.rwiCountAtStart = termIndex().size();
this.rwiCountAtStart = termIndex().sizesMax();
}
public void run() {

View File

@@ -37,7 +37,7 @@ public class plasmaDbImporter extends AbstractImporter implements Importer {
super("PLASMADB");
this.homeWordIndex = homeWI;
this.importWordIndex = importWI;
this.importStartSize = this.importWordIndex.termIndex().size();
this.importStartSize = this.importWordIndex.termIndex().sizesMax();
}
/**
@@ -94,8 +94,8 @@ public class plasmaDbImporter extends AbstractImporter implements Importer {
try {
this.log.logInfo("Importing DB from '" + this.importWordIndex.getLocation().getAbsolutePath() + "'");
this.log.logInfo("Home word index contains " + homeWordIndex.termIndex().size() + " words and " + homeWordIndex.urlMetadata().size() + " URLs.");
this.log.logInfo("Import word index contains " + this.importWordIndex.termIndex().size() + " words and " + this.importWordIndex.urlMetadata().size() + " URLs.");
this.log.logInfo("Home word index contains " + homeWordIndex.termIndex().sizesMax() + " words and " + homeWordIndex.urlMetadata().size() + " URLs.");
this.log.logInfo("Import word index contains " + this.importWordIndex.termIndex().sizesMax() + " words and " + this.importWordIndex.urlMetadata().size() + " URLs.");
final HashSet<String> unknownUrlBuffer = new HashSet<String>();
final HashSet<String> importedUrlBuffer = new HashSet<String>();
@@ -190,8 +190,8 @@ public class plasmaDbImporter extends AbstractImporter implements Importer {
"Speed: "+ 500*1000/duration + " word entities/s" +
" | Elapsed time: " + DateFormatter.formatInterval(getElapsedTime()) +
" | Estimated time: " + DateFormatter.formatInterval(getEstimatedTime()) + "\n" +
"Home Words = " + homeWordIndex.termIndex().size() +
" | Import Words = " + this.importWordIndex.termIndex().size());
"Home Words = " + homeWordIndex.termIndex().sizesMax() +
" | Import Words = " + this.importWordIndex.termIndex().sizesMax());
this.wordChunkStart = this.wordChunkEnd;
this.wordChunkStartHash = this.wordChunkEndHash;
}
@@ -213,8 +213,8 @@ public class plasmaDbImporter extends AbstractImporter implements Importer {
}
}
this.log.logInfo("Home word index contains " + homeWordIndex.termIndex().size() + " words and " + homeWordIndex.urlMetadata().size() + " URLs.");
this.log.logInfo("Import word index contains " + this.importWordIndex.termIndex().size() + " words and " + this.importWordIndex.urlMetadata().size() + " URLs.");
this.log.logInfo("Home word index contains " + homeWordIndex.termIndex().sizesMax() + " words and " + homeWordIndex.urlMetadata().size() + " URLs.");
this.log.logInfo("Import word index contains " + this.importWordIndex.termIndex().sizesMax() + " words and " + this.importWordIndex.urlMetadata().size() + " URLs.");
} catch (final Exception e) {
this.log.logSevere("Database import failed.",e);
e.printStackTrace();

View File

@@ -2036,8 +2036,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
if (indexSegment.urlMetadata().size() < 10) {
return "no DHT distribution: loadedURL.size() = " + indexSegment.urlMetadata().size();
}
if (indexSegment.termIndex().size() < 100) {
return "no DHT distribution: not enough words - wordIndex.size() = " + indexSegment.termIndex().size();
if (indexSegment.termIndex().sizesMax() < 100) {
return "no DHT distribution: not enough words - wordIndex.size() = " + indexSegment.termIndex().sizesMax();
}
if ((getConfig(plasmaSwitchboardConstants.INDEX_DIST_ALLOW_WHILE_CRAWLING, "false").equalsIgnoreCase("false")) && (crawlQueues.noticeURL.notEmptyLocal())) {
return "no DHT distribution: crawl in progress: noticeURL.stackSize() = " + crawlQueues.noticeURL.size() + ", sbQueue.size() = " + crawler.queuePreStack.size();
@@ -2149,7 +2149,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
peers.mySeed().put(yacySeed.LCOUNT, Integer.toString(indexSegment.urlMetadata().size())); // the number of links that the peer has stored (LURL's)
peers.mySeed().put(yacySeed.NCOUNT, Integer.toString(crawlQueues.noticeURL.size())); // the number of links that the peer has noticed, but not loaded (NURL's)
peers.mySeed().put(yacySeed.RCOUNT, Integer.toString(crawlQueues.noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT))); // the number of links that the peer provides for remote crawling (ZURL's)
peers.mySeed().put(yacySeed.ICOUNT, Integer.toString(indexSegment.termIndex().size())); // the minimum number of words that the peer has indexed (as it says)
peers.mySeed().put(yacySeed.ICOUNT, Integer.toString(indexSegment.termIndex().sizesMax())); // the minimum number of words that the peer has indexed (as it says)
peers.mySeed().put(yacySeed.SCOUNT, Integer.toString(peers.sizeConnected())); // the number of seeds that the peer has stored
peers.mySeed().put(yacySeed.CCOUNT, Double.toString(((int) ((peers.sizeConnected() + peers.sizeDisconnected() + peers.sizePotential()) * 60.0 / (uptime + 1.01)) * 100) / 100.0)); // the number of clients that the peer connects (as connects/hour)
peers.mySeed().put(yacySeed.VERSION, getConfig("version", ""));