Replaced the RWI term counting method with one that computes the maximum size of the blobs that contribute to the RWI. Summing the blob sizes is incorrect and does not reflect the real size, because entries that occur in several blobs are counted once per blob. Truncating the size computation to the maximum of all blob sizes is also inexact, but far less wrong than the sum, which double-counts many RWI entries.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6064 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 2009-06-13 22:59:54 +00:00
parent 303ccda69f
commit 945777aa80
15 changed files with 49 additions and 34 deletions
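The reasoning behind the change can be shown with a small standalone sketch (plain Java, not YaCy code; the term sets are invented). When the same term occurs in several blobs, summing the per-blob counts tallies it once per blob, while the maximum of the per-blob counts never double-counts, at the price of underestimating the true number of distinct terms:

import java.util.*;

public class RwiCountSketch {
    public static void main(String[] args) {
        // invented per-blob term sets; in YaCy one term hash may occur in many blobs
        List<Set<String>> blobs = Arrays.<Set<String>>asList(
                new HashSet<String>(Arrays.asList("java", "index", "blob")),
                new HashSet<String>(Arrays.asList("java", "index")),
                new HashSet<String>(Arrays.asList("java", "search")));
        int sum = 0, max = 0;
        Set<String> distinct = new HashSet<String>();
        for (Set<String> blob : blobs) {
            sum += blob.size();               // double-counts "java" and "index"
            max = Math.max(max, blob.size()); // never double-counts, but underestimates
            distinct.addAll(blob);            // exact, but too expensive for disk-backed blobs
        }
        System.out.println(sum + " / " + max + " / " + distinct.size()); // prints 7 / 3 / 4
    }
}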

View File

@@ -94,7 +94,7 @@ public class IndexCleaner_p {
prop.put("rwidb_threadAlive", indexCleanerThread.isAlive() + "");
prop.put("rwidb_threadToString", indexCleanerThread.toString());
prop.putNum("rwidb_RWIcountstart", indexCleanerThread.rwiCountAtStart);
prop.putNum("rwidb_RWIcountnow", sb.indexSegment.termIndex().size());
prop.putNum("rwidb_RWIcountnow", sb.indexSegment.termIndex().sizesMax());
prop.put("rwidb_wordHashNow", (indexCleanerThread.wordHashNow == null) ? "NULL" : new String(indexCleanerThread.wordHashNow));
prop.put("rwidb_lastWordHash", (indexCleanerThread.lastWordHash == null) ? "null" : new String(indexCleanerThread.lastWordHash));
prop.putNum("rwidb_lastDeletionCounter", indexCleanerThread.lastDeletionCounter);

View File

@@ -343,7 +343,7 @@ public class IndexControlRWIs_p {
// insert constants
prop.putNum("wcount", sb.indexSegment.termIndex().size());
prop.putNum("wcount", sb.indexSegment.termIndex().sizesMax());
// return rewrite properties
return prop;
}

View File

@@ -181,7 +181,7 @@ public class IndexControlURLs_p {
// generate list
if (post.containsKey("urlhashsimilar")) {
try {
final Iterator<URLMetadataRow> entryIt = new RotateIterator<URLMetadataRow>(sb.indexSegment.urlMetadata().entries(true, urlhash), new String(Base64Order.zero((urlhash == null ? 0 : urlhash.length()))), sb.indexSegment.termIndex().size());
final Iterator<URLMetadataRow> entryIt = new RotateIterator<URLMetadataRow>(sb.indexSegment.urlMetadata().entries(true, urlhash), new String(Base64Order.zero((urlhash == null ? 0 : urlhash.length()))), sb.indexSegment.termIndex().sizesMax());
final StringBuilder result = new StringBuilder("Sequential List of URL-Hashes:<br />");
URLMetadataRow entry;
int i = 0;

View File

@@ -106,7 +106,7 @@ public final class IndexImport_p {
}
}
prop.putNum("wcount", switchboard.indexSegment.termIndex().size());
prop.putNum("wcount", switchboard.indexSegment.termIndex().sizesMax());
prop.putNum("ucount", switchboard.indexSegment.urlMetadata().size());
/*

View File

@@ -55,7 +55,7 @@ public class IndexShare_p {
prop.put("wordfreq", switchboard.getConfigLong("defaultWordReceiveFrequency",10));
prop.put("dtable", "");
prop.put("rtable", "");
prop.putNum("wcount", switchboard.indexSegment.termIndex().size());
prop.putNum("wcount", switchboard.indexSegment.termIndex().sizesMax());
prop.putNum("ucount", switchboard.indexSegment.urlMetadata().size());
return prop; // be save
}
@@ -68,7 +68,7 @@ public class IndexShare_p {
}
// insert constants
prop.putNum("wcount", switchboard.indexSegment.termIndex().size());
prop.putNum("wcount", switchboard.indexSegment.termIndex().sizesMax());
prop.putNum("ucount", switchboard.indexSegment.urlMetadata().size());
// return rewrite properties

View File

@@ -41,7 +41,7 @@ public class PerformanceGraph {
final int width = post.getInt("width", 660);
final int height = post.getInt("height", 240);
return plasmaProfiling.performanceGraph(width, height, sb.indexSegment.urlMetadata().size() + " URLS / " + sb.indexSegment.termIndex().getBackendSize() + " WORDS IN INDEX / " + sb.indexSegment.termIndex().getBufferSize() + " WORDS IN CACHE");
return plasmaProfiling.performanceGraph(width, height, sb.indexSegment.urlMetadata().size() + " URLS / " + sb.indexSegment.termIndex().sizesMax() + " WORDS IN INDEX / " + sb.indexSegment.termIndex().getBufferSize() + " WORDS IN CACHE");
}
}

View File

@@ -42,7 +42,7 @@ public class queues_p {
prop.putNum("indexingSize", sb.getThread(plasmaSwitchboardConstants.INDEXER).getJobCount() + sb.crawler.queuePreStack.getActiveQueueSize());
prop.putNum("indexingMax", (int) sb.getConfigLong(plasmaSwitchboardConstants.INDEXER_SLOTS, 30));
prop.putNum("urlpublictextSize", sb.indexSegment.urlMetadata().size());
prop.putNum("rwipublictextSize", sb.indexSegment.termIndex().size());
prop.putNum("rwipublictextSize", sb.indexSegment.termIndex().sizesMax());
if ((sb.crawler.queuePreStack.size() == 0) && (sb.crawler.queuePreStack.getActiveQueueSize() == 0)) {
prop.put("list", "0"); //is empty
} else {

View File

@@ -92,7 +92,7 @@ public final class query {
if (obj.equals("rwicount")) {
// return the total number of available word indexes
prop.put("response", sb.indexSegment.termIndex().size());
prop.put("response", sb.indexSegment.termIndex().sizesMax());
return prop;
}

View File

@@ -414,6 +414,17 @@ public class BLOBArray implements BLOB {
return s;
}
/**
* ask for the number of blob entries in each blob of the blob array
* @return the number of entries in each blob
*/
public synchronized int[] sizes() {
int[] s = new int[blobs.size()];
int c = 0;
for (blobItem bi: blobs) s[c++] = bi.blob.size();
return s;
}
/**
* iterator over all keys
* @param up
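A caller-side sketch of the new sizes() method (the counts are invented): it returns one entry count per blob in the array, so the caller decides how to aggregate, e.g. taking the maximum instead of the double-counting sum:

// hypothetical per-blob entry counts as returned by array.sizes()
int[] sizes = new int[] {120000, 45000, 45000};
int sum = 0, max = 0;
for (int s : sizes) {
    sum += s;               // 210000: terms present in several blobs counted repeatedly
    max = Math.max(max, s); // 120000: lower bound on the distinct term count
}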

View File

@@ -87,12 +87,6 @@ public interface BufferedIndex<ReferenceType extends Reference> extends Index<Re
*/
public long getBufferSizeBytes();
/**
* get the size of the buffer backend
* @return number of word references
*/
public int getBackendSize();
/**
* get the size of the buffer content
* @return number of word references

View File

@@ -260,7 +260,22 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
}
public int size() {
return this.ram.size() + this.array.size();
throw new UnsupportedOperationException("an accumulated size of index entries would not reflect the real number of words, which cannot be computed easily");
}
public int[] sizes() {
int[] as = this.array.sizes();
int[] asr = new int[as.length + 1];
System.arraycopy(as, 0, asr, 0, as.length);
asr[as.length] = this.ram.size();
return asr;
}
public int sizesMax() {
int m = 0;
int[] s = sizes();
for (int i = 0; i < s.length; i++) if (s[i] > m) m = s[i];
return m;
}
public int minMem() {
@@ -312,10 +327,6 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
this.array.mountBLOBFile(blobFile);
}
public int getBackendSize() {
return this.array.size();
}
public long getBufferMaxAge() {
return System.currentTimeMillis();
}
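In effect, sizesMax() picks the largest single contribution among the array blobs and the RAM buffer; a minimal sketch with invented counts:

// sizes() returns array.sizes() with ram.size() appended, e.g.:
int[] s = new int[] {120000, 45000, 3000};
int m = 0;
for (int i = 0; i < s.length; i++) if (s[i] > m) m = s[i];
// m == 120000: the value now reported wherever termIndex().size() was used before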

View File

@@ -81,8 +81,8 @@ public final class ReferenceContainerArray<ReferenceType extends Reference> {
this.array.clear();
}
public int size() {
return (this.array == null) ? 0 : this.array.size();
public int[] sizes() {
return (this.array == null) ? new int[0] : this.array.sizes();
}
public ByteOrder ordering() {
@@ -330,7 +330,6 @@ public final class ReferenceContainerArray<ReferenceType extends Reference> {
// write a log
if (System.currentTimeMillis() - lastlog > 30000) {
System.out.println("CELL REFERENCE COLLECTION scanned " + count + " RWI index entries. ");
//Log.logInfo("COLLECTION INDEX REFERENCE COLLECTION", "scanned " + count + " RWI index entries. " + (((System.currentTimeMillis() - start) * (array.size() + array.free() - count) / count) / 60000) + " minutes remaining for this array");
lastlog = System.currentTimeMillis();
}
}

View File

@@ -333,7 +333,7 @@ public final class Segment {
public ReferenceCleaner(final byte[] startHash) {
this.startHash = startHash;
this.rwiCountAtStart = termIndex().size();
this.rwiCountAtStart = termIndex().sizesMax();
}
public void run() {

View File

@@ -37,7 +37,7 @@ public class plasmaDbImporter extends AbstractImporter implements Importer {
super("PLASMADB");
this.homeWordIndex = homeWI;
this.importWordIndex = importWI;
this.importStartSize = this.importWordIndex.termIndex().size();
this.importStartSize = this.importWordIndex.termIndex().sizesMax();
}
/**
@@ -94,8 +94,8 @@ public class plasmaDbImporter extends AbstractImporter implements Importer {
try {
this.log.logInfo("Importing DB from '" + this.importWordIndex.getLocation().getAbsolutePath() + "'");
this.log.logInfo("Home word index contains " + homeWordIndex.termIndex().size() + " words and " + homeWordIndex.urlMetadata().size() + " URLs.");
this.log.logInfo("Import word index contains " + this.importWordIndex.termIndex().size() + " words and " + this.importWordIndex.urlMetadata().size() + " URLs.");
this.log.logInfo("Home word index contains " + homeWordIndex.termIndex().sizesMax() + " words and " + homeWordIndex.urlMetadata().size() + " URLs.");
this.log.logInfo("Import word index contains " + this.importWordIndex.termIndex().sizesMax() + " words and " + this.importWordIndex.urlMetadata().size() + " URLs.");
final HashSet<String> unknownUrlBuffer = new HashSet<String>();
final HashSet<String> importedUrlBuffer = new HashSet<String>();
@@ -190,8 +190,8 @@ public class plasmaDbImporter extends AbstractImporter implements Importer {
"Speed: "+ 500*1000/duration + " word entities/s" +
" | Elapsed time: " + DateFormatter.formatInterval(getElapsedTime()) +
" | Estimated time: " + DateFormatter.formatInterval(getEstimatedTime()) + "\n" +
"Home Words = " + homeWordIndex.termIndex().size() +
" | Import Words = " + this.importWordIndex.termIndex().size());
"Home Words = " + homeWordIndex.termIndex().sizesMax() +
" | Import Words = " + this.importWordIndex.termIndex().sizesMax());
this.wordChunkStart = this.wordChunkEnd;
this.wordChunkStartHash = this.wordChunkEndHash;
}
@@ -213,8 +213,8 @@ public class plasmaDbImporter extends AbstractImporter implements Importer {
}
}
this.log.logInfo("Home word index contains " + homeWordIndex.termIndex().size() + " words and " + homeWordIndex.urlMetadata().size() + " URLs.");
this.log.logInfo("Import word index contains " + this.importWordIndex.termIndex().size() + " words and " + this.importWordIndex.urlMetadata().size() + " URLs.");
this.log.logInfo("Home word index contains " + homeWordIndex.termIndex().sizesMax() + " words and " + homeWordIndex.urlMetadata().size() + " URLs.");
this.log.logInfo("Import word index contains " + this.importWordIndex.termIndex().sizesMax() + " words and " + this.importWordIndex.urlMetadata().size() + " URLs.");
} catch (final Exception e) {
this.log.logSevere("Database import failed.",e);
e.printStackTrace();

View File

@@ -2036,8 +2036,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
if (indexSegment.urlMetadata().size() < 10) {
return "no DHT distribution: loadedURL.size() = " + indexSegment.urlMetadata().size();
}
if (indexSegment.termIndex().size() < 100) {
return "no DHT distribution: not enough words - wordIndex.size() = " + indexSegment.termIndex().size();
if (indexSegment.termIndex().sizesMax() < 100) {
return "no DHT distribution: not enough words - wordIndex.size() = " + indexSegment.termIndex().sizesMax();
}
if ((getConfig(plasmaSwitchboardConstants.INDEX_DIST_ALLOW_WHILE_CRAWLING, "false").equalsIgnoreCase("false")) && (crawlQueues.noticeURL.notEmptyLocal())) {
return "no DHT distribution: crawl in progress: noticeURL.stackSize() = " + crawlQueues.noticeURL.size() + ", sbQueue.size() = " + crawler.queuePreStack.size();
@@ -2149,7 +2149,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
peers.mySeed().put(yacySeed.LCOUNT, Integer.toString(indexSegment.urlMetadata().size())); // the number of links that the peer has stored (LURL's)
peers.mySeed().put(yacySeed.NCOUNT, Integer.toString(crawlQueues.noticeURL.size())); // the number of links that the peer has noticed, but not loaded (NURL's)
peers.mySeed().put(yacySeed.RCOUNT, Integer.toString(crawlQueues.noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT))); // the number of links that the peer provides for remote crawling (ZURL's)
peers.mySeed().put(yacySeed.ICOUNT, Integer.toString(indexSegment.termIndex().size())); // the minimum number of words that the peer has indexed (as it says)
peers.mySeed().put(yacySeed.ICOUNT, Integer.toString(indexSegment.termIndex().sizesMax())); // the minimum number of words that the peer has indexed (as it says)
peers.mySeed().put(yacySeed.SCOUNT, Integer.toString(peers.sizeConnected())); // the number of seeds that the peer has stored
peers.mySeed().put(yacySeed.CCOUNT, Double.toString(((int) ((peers.sizeConnected() + peers.sizeDisconnected() + peers.sizePotential()) * 60.0 / (uptime + 1.01)) * 100) / 100.0)); // the number of clients that the peer connects (as connects/hour)
peers.mySeed().put(yacySeed.VERSION, getConfig("version", ""));