From e26ac60c3e308c52b04b1405371577df28a0de91 Mon Sep 17 00:00:00 2001 From: orbiter Date: Sun, 22 May 2005 13:27:54 +0000 Subject: [PATCH] modified assortment data structures git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@148 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- build.xml | 2 +- htroot/Performance_p.java | 2 +- htroot/yacy/transferRWI.java | 2 +- makerelease.sh | 2 +- .../kelondro/kelondroMergeIterator.java | 17 +++ source/de/anomic/plasma/plasmaSearch.java | 2 +- source/de/anomic/plasma/plasmaWordIndex.java | 4 +- .../plasma/plasmaWordIndexAssortment.java | 85 +++--------- .../plasmaWordIndexAssortmentCluster.java | 127 ++++++++++++++++++ .../anomic/plasma/plasmaWordIndexCache.java | 71 +++++----- .../plasma/plasmaWordIndexEntryContainer.java | 50 +++++-- source/de/anomic/yacy/yacyClient.java | 4 +- 12 files changed, 243 insertions(+), 125 deletions(-) create mode 100644 source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java diff --git a/build.xml b/build.xml index 988d14b37..19b4de8fc 100644 --- a/build.xml +++ b/build.xml @@ -70,7 +70,7 @@ - + diff --git a/htroot/Performance_p.java b/htroot/Performance_p.java index cda6bc1ca..ae186155f 100644 --- a/htroot/Performance_p.java +++ b/htroot/Performance_p.java @@ -178,7 +178,7 @@ public class Performance_p { prop.put("maxURLinWordCache", "" + switchboard.wordIndex.maxURLinWordCache()); prop.put("maxWaitingWordFlush", switchboard.getConfig("maxWaitingWordFlush", "180")); prop.put("wordCacheMax", switchboard.getConfig("wordCacheMax", "10000")); - prop.put("singletonsSize", switchboard.wordIndex.singletonsSize()); + prop.put("singletonsSize", switchboard.wordIndex.assortmentSizes()[0]); // table thread pool settings GenericObjectPool.Config crawlerPoolConfig = switchboard.cacheLoader.getPoolConfig(); diff --git a/htroot/yacy/transferRWI.java b/htroot/yacy/transferRWI.java index 779440654..505d05651 100644 --- a/htroot/yacy/transferRWI.java +++ b/htroot/yacy/transferRWI.java @@ -107,7 +107,7 @@ public class transferRWI { wordHash = estring.substring(0, p); wordhashes[i] = wordHash; entry = new plasmaWordIndexEntry(estring.substring(p)); - switchboard.wordIndex.addEntries(plasmaWordIndexEntryContainer.instantContainer(wordHash, entry)); + switchboard.wordIndex.addEntries(plasmaWordIndexEntryContainer.instantContainer(wordHash, System.currentTimeMillis(), entry)); urlHash = entry.getUrlHash(); if ((!(unknownURL.contains(urlHash))) && (!(switchboard.loadedURL.exists(urlHash)))) { diff --git a/makerelease.sh b/makerelease.sh index 18413386d..8d0dad688 100755 --- a/makerelease.sh +++ b/makerelease.sh @@ -45,7 +45,7 @@ # Contributions and changes to the program code must be marked as such. # define variables -version='0.374' +version='0.375' datestr=`date +%Y%m%d` #release='yacy_v'$version'_'$datestr release='yacy_dev_v'$version'_'$datestr diff --git a/source/de/anomic/kelondro/kelondroMergeIterator.java b/source/de/anomic/kelondro/kelondroMergeIterator.java index 46fdcccb8..273be392f 100644 --- a/source/de/anomic/kelondro/kelondroMergeIterator.java +++ b/source/de/anomic/kelondro/kelondroMergeIterator.java @@ -43,6 +43,7 @@ package de.anomic.kelondro; import java.util.Iterator; import java.util.Comparator; +import java.util.Set; public class kelondroMergeIterator implements Iterator { @@ -105,4 +106,20 @@ public class kelondroMergeIterator implements Iterator { public void remove() { throw new java.lang.UnsupportedOperationException("merge does not support remove"); } + + public static Iterator cascade(Set /*of*/ iterators, boolean up) { + // this extends the ability to combine two iterators + // to the abiliy of combining a set of iterators + if (iterators == null) return null; + if (iterators.size() == 0) return null; + return cascade(iterators.iterator(), up); + } + + private static Iterator cascade(Iterator /*of*/ iiterators, boolean up) { + if (iiterators == null) return null; + if (!(iiterators.hasNext())) return null; + Iterator one = (Iterator) iiterators.next(); + if (!(iiterators.hasNext())) return one; + return new kelondroMergeIterator(one, cascade(iiterators, up), up); + } } diff --git a/source/de/anomic/plasma/plasmaSearch.java b/source/de/anomic/plasma/plasmaSearch.java index 6eddc5184..123f6598a 100644 --- a/source/de/anomic/plasma/plasmaSearch.java +++ b/source/de/anomic/plasma/plasmaSearch.java @@ -112,7 +112,7 @@ public final class plasmaSearch { wordHash = plasmaWordIndexEntry.word2hash(word); entry = new plasmaWordIndexEntry(urlHash, count, p++, 0, 0, age, quality, language, doctype, true); - wordIndex.addEntries(plasmaWordIndexEntryContainer.instantContainer(wordHash, entry)); + wordIndex.addEntries(plasmaWordIndexEntryContainer.instantContainer(wordHash, System.currentTimeMillis(), entry)); } //System.out.println("DEBUG: plasmaSearch.addPageIndex: added " + condenser.getWords().size() + " words, flushed " + c + " entries"); return condenser.getWords().size(); diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index d165d998a..88b987df4 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -76,8 +76,8 @@ public final class plasmaWordIndex { return ramCache.wordCacheRAMSize(); } - public int singletonsSize() { - return ramCache.singletonsSize(); + public int[] assortmentSizes() { + return ramCache.assortmentsSizes(); } public void setMaxWords(int maxWords) { diff --git a/source/de/anomic/plasma/plasmaWordIndexAssortment.java b/source/de/anomic/plasma/plasmaWordIndexAssortment.java index e7b339511..cff0e385a 100644 --- a/source/de/anomic/plasma/plasmaWordIndexAssortment.java +++ b/source/de/anomic/plasma/plasmaWordIndexAssortment.java @@ -47,7 +47,7 @@ For each 'x' there is an assortment database, where 1<=x<=max If a word appears on more than 'max' web pages, the corresponing url-list is stored to some kind of back-end database which we consider as the - 'slowes' option to save data. + 'slowest' option to save data. This here is the fastest file-based. */ package de.anomic.plasma; @@ -124,34 +124,21 @@ public final class plasmaWordIndexAssortment { } } - public record newRecord(plasmaWordIndexEntry entry, long creationTime) { - return new record(new plasmaWordIndexEntry[]{entry}, creationTime); - } - - public record newRecord(plasmaWordIndexEntry[] entries, long creationTime) { - return new record(entries, creationTime); - } - - public class record { - public plasmaWordIndexEntry[] entries; - public long creationTime; - public record(plasmaWordIndexEntry[] entries, long creationTime) { - this.entries = entries; - this.creationTime = creationTime; - } - } - - public void store(String wordHash, record newRecord) { + public void store(String wordHash, plasmaWordIndexEntryContainer newContainer) { // stores a word index to assortment database // this throws an exception if the word hash already existed //log.logDebug("storeAssortment: wordHash=" + wordHash + ", urlHash=" + entry.getUrlHash() + ", time=" + creationTime); + if (newContainer.size() != assortmentCapacity) throw new RuntimeException("plasmaWordIndexAssortment.store: wrong container size"); byte[][] row = new byte[this.bufferStructureLength][]; row[0] = wordHash.getBytes(); row[1] = kelondroRecords.long2bytes(1, 4); - row[2] = kelondroRecords.long2bytes(newRecord.creationTime, 8); - for (int i = 0; i < assortmentCapacity; i++) { - row[3 + 2 * i] = newRecord.entries[i].getUrlHash().getBytes(); - row[4 + 2 * i] = newRecord.entries[i].toEncodedForm(true).getBytes(); + row[2] = kelondroRecords.long2bytes(newContainer.updated(), 8); + Iterator entries = newContainer.entries(); + plasmaWordIndexEntry entry; + for (int i = 0; i < assortmentCapacity; i++) { + entry = (plasmaWordIndexEntry) entries.next(); + row[3 + 2 * i] = entry.getUrlHash().getBytes(); + row[4 + 2 * i] = entry.toEncodedForm(true).getBytes(); } byte[][] oldrow = null; try { @@ -168,50 +155,7 @@ public final class plasmaWordIndexAssortment { if (oldrow != null) throw new RuntimeException("Store to assortment ambiguous"); } - /* - public record read(String wordHash) { - // returns a single word index from assortment database; returns null if index does not exist - //log.logDebug("readAssortment: wordHash=" + wordHash); - byte[][] row = null; - try { - row = assortments.get(wordHash.getBytes()); - } catch (IOException e) { - log.logFailure("readAssortment/IO-error: " + e.getMessage() + " - reset assortment-DB"); - e.printStackTrace(); - resetDatabase(); - } catch (kelondroException e) { - log.logFailure("readAssortment/kelondro-error: " + e.getMessage() + " - reset assortment-DB"); - e.printStackTrace(); - resetDatabase(); - } - if (row == null) return null; - long creationTime = kelondroRecords.bytes2long(row[2]); - plasmaWordIndexEntry[] wordEntries = new plasmaWordIndexEntry[this.bufferStructureLength]; - for (int i = 0; i < assortmentCapacity; i++) { - wordEntries[i] = new plasmaWordIndexEntry(new String(row[3 + 2 * i]), new String(row[4 + 2 * i])); - } - return new record(wordEntries, creationTime); - } - - public void remove(String wordHash) { - // deletes a word index from assortment database - //log.logDebug("removeAssortment: wordHash=" + wordHash); - byte[][] row = null; - try { - row = assortments.remove(wordHash.getBytes()); - } catch (IOException e) { - log.logFailure("removeAssortment/IO-error: " + e.getMessage() + " - reset assortment-DB"); - e.printStackTrace(); - resetDatabase(); - } catch (kelondroException e) { - log.logFailure("removeAssortment/kelondro-error: " + e.getMessage() + " - reset assortment-DB"); - e.printStackTrace(); - resetDatabase(); - } - } - */ - - public record remove(String wordHash) { + public plasmaWordIndexEntryContainer remove(String wordHash) { // deletes a word index from assortment database // and returns the content record byte[][] row = null; @@ -229,12 +173,13 @@ public final class plasmaWordIndexAssortment { return null; } if (row == null) return null; - long creationTime = kelondroRecords.bytes2long(row[2]); + long updateTime = kelondroRecords.bytes2long(row[2]); plasmaWordIndexEntry[] wordEntries = new plasmaWordIndexEntry[this.bufferStructureLength]; + plasmaWordIndexEntryContainer container = new plasmaWordIndexEntryContainer(wordHash); for (int i = 0; i < assortmentCapacity; i++) { - wordEntries[i] = new plasmaWordIndexEntry(new String(row[3 + 2 * i]), new String(row[4 + 2 * i])); + container.add(new plasmaWordIndexEntry[]{new plasmaWordIndexEntry(new String(row[3 + 2 * i]), new String(row[4 + 2 * i]))}, updateTime); } - return new record(wordEntries, creationTime); + return container; } private void resetDatabase() { diff --git a/source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java b/source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java new file mode 100644 index 000000000..0d640bebd --- /dev/null +++ b/source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java @@ -0,0 +1,127 @@ +// plasmaWordIndexAssortmentCluster.java +// ------------------------------------- +// part of YACY +// (C) by Michael Peter Christen; mc@anomic.de +// first published on http://www.anomic.de +// Frankfurt, Germany, 2005 +// last major change: 20.5.2005 +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +// Using this software in any meaning (reading, learning, copying, compiling, +// running) means that you agree that the Author(s) is (are) not responsible +// for cost, loss of data or any harm that may be caused directly or indirectly +// by usage of this softare or this documentation. The usage of this software +// is on your own risk. The installation and usage (starting/running) of this +// software may allow other people or application to access your computer and +// any attached devices and is highly dependent on the configuration of the +// software which must be done by the user of the software; the author(s) is +// (are) also not responsible for proper configuration and usage of the +// software, even if provoked by documentation provided together with +// the software. +// +// Any changes to this file according to the GPL as documented in the file +// gpl.txt aside this file in the shipment you received can be done to the +// lines that follows this copyright notice here, but changes must not be +// done inside the copyright notive above. A re-distribution must contain +// the intact and unchanged copyright notice. +// Contributions and changes to the program code must be marked as such. + +/* + An assortment-cluster is a set of assortments. + Each one carries a different number of URL's + */ + +package de.anomic.plasma; + +import java.io.File; +import java.util.*; +import de.anomic.kelondro.*; +import de.anomic.server.serverLog; + +public final class plasmaWordIndexAssortmentCluster { + + // class variables + private File assortmentsPath; + private int clusterCapacity; + private serverLog log; + private plasmaWordIndexAssortment[] assortments; + private long completeBufferSize; + + public plasmaWordIndexAssortmentCluster(File assortmentsPath, int clusterCapacity, int bufferkb, serverLog log) { + // set class variables + if (!(assortmentsPath.exists())) assortmentsPath.mkdirs(); + this.clusterCapacity = clusterCapacity; + this.completeBufferSize = bufferkb * 1024; + this.log = log; + this.assortments = new plasmaWordIndexAssortment[clusterCapacity]; + + // initialize cluster + for (int i = 0; i < clusterCapacity; i++) { + assortments[i] = new plasmaWordIndexAssortment(assortmentsPath, i + 1, (int) completeBufferSize / clusterCapacity, log); + } + } + + public plasmaWordIndexEntryContainer storeTry(String wordHash, plasmaWordIndexEntryContainer newContainer) { + // this tries to store the record. If the record does not fit, or a same hash already + // exists and would not fit together with the new record, then the record is deleted from + // the assortmen(s) and returned together with the newRecord. + // if storage was successful, NULL is returned. + if (newContainer.size() > clusterCapacity) return newContainer; // it will not fit + plasmaWordIndexEntryContainer buffer; + for (int i = 0; i < clusterCapacity; i++) { + buffer = assortments[i].remove(wordHash); + if (buffer != null) newContainer.add(buffer); + if (newContainer.size() > clusterCapacity) return newContainer; // it will not fit + } + // we collected all records and the result will fit somewhere.. + assortments[newContainer.size() - 1].store(wordHash, newContainer); + // return null to show that we have stored the new Record successfully + return null; + } + + public plasmaWordIndexEntryContainer removeFromAll(String wordHash) { + // collect all records from all the assortments and return them + plasmaWordIndexEntryContainer buffer, record = new plasmaWordIndexEntryContainer(wordHash); + for (int i = 0; i < clusterCapacity; i++) { + buffer = assortments[i].remove(wordHash); + if (buffer != null) record.add(buffer); + } + return record; + } + + public Iterator hashConjunction(String startWordHash, boolean up) { + HashSet iterators = new HashSet(); + for (int i = 0; i < clusterCapacity; i++) iterators.add(assortments[i].hashes(startWordHash, up, true)); + return kelondroMergeIterator.cascade(iterators, up); + } + + public int sizeTotal() { + int total = 0; + for (int i = 0; i < clusterCapacity; i++) total += assortments[i].size(); + return total; + } + + public int[] sizes() { + int[] sizes = new int[clusterCapacity]; + for (int i = 0; i < clusterCapacity; i++) sizes[i] = assortments[i].size(); + return sizes; + } + + public void close() { + for (int i = 0; i < clusterCapacity; i++) assortments[i].close(); + } + +} diff --git a/source/de/anomic/plasma/plasmaWordIndexCache.java b/source/de/anomic/plasma/plasmaWordIndexCache.java index 6db3246d4..05b69e24d 100644 --- a/source/de/anomic/plasma/plasmaWordIndexCache.java +++ b/source/de/anomic/plasma/plasmaWordIndexCache.java @@ -55,6 +55,8 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface { private static final String indexDumpFileName = "indexDump0.stack"; private static final String oldSingletonFileName = "indexSingletons0.db"; private static final String newSingletonFileName = "indexAssortment001.db"; + private static final int assortmentLimit = 1; + // class variables private File databaseRoot; @@ -64,7 +66,7 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface { private HashMap hashDate; private int maxWords; private serverLog log; - private plasmaWordIndexAssortment singletons; + private plasmaWordIndexAssortmentCluster assortmentCluster; private int singletonBufferSize; //kb // calculated constants @@ -92,7 +94,7 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface { this.maxWords = 10000; this.backend = backend; this.log = log; - this.singletons = new plasmaWordIndexAssortment(databaseRoot, 1, singletonBufferSize, log); + this.assortmentCluster = new plasmaWordIndexAssortmentCluster(databaseRoot, assortmentLimit, singletonBufferSize, log); // read in dump of last session try { @@ -212,12 +214,12 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface { this.maxWords = maxWords; } - public int singletonsSize() { - return singletons.size(); + public int[] assortmentsSizes() { + return assortmentCluster.sizes(); } public int size() { - return java.lang.Math.max(singletons.size(), java.lang.Math.max(backend.size(), cache.size())); + return java.lang.Math.max(assortmentCluster.sizeTotal(), java.lang.Math.max(backend.size(), cache.size())); } public Iterator wordHashes(String startWordHash, boolean up) { @@ -229,7 +231,7 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface { return new kelondroMergeIterator( new kelondroMergeIterator( cache.keySet().iterator(), - singletons.hashes(startWordHash, true, false), + assortmentCluster.hashConjunction(startWordHash, true), true), backend.wordHashes(startWordHash, true), true); @@ -258,28 +260,29 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface { } // now decide where to flush that container - plasmaWordIndexAssortment.record singleton = singletons.remove(key); - if (singleton == null) { - // not found in singletons - if (container.size() == 1) { - // it is a singleton: store to singleton - singletons.store(key, singletons.newRecord(container.getOne(), time)); - return 1; + plasmaWordIndexEntryContainer flushedFromAssortment = assortmentCluster.removeFromAll(key); + if (flushedFromAssortment == null) { + // not found in assortments + if (container.size() <= assortmentLimit) { + // this fits into the assortments + plasmaWordIndexEntryContainer feedback = assortmentCluster.storeTry(key, container); + if (feedback == null) { + return container.size(); + } else { + // *** should care about another option here *** + return backend.addEntries(feedback, time); + } } else { // store to back-end; this should be a rare case return backend.addEntries(container, time); } } else { - // we have a singleton and need to integrate this in the flush - plasmaWordIndexEntry oldEntry = singleton.entries[0]; - long oldTime = singleton.creationTime; - - // put new entries to the container - if (!(container.contains(oldEntry.getUrlHash()))) container.add(oldEntry); - + // we have some records and must integrate them into the flush + container.add(flushedFromAssortment); + // possibly reintegrate if (reintegrate) { - // put singleton together with container back to ram + // put assortmentRecord together with container back to ram synchronized (cache) { cache.put(key, container); hashScore.setScore(key, container.size()); @@ -288,22 +291,20 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface { return -1; } else { // add this to the backend - return backend.addEntries(container, java.lang.Math.max(time, oldTime)); + return backend.addEntries(container, java.lang.Math.max(time, flushedFromAssortment.updated())); } } } private boolean flushFromSingleton(String key) { // this should only be called if the singleton shall be deleted or returned in an index entity - plasmaWordIndexAssortment.record singleton = singletons.remove(key); - if (singleton == null) { + plasmaWordIndexEntryContainer container = assortmentCluster.removeFromAll(key); + if (container == null) { return false; } else { - // we have a singleton - plasmaWordIndexEntry entry = (plasmaWordIndexEntry) singleton.entries[0]; - long time = singleton.creationTime; + // we have a non-empty entry-container // integrate it to the backend - return backend.addEntries(plasmaWordIndexEntryContainer.instantContainer(key, entry), time) > 0; + return backend.addEntries(container, container.updated()) > 0; } } @@ -395,7 +396,7 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface { hashScore.deleteScore(wordHash); hashDate.remove(wordHash); } - singletons.remove(wordHash); + assortmentCluster.removeFromAll(wordHash); backend.deleteIndex(wordHash); } @@ -405,7 +406,7 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface { return backend.removeEntries(wordHash, urlHashes, deleteComplete); } - public synchronized int addEntries(plasmaWordIndexEntryContainer container, long creationTime) { + public synchronized int addEntries(plasmaWordIndexEntryContainer container, long updateTime) { //serverLog.logDebug("PLASMA INDEXING", "addEntryToIndexMem: cache.size=" + cache.size() + "; hashScore.size=" + hashScore.size()); if (cache.size() >= this.maxWords) flushFromMemToLimit(); //if (flushc > 0) serverLog.logDebug("PLASMA INDEXING", "addEntryToIndexMem - flushed " + flushc + " entries"); @@ -420,25 +421,25 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface { if (added > 0) { cache.put(wordHash, entries); hashScore.addScore(wordHash, added); - hashDate.put(wordHash, new Long(creationTime)); + hashDate.put(wordHash, new Long(updateTime)); } } //System.out.println("DEBUG: cache = " + cache.toString()); return added; } - private void addEntry(String wordHash, plasmaWordIndexEntry newEntry, long creationTime) { + private void addEntry(String wordHash, plasmaWordIndexEntry newEntry, long updateTime) { plasmaWordIndexEntryContainer entries = (plasmaWordIndexEntryContainer) cache.get(wordHash); if (entries == null) entries = new plasmaWordIndexEntryContainer(wordHash); - if (entries.add(newEntry)) { + if (entries.add(new plasmaWordIndexEntry[]{newEntry}, updateTime) > 0) { cache.put(wordHash, entries); hashScore.incScore(wordHash); - hashDate.put(wordHash, new Long(creationTime)); + hashDate.put(wordHash, new Long(updateTime)); } } public void close(int waitingSeconds) { - singletons.close(); + assortmentCluster.close(); try { dump(waitingSeconds); } catch (IOException e){ diff --git a/source/de/anomic/plasma/plasmaWordIndexEntryContainer.java b/source/de/anomic/plasma/plasmaWordIndexEntryContainer.java index 133ccce23..3aa2505c8 100644 --- a/source/de/anomic/plasma/plasmaWordIndexEntryContainer.java +++ b/source/de/anomic/plasma/plasmaWordIndexEntryContainer.java @@ -39,6 +39,17 @@ // the intact and unchanged copyright notice. // Contributions and changes to the program code must be marked as such. + +/* + an indexContainer is a bag of indexEntries for a single word + such an container represents a RWI snipplet: + it collects a new RWI until it is so big that it should be flushed to either + - an indexAssortment: collection of indexContainers of same size or + - the backend storage + + the creationTime is necessary to organize caching of containers +*/ + package de.anomic.plasma; import java.util.HashMap; @@ -48,10 +59,12 @@ import de.anomic.server.serverCodings; public class plasmaWordIndexEntryContainer implements Comparable { private String wordHash; - private HashMap container; + private HashMap container; // urlHash/plasmaWordIndexEntry - Mapping + private long updateTime; public plasmaWordIndexEntryContainer(String wordHash) { this.wordHash = wordHash; + this.updateTime = 0; container = new HashMap(); // a urlhash/plasmaWordIndexEntry - relation } @@ -59,16 +72,19 @@ public class plasmaWordIndexEntryContainer implements Comparable { return container.size(); } + public long updated() { + return updateTime; + } + public String wordHash() { return wordHash; } - public boolean add(plasmaWordIndexEntry entry) { - // returns true if the new entry was added, false if it already existet - String urlHash = entry.getUrlHash(); - if (container.containsKey(urlHash)) return false; - container.put(urlHash, entry); - return true; + public int add(plasmaWordIndexEntry[] entries, long updateTime) { + int c = 0; + for (int i = 0; i < entries.length; i++) if (add(entries[i])) c++; + this.updateTime = java.lang.Math.max(this.updateTime, updateTime); + return c; } public int add(plasmaWordIndexEntryContainer c) { @@ -78,28 +94,38 @@ public class plasmaWordIndexEntryContainer implements Comparable { while (i.hasNext()) { if (add((plasmaWordIndexEntry) i.next())) x++; } + this.updateTime = java.lang.Math.max(this.updateTime, c.updateTime); return x; } + private boolean add(plasmaWordIndexEntry entry) { + // returns true if the new entry was added, false if it already existet + String urlHash = entry.getUrlHash(); + if (container.containsKey(urlHash)) return false; + container.put(urlHash, entry); + return true; + } + public boolean contains(String urlHash) { return container.containsKey(urlHash); } - public plasmaWordIndexEntry getOne() { - return (plasmaWordIndexEntry) container.values().toArray()[0]; + public plasmaWordIndexEntry[] getEntryArray() { + return (plasmaWordIndexEntry[]) container.values().toArray(); } public Iterator entries() { // returns an iterator of plasmaWordIndexEntry objects return container.values().iterator(); } - - public static plasmaWordIndexEntryContainer instantContainer(String wordHash, plasmaWordIndexEntry entry) { + + public static plasmaWordIndexEntryContainer instantContainer(String wordHash, long creationTime, plasmaWordIndexEntry entry) { plasmaWordIndexEntryContainer c = new plasmaWordIndexEntryContainer(wordHash); c.add(entry); + c.updateTime = creationTime; return c; } - + public String toString() { return "C[" + wordHash + "] has " + container.size() + " entries"; } diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index c7fcb6a94..7cc79f407 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -298,7 +298,9 @@ public class yacyClient { plasmaWordIndexEntry entry = new plasmaWordIndexEntry(link.hash(), link.wordCount(), 0, 0, 0, plasmaSearch.calcVirtualAge(link.moddate()), link.quality(), link.language(), link.doctype(), false); - for (int m = 0; m < words; m++) container[m].add(entry); + for (int m = 0; m < words; m++) { + container[m].add(new plasmaWordIndexEntry[]{entry}, System.currentTimeMillis()); + } } // finally insert the containers to the index