refactoring of plasmaWordIndex: fewer methods in the class; the index part was separated out into CachedIndexCollection

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5710 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 2009-03-13 14:56:25 +00:00
parent 14a1c33823
commit 7f67238f8b
27 changed files with 551 additions and 495 deletions
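The diff below is large but mostly mechanical: the reverse-word-index (RWI) methods that used to live on plasmaWordIndex now live on the new CachedIndexCollection, and call sites reach them through a new index() accessor (e.g. sb.webIndex.index().size() instead of sb.webIndex.size()). The following is a minimal sketch of that delegation pattern; the class bodies are hypothetical stand-ins, not the actual kelondro/plasma implementations.

// Minimal illustrative sketch (hypothetical stand-ins, not the real YaCy classes) of the
// delegation introduced by this commit: RWI methods move from plasmaWordIndex into
// CachedIndexCollection, and callers go through the new index() accessor.
public class DelegationSketch {

    // stand-in for de.anomic.kelondro.text.CachedIndexCollection
    static final class CachedIndexCollection {
        private int entries;                          // placeholder for cache + collections state
        int size() { return entries; }                // previously plasmaWordIndex.size()
        void addEntry(String wordHash) { entries++; } // previously plasmaWordIndex.addEntry(...)
    }

    // stand-in for de.anomic.plasma.plasmaWordIndex after the refactoring
    static final class WordIndex {
        private final CachedIndexCollection index = new CachedIndexCollection();
        CachedIndexCollection index() { return index; } // the new accessor used throughout the diff
    }

    public static void main(String[] args) {
        final WordIndex webIndex = new WordIndex();
        webIndex.index().addEntry("exampleWordHash");
        // old call: webIndex.size(); new call after the refactoring:
        System.out.println("RWI count: " + webIndex.index().size());
    }
}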


@ -94,7 +94,7 @@ public class IndexCleaner_p {
prop.put("rwidb_threadAlive", indexCleanerThread.isAlive() + "");
prop.put("rwidb_threadToString", indexCleanerThread.toString());
prop.putNum("rwidb_RWIcountstart", indexCleanerThread.rwiCountAtStart);
prop.putNum("rwidb_RWIcountnow", sb.webIndex.size());
prop.putNum("rwidb_RWIcountnow", sb.webIndex.index().size());
prop.put("rwidb_wordHashNow", indexCleanerThread.wordHashNow);
prop.put("rwidb_lastWordHash", indexCleanerThread.lastWordHash);
prop.putNum("rwidb_lastDeletionCounter", indexCleanerThread.lastDeletionCounter);


@ -124,7 +124,7 @@ public class IndexControlRWIs_p {
if (delurl || delurlref) {
// generate an urlx array
ReferenceContainer index = null;
index = sb.webIndex.getReferences(keyhash, null);
index = sb.webIndex.index().getReferences(keyhash, null);
final Iterator<ReferenceRow> en = index.entries();
int i = 0;
urlx = new String[index.size()];
@ -141,7 +141,7 @@ public class IndexControlRWIs_p {
sb.urlRemove(urlx[i]);
}
}
sb.webIndex.deleteAllReferences(keyhash);
sb.webIndex.index().deleteAllReferences(keyhash);
post.remove("keyhashdeleteall");
post.put("urllist", "generated");
}
@ -158,7 +158,7 @@ public class IndexControlRWIs_p {
}
final Set<String> urlHashes = new HashSet<String>();
for (int i = 0; i < urlx.length; i++) urlHashes.add(urlx[i]);
sb.webIndex.removeReferences(keyhash, urlHashes);
sb.webIndex.index().removeReferences(keyhash, urlHashes);
// this shall lead to a presentation of the list; so handle that the remaining program
// thinks that it was called for a list presentation
post.remove("keyhashdelete");
@ -200,7 +200,7 @@ public class IndexControlRWIs_p {
// prepare index
ReferenceContainer index;
final long starttime = System.currentTimeMillis();
index = sb.webIndex.getReferences(keyhash, null);
index = sb.webIndex.index().getReferences(keyhash, null);
// built urlCache
final Iterator<ReferenceRow> urlIter = index.entries();
final HashMap<String, MetadataRowContainer> knownURLs = new HashMap<String, MetadataRowContainer>();
@ -237,7 +237,7 @@ public class IndexControlRWIs_p {
// generate list
if (post.containsKey("keyhashsimilar")) {
final Iterator<ReferenceContainer> containerIt = sb.webIndex.indexContainerSet(keyhash, false, true, 256).iterator();
final Iterator<ReferenceContainer> containerIt = sb.webIndex.index().indexContainerSet(keyhash, false, true, 256).iterator();
ReferenceContainer container;
int i = 0;
int rows = 0, cols = 0;
@ -315,7 +315,7 @@ public class IndexControlRWIs_p {
} catch (final IOException e) {
}
}
sb.webIndex.removeReferences(keyhash, urlHashes);
sb.webIndex.index().removeReferences(keyhash, urlHashes);
}
if (prop.getInt("searchresult", 0) == 3) plasmaSearchAPI.listHosts(prop, keyhash, sb);
@ -323,7 +323,7 @@ public class IndexControlRWIs_p {
// insert constants
prop.putNum("wcount", sb.webIndex.size());
prop.putNum("wcount", sb.webIndex.index().size());
// return rewrite properties
return prop;
}


@ -182,7 +182,7 @@ public class IndexControlURLs_p {
// generate list
if (post.containsKey("urlhashsimilar")) {
try {
final Iterator<MetadataRowContainer> entryIt = new RotateIterator<MetadataRowContainer>(sb.webIndex.metadata().entries(true, urlhash), new String(Base64Order.zero((urlhash == null ? 0 : urlhash.length()))), sb.webIndex.size());
final Iterator<MetadataRowContainer> entryIt = new RotateIterator<MetadataRowContainer>(sb.webIndex.metadata().entries(true, urlhash), new String(Base64Order.zero((urlhash == null ? 0 : urlhash.length()))), sb.webIndex.index().size());
final StringBuilder result = new StringBuilder("Sequential List of URL-Hashes:<br />");
MetadataRowContainer entry;
int i = 0;


@ -106,7 +106,7 @@ public final class IndexImport_p {
}
}
prop.putNum("wcount", switchboard.webIndex.size());
prop.putNum("wcount", switchboard.webIndex.index().size());
prop.putNum("ucount", switchboard.webIndex.metadata().size());
/*


@ -55,7 +55,7 @@ public class IndexShare_p {
prop.put("wordfreq", switchboard.getConfigLong("defaultWordReceiveFrequency",10));
prop.put("dtable", "");
prop.put("rtable", "");
prop.putNum("wcount", switchboard.webIndex.size());
prop.putNum("wcount", switchboard.webIndex.index().size());
prop.putNum("ucount", switchboard.webIndex.metadata().size());
return prop; // be save
}
@ -68,7 +68,7 @@ public class IndexShare_p {
}
// insert constants
prop.putNum("wcount", switchboard.webIndex.size());
prop.putNum("wcount", switchboard.webIndex.index().size());
prop.putNum("ucount", switchboard.webIndex.metadata().size());
// return rewrite properties


@ -41,7 +41,7 @@ public class PerformanceGraph {
final int width = post.getInt("width", 660);
final int height = post.getInt("height", 240);
return plasmaProfiling.performanceGraph(width, height, sb.webIndex.metadata().size() + " URLS / " + sb.webIndex.collectionsSize() + " WORDS IN COLLECTIONS / " + sb.webIndex.cacheSize() + " WORDS IN CACHE");
return plasmaProfiling.performanceGraph(width, height, sb.webIndex.metadata().size() + " URLS / " + sb.webIndex.index().collectionsSize() + " WORDS IN COLLECTIONS / " + sb.webIndex.index().cacheSize() + " WORDS IN CACHE");
}
}


@ -199,7 +199,7 @@ public class PerformanceQueues_p {
// disallow setting of memprereq for indexer to prevent db from throwing OOMs
prop.put("table_" + c + "_disabled", /*(threadName.endsWith("_indexing")) ? 1 :*/ "0");
prop.put("table_" + c + "_recommendation", threadName.endsWith("_indexing") ? "1" : "0");
prop.putNum("table_" + c + "_recommendation_value", threadName.endsWith("_indexing") ? (switchboard.webIndex.minMem() / 1024) : 0);
prop.putNum("table_" + c + "_recommendation_value", threadName.endsWith("_indexing") ? (switchboard.webIndex.index().minMem() / 1024) : 0);
c++;
}
prop.put("table", c);
@ -229,7 +229,7 @@ public class PerformanceQueues_p {
if ((post != null) && (post.containsKey("cacheSizeSubmit"))) {
final int wordCacheMaxCount = post.getInt("wordCacheMaxCount", 20000);
switchboard.setConfig(plasmaSwitchboardConstants.WORDCACHE_MAX_COUNT, Integer.toString(wordCacheMaxCount));
switchboard.webIndex.setMaxWordCount(wordCacheMaxCount);
switchboard.webIndex.index().setMaxWordCount(wordCacheMaxCount);
final int wordCacheInitCount = post.getInt(plasmaSwitchboardConstants.WORDCACHE_INIT_COUNT, 30000);
switchboard.setConfig(plasmaSwitchboardConstants.WORDCACHE_INIT_COUNT, Integer.toString(wordCacheInitCount));
@ -288,11 +288,11 @@ public class PerformanceQueues_p {
// table cache settings
prop.putNum("urlCacheSize", switchboard.webIndex.metadata().writeCacheSize());
prop.putNum("wordCacheSize", switchboard.webIndex.indexCacheSize());
prop.putNum("wordCacheSizeKBytes", switchboard.webIndex.indexCacheSizeBytes()/1024);
prop.putNum("maxURLinCache", switchboard.webIndex.maxURLinCache());
prop.putNum("maxAgeOfCache", switchboard.webIndex.maxAgeOfCache() / 1000 / 60); // minutes
prop.putNum("minAgeOfCache", switchboard.webIndex.minAgeOfCache() / 1000 / 60); // minutes
prop.putNum("wordCacheSize", switchboard.webIndex.index().indexCacheSize());
prop.putNum("wordCacheSizeKBytes", switchboard.webIndex.index().indexCacheSizeBytes()/1024);
prop.putNum("maxURLinCache", switchboard.webIndex.index().maxURLinCache());
prop.putNum("maxAgeOfCache", switchboard.webIndex.index().maxAgeOfCache() / 1000 / 60); // minutes
prop.putNum("minAgeOfCache", switchboard.webIndex.index().minAgeOfCache() / 1000 / 60); // minutes
prop.putNum("maxWaitingWordFlush", switchboard.getConfigLong("maxWaitingWordFlush", 180));
prop.put("wordCacheMaxCount", switchboard.getConfigLong(plasmaSwitchboardConstants.WORDCACHE_MAX_COUNT, 20000));
prop.put("wordCacheInitCount", switchboard.getConfigLong(plasmaSwitchboardConstants.WORDCACHE_INIT_COUNT, 30000));


@ -42,7 +42,7 @@ public class queues_p {
prop.putNum("indexingSize", sb.getThread(plasmaSwitchboardConstants.INDEXER).getJobCount() + sb.webIndex.queuePreStack.getActiveQueueSize());
prop.putNum("indexingMax", (int) sb.getConfigLong(plasmaSwitchboardConstants.INDEXER_SLOTS, 30));
prop.putNum("urlpublictextSize", sb.webIndex.metadata().size());
prop.putNum("rwipublictextSize", sb.webIndex.size());
prop.putNum("rwipublictextSize", sb.webIndex.index().size());
if ((sb.webIndex.queuePreStack.size() == 0) && (sb.webIndex.queuePreStack.getActiveQueueSize() == 0)) {
prop.put("list", "0"); //is empty
} else {


@ -21,11 +21,11 @@ public class status_p {
prop.setLocalized(false);
prop.put("rejected", "0");
sb.updateMySeed();
final int cacheSize = sb.webIndex.indexCacheSize();
final int cacheSize = sb.webIndex.index().indexCacheSize();
final long cacheMaxSize = sb.getConfigLong(plasmaSwitchboardConstants.WORDCACHE_MAX_COUNT, 10000);
prop.putNum("ppm", sb.currentPPM());
prop.putNum("qpm", sb.webIndex.peers().mySeed().getQPM());
prop.putNum("wordCacheSize", sb.webIndex.indexCacheSize());
prop.putNum("wordCacheSize", sb.webIndex.index().indexCacheSize());
prop.putNum("wordCacheSize", cacheSize);
prop.putNum("wordCacheMaxSize", cacheMaxSize);
prop.put("wordCacheCount", cacheSize);


@ -78,7 +78,7 @@ public final class timeline {
yacyCore.log.logInfo("INIT TIMELINE SEARCH: " + plasmaSearchQuery.anonymizedQueryHashes(query[0]) + " - " + count + " links");
// get the index container with the result vector
HashMap<String, ReferenceContainer>[] localSearchContainerMaps = sb.webIndex.localSearchContainers(query[0], query[1], null);
HashMap<String, ReferenceContainer>[] localSearchContainerMaps = sb.webIndex.index().localSearchContainers(query[0], query[1], null);
final ReferenceContainer index =
ReferenceContainer.joinExcludeContainers(
localSearchContainerMaps[0].values(),


@ -82,13 +82,13 @@ public final class query {
if (obj.equals("rwiurlcount")) {
// the total number of different urls in the rwi is returned
// <env> shall contain a word hash, the number of assigned lurls to this hash is returned
prop.put("response", sb.webIndex.getReferences(env, null).size());
prop.put("response", sb.webIndex.index().getReferences(env, null).size());
return prop;
}
if (obj.equals("rwicount")) {
// return the total number of available word indexes
prop.put("response", sb.webIndex.size());
prop.put("response", sb.webIndex.index().size());
return prop;
}


@ -185,7 +185,7 @@ public final class search {
yacyCore.log.logInfo("INIT HASH SEARCH (abstracts only): " + plasmaSearchQuery.anonymizedQueryHashes(theQuery.queryHashes) + " - " + theQuery.displayResults() + " links");
final long timer = System.currentTimeMillis();
final Map<String, ReferenceContainer>[] containers = sb.webIndex.localSearchContainers(theQuery.queryHashes, theQuery.excludeHashes, plasmaSearchQuery.hashes2Set(urls));
final Map<String, ReferenceContainer>[] containers = sb.webIndex.index().localSearchContainers(theQuery.queryHashes, theQuery.excludeHashes, plasmaSearchQuery.hashes2Set(urls));
serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(theQuery.id(true), plasmaSearchEvent.COLLECTION, containers[0].size(), System.currentTimeMillis() - timer), false);
if (containers != null) {


@ -100,9 +100,9 @@ public final class transferRWI {
sb.getLog().logInfo("Rejecting RWIs from peer " + otherPeerName + ". Not granted.");
result = "not_granted";
pause = 0;
} else if (sb.webIndex.indexCacheSize() > cachelimit) {
} else if (sb.webIndex.index().indexCacheSize() > cachelimit) {
// we are too busy to receive indexes
sb.getLog().logInfo("Rejecting RWIs from peer " + otherPeerName + ". We are too busy (buffersize=" + sb.webIndex.indexCacheSize() + ").");
sb.getLog().logInfo("Rejecting RWIs from peer " + otherPeerName + ". We are too busy (buffersize=" + sb.webIndex.index().indexCacheSize() + ").");
granted = false; // don't accept more words if there are too many words to flush
result = "busy";
pause = 60000;
@ -157,7 +157,7 @@ public final class transferRWI {
}
// learn entry
sb.webIndex.addEntry(wordHash, iEntry, System.currentTimeMillis());
sb.webIndex.index().addEntry(wordHash, iEntry, System.currentTimeMillis());
serverCore.checkInterruption();
// check if we need to ask for the corresponding URL
@ -193,7 +193,7 @@ public final class transferRWI {
}
result = "ok";
pause = (int) (sb.webIndex.indexCacheSize() * 20000 / sb.getConfigLong(plasmaSwitchboardConstants.WORDCACHE_MAX_COUNT, 100000)); // estimation of necessary pause time
pause = (int) (sb.webIndex.index().indexCacheSize() * 20000 / sb.getConfigLong(plasmaSwitchboardConstants.WORDCACHE_MAX_COUNT, 100000)); // estimation of necessary pause time
}
prop.put("unknownURL", unknownURLs.toString());


@ -315,7 +315,7 @@ public class yacysearch {
// delete the index entry locally
final String delHash = post.get("deleteref", ""); // urlhash
sb.webIndex.removeWordReferences(query[0], delHash);
sb.webIndex.index().removeWordReferences(query[0], delHash);
// make new news message with negative voting
final HashMap<String, String> map = new HashMap<String, String>();


@ -81,7 +81,7 @@ public class Balancer {
if (urlFileStack.size() != urlFileIndex.size() || (urlFileIndex.size() < 10000 && urlFileIndex.size() > 0)) {
// fix the file stack
Log.logInfo("Balancer", "re-creating the " + stackname + " balancer stack, size = " + urlFileIndex.size() + ((urlFileStack.size() == urlFileIndex.size()) ? "" : " (the old stack size was wrong)" ));
urlFileStack = Stack.reset(urlFileStack);
urlFileStack.clear();
try {
final Iterator<byte[]> i = urlFileIndex.keys(true, null);
byte[] hash;
@ -130,7 +130,7 @@ public class Balancer {
}
public synchronized void clear() {
urlFileStack = Stack.reset(urlFileStack);
urlFileStack.clear();
domainStacks.clear();
urlRAMStack.clear();
resetFileIndex();
@ -544,7 +544,7 @@ public class Balancer {
if (nextentry == null) {
// emergency case: this means that something with the stack organization is wrong
// the file appears to be broken. We kill the file.
Stack.reset(urlFileStack);
urlFileStack.clear();
Log.logSevere("BALANCER", "get() failed to fetch entry from file stack. reset stack file.");
} else {
final String nexthash = new String(nextentry.getColBytes(0));


@ -51,19 +51,16 @@ import de.anomic.yacy.yacyURL;
public class IndexingStack {
Stack sbQueueStack;
CrawlProfile profiles;
plasmaWordIndex wordIndex;
private final File sbQueueStackPath;
ConcurrentHashMap<String, QueueEntry> queueInProcess;
private final Stack sbQueueStack;
private final CrawlProfile profiles;
private final plasmaWordIndex wordIndex;
private final ConcurrentHashMap<String, QueueEntry> queueInProcess;
public IndexingStack(final plasmaWordIndex wordIndex, final File sbQueueStackPath, final CrawlProfile profiles) {
this.sbQueueStackPath = sbQueueStackPath;
this.profiles = profiles;
this.wordIndex = wordIndex;
this.queueInProcess = new ConcurrentHashMap<String, QueueEntry>();
initQueueStack();
this.sbQueueStack = Stack.open(sbQueueStackPath, rowdef);
}
public static final Row rowdef = new Row(
@ -77,18 +74,7 @@ public class IndexingStack {
"String urldescr-80",
NaturalOrder.naturalOrder,
0);
private void initQueueStack() {
sbQueueStack = Stack.open(sbQueueStackPath, rowdef);
}
/*
private void resetQueueStack() {
try {sbQueueStack.close();} catch (Exception e) {}
if (sbQueueStackPath.exists()) sbQueueStackPath.delete();
initQueueStack();
}
*/
public int size() {
return (sbQueueStack == null) ? 0 : sbQueueStack.size();
}
@ -131,14 +117,13 @@ public class IndexingStack {
}
public void clear() {
sbQueueStack = Stack.reset(sbQueueStack);
sbQueueStack.clear();
}
public void close() {
if (sbQueueStack != null) {
sbQueueStack.close();
}
sbQueueStack = null;
}
protected void finalize() throws Throwable {


@ -78,17 +78,14 @@ public final class Stack extends FullRecords {
}
}
public static Stack reset(final Stack stack) {
// memorize settings to this file
final File f = new File(stack.filename);
final Row row = stack.row();
// close and delete the file
try {stack.close();} catch (final Exception e) {}
if (f.exists()) f.delete();
// re-open a database with same settings as before
return open(f, row);
public void clear() {
try {
super.clear();
setHandle(root, null);
setHandle(toor, null);
} catch (IOException e) {
e.printStackTrace();
}
}
public Iterator<Row.Entry> stackIterator(final boolean up) {

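A secondary change in this commit replaces the Stack.reset(stack) idiom (close the backing file, delete it, reopen an empty stack) with an in-place clear() on Stack, which Balancer and IndexingStack now call. A rough sketch of the difference follows; FileStack and its methods are hypothetical stand-ins, not the kelondro Stack API.

// Hedged sketch contrasting the removed reset-by-reopen idiom with the new in-place clear().
import java.io.File;
import java.util.ArrayDeque;
import java.util.Deque;

class FileStack {
    private final File file;
    private final Deque<byte[]> entries = new ArrayDeque<byte[]>();

    private FileStack(final File file) { this.file = file; }

    static FileStack open(final File f) { return new FileStack(f); }

    void close() { /* flush entries and release the backing file */ }

    // old pattern (removed): close the stack, delete its file, and hand back a fresh instance
    static FileStack reset(final FileStack stack) {
        final File f = stack.file;
        try { stack.close(); } catch (final Exception e) { /* ignore */ }
        if (f.exists()) f.delete();
        return open(f);
    }

    // new pattern: empty the structure in place, keeping the same object and file handle
    // (the real Stack.clear() additionally resets its root/toor handles via setHandle(...))
    void clear() {
        entries.clear();
    }
}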

@ -0,0 +1,449 @@
// plasmaWordIndex.java
// (C) 2005, 2006 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 2005 on http://www.anomic.de
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2009-03-13 11:34:51 +0100 (Fr, 13 Mrz 2009) $
// $LastChangedRevision: 5709 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.kelondro.text;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Set;
import java.util.TreeSet;
import de.anomic.kelondro.index.RowCollection;
import de.anomic.kelondro.order.Base64Order;
import de.anomic.kelondro.order.ByteOrder;
import de.anomic.kelondro.order.CloneableIterator;
import de.anomic.kelondro.order.Order;
import de.anomic.kelondro.order.RotateIterator;
import de.anomic.kelondro.text.Index;
import de.anomic.kelondro.text.IndexCache;
import de.anomic.kelondro.text.IndexCollection;
import de.anomic.kelondro.text.ReferenceContainer;
import de.anomic.kelondro.text.ReferenceContainerOrder;
import de.anomic.kelondro.text.ReferenceRow;
import de.anomic.kelondro.text.Word;
import de.anomic.kelondro.util.MemoryControl;
import de.anomic.kelondro.util.Log;
import de.anomic.server.serverProfiling;
public final class CachedIndexCollection implements Index {
// environment constants
public static final long wCacheMaxAge = 1000 * 60 * 30; // milliseconds; 30 minutes
public static final int wCacheMaxChunk = 800; // maximum number of references for each urlhash
public static final int lowcachedivisor = 900;
public static final int maxCollectionPartition = 7; // should be 7
private static final ByteOrder indexOrder = Base64Order.enhancedCoder;
private final IndexCache indexCache;
private final IndexCollection collections; // new database structure to replace AssortmentCluster and FileCluster
public CachedIndexCollection(
File indexPrimaryTextLocation,
final int entityCacheMaxSize,
final boolean useCommons,
final int redundancy,
Log log) throws IOException {
final File textindexcache = new File(indexPrimaryTextLocation, "RICACHE");
if (!(textindexcache.exists())) textindexcache.mkdirs();
if (new File(textindexcache, "index.dhtin.blob").exists()) {
// migration of the both caches into one
this.indexCache = new IndexCache(textindexcache, ReferenceRow.urlEntryRow, entityCacheMaxSize, wCacheMaxChunk, wCacheMaxAge, "index.dhtout.blob", log);
IndexCache dhtInCache = new IndexCache(textindexcache, ReferenceRow.urlEntryRow, entityCacheMaxSize, wCacheMaxChunk, wCacheMaxAge, "index.dhtin.blob", log);
for (ReferenceContainer c: dhtInCache) {
this.indexCache.addReferences(c);
}
new File(textindexcache, "index.dhtin.blob").delete();
} else {
// read in new BLOB
this.indexCache = new IndexCache(textindexcache, ReferenceRow.urlEntryRow, entityCacheMaxSize, wCacheMaxChunk, wCacheMaxAge, "index.dhtout.blob", log);
}
// create collections storage path
final File textindexcollections = new File(indexPrimaryTextLocation, "RICOLLECTION");
if (!(textindexcollections.exists())) textindexcollections.mkdirs();
this.collections = new IndexCollection(
textindexcollections,
"collection",
12,
Base64Order.enhancedCoder,
maxCollectionPartition,
ReferenceRow.urlEntryRow,
useCommons);
}
public void clear() {
indexCache.clear();
try {
collections.clear();
} catch (IOException e) {
e.printStackTrace();
}
}
public int minMem() {
return 1024*1024 /* indexing overhead */ + indexCache.minMem() + collections.minMem();
}
public int maxURLinCache() {
return indexCache.maxURLinCache();
}
public long minAgeOfCache() {
return indexCache.minAgeOfCache();
}
public long maxAgeOfCache() {
return indexCache.maxAgeOfCache();
}
public int indexCacheSize() {
return indexCache.size();
}
public long indexCacheSizeBytes() {
// calculate the real size in bytes of the index cache
long cacheBytes = 0;
final long entryBytes = ReferenceRow.urlEntryRow.objectsize;
final IndexCache cache = (indexCache);
synchronized (cache) {
final Iterator<ReferenceContainer> it = cache.referenceIterator(null, false, true);
while (it.hasNext()) cacheBytes += it.next().size() * entryBytes;
}
return cacheBytes;
}
public void setMaxWordCount(final int maxWords) {
indexCache.setMaxWordCount(maxWords);
}
public void cacheFlushControl(final IndexCache theCache) {
// check for forced flush
int cs = cacheSize();
if (cs > 0) {
// flush elements that are too big. This flushing depends on the fact that the flush rule
// selects the biggest elements first for flushing. If it does not for any reason, the following
// loop would not terminate.
serverProfiling.update("wordcache", Long.valueOf(cs), true);
// To ensure termination an additional counter is used
int l = 0;
while (theCache.size() > 0 && (l++ < 100) && (theCache.maxURLinCache() > wCacheMaxChunk)) {
flushCacheOne(theCache);
}
// next flush more entries if the size exceeds the maximum size of the cache
while (theCache.size() > 0 &&
((theCache.size() > theCache.getMaxWordCount()) ||
(MemoryControl.available() < collections.minMem()))) {
flushCacheOne(theCache);
}
if (cacheSize() != cs) serverProfiling.update("wordcache", Long.valueOf(cacheSize()), true);
}
}
public static ReferenceContainer emptyContainer(final String wordHash, final int elementCount) {
return new ReferenceContainer(wordHash, ReferenceRow.urlEntryRow, elementCount);
}
public void addEntry(final String wordHash, final ReferenceRow entry, final long updateTime) {
// add the entry
indexCache.addEntry(wordHash, entry, updateTime, true);
cacheFlushControl(this.indexCache);
}
public void addReferences(final ReferenceContainer entries) {
assert (entries.row().objectsize == ReferenceRow.urlEntryRow.objectsize);
// add the entry
indexCache.addReferences(entries);
cacheFlushControl(this.indexCache);
}
public void flushCacheFor(int time) {
flushCacheUntil(System.currentTimeMillis() + time);
}
private synchronized void flushCacheUntil(long timeout) {
while (System.currentTimeMillis() < timeout && indexCache.size() > 0) {
flushCacheOne(indexCache);
}
}
private synchronized void flushCacheOne(final IndexCache ram) {
if (ram.size() > 0) collections.addReferences(flushContainer(ram));
}
private ReferenceContainer flushContainer(final IndexCache ram) {
String wordHash;
ReferenceContainer c;
wordHash = ram.maxScoreWordHash();
c = ram.getReferences(wordHash, null);
if ((c != null) && (c.size() > wCacheMaxChunk)) {
return ram.deleteAllReferences(wordHash);
} else {
return ram.deleteAllReferences(ram.bestFlushWordHash());
}
}
public boolean hasReferences(final String wordHash) {
if (indexCache.hasReferences(wordHash)) return true;
if (collections.hasReferences(wordHash)) return true;
return false;
}
public ReferenceContainer getReferences(final String wordHash, final Set<String> urlselection) {
if (wordHash == null) {
// wrong input
return null;
}
// get from cache
ReferenceContainer container;
container = indexCache.getReferences(wordHash, urlselection);
// get from collection index
if (container == null) {
container = collections.getReferences(wordHash, urlselection);
} else {
container.addAllUnique(collections.getReferences(wordHash, urlselection));
}
if (container == null) return null;
// check doubles
final int beforeDouble = container.size();
container.sort();
final ArrayList<RowCollection> d = container.removeDoubles();
RowCollection set;
for (int i = 0; i < d.size(); i++) {
// for each element in the double-set, take that one that is the most recent one
set = d.get(i);
ReferenceRow e, elm = null;
long lm = 0;
for (int j = 0; j < set.size(); j++) {
e = new ReferenceRow(set.get(j, true));
if ((elm == null) || (e.lastModified() > lm)) {
elm = e;
lm = e.lastModified();
}
}
if(elm != null) {
container.addUnique(elm.toKelondroEntry());
}
}
if (container.size() < beforeDouble) System.out.println("*** DEBUG DOUBLECHECK - removed " + (beforeDouble - container.size()) + " index entries from word container " + container.getWordHash());
return container;
}
/**
* return map of wordhash:indexContainer
*
* @param wordHashes
* @param urlselection
* @param deleteIfEmpty
* @param interruptIfEmpty
* @return
*/
public HashMap<String, ReferenceContainer> getContainers(final Set<String> wordHashes, final Set<String> urlselection, final boolean interruptIfEmpty) {
// retrieve entities that belong to the hashes
final HashMap<String, ReferenceContainer> containers = new HashMap<String, ReferenceContainer>(wordHashes.size());
String singleHash;
ReferenceContainer singleContainer;
final Iterator<String> i = wordHashes.iterator();
while (i.hasNext()) {
// get next word hash:
singleHash = i.next();
// retrieve index
singleContainer = getReferences(singleHash, urlselection);
// check result
if (((singleContainer == null) || (singleContainer.size() == 0)) && (interruptIfEmpty)) return new HashMap<String, ReferenceContainer>(0);
containers.put(singleHash, singleContainer);
}
return containers;
}
@SuppressWarnings("unchecked")
public HashMap<String, ReferenceContainer>[] localSearchContainers(
final TreeSet<String> queryHashes,
final TreeSet<String> excludeHashes,
final Set<String> urlselection) {
// search for the set of hashes and return a map of of wordhash:indexContainer containing the seach result
// retrieve entities that belong to the hashes
HashMap<String, ReferenceContainer> inclusionContainers = (queryHashes.size() == 0) ? new HashMap<String, ReferenceContainer>(0) : getContainers(
queryHashes,
urlselection,
true);
if ((inclusionContainers.size() != 0) && (inclusionContainers.size() < queryHashes.size())) inclusionContainers = new HashMap<String, ReferenceContainer>(0); // prevent that only a subset is returned
final HashMap<String, ReferenceContainer> exclusionContainers = (inclusionContainers.size() == 0) ? new HashMap<String, ReferenceContainer>(0) : getContainers(
excludeHashes,
urlselection,
true);
return new HashMap[]{inclusionContainers, exclusionContainers};
}
public int size() {
return java.lang.Math.max(collections.size(), indexCache.size());
}
public int collectionsSize() {
return collections.size();
}
public int cacheSize() {
return indexCache.size();
}
public void close() {
indexCache.close();
collections.close();
}
public ReferenceContainer deleteAllReferences(final String wordHash) {
final ReferenceContainer c = new ReferenceContainer(
wordHash,
ReferenceRow.urlEntryRow,
indexCache.countReferences(wordHash));
c.addAllUnique(indexCache.deleteAllReferences(wordHash));
c.addAllUnique(collections.deleteAllReferences(wordHash));
return c;
}
public boolean removeReference(final String wordHash, final String urlHash) {
boolean removed = false;
removed = removed | (indexCache.removeReference(wordHash, urlHash));
removed = removed | (collections.removeReference(wordHash, urlHash));
return removed;
}
public int removeEntryMultiple(final Set<String> wordHashes, final String urlHash) {
// remove the same url hashes for multiple words
// this is mainly used when correcting a index after a search
final Iterator<String> i = wordHashes.iterator();
int count = 0;
while (i.hasNext()) {
if (removeReference(i.next(), urlHash)) count++;
}
return count;
}
public int removeReferences(final String wordHash, final Set<String> urlHashes) {
int removed = 0;
removed += indexCache.removeReferences(wordHash, urlHashes);
removed += collections.removeReferences(wordHash, urlHashes);
return removed;
}
public String removeEntriesExpl(final String wordHash, final Set<String> urlHashes) {
String removed = "";
removed += indexCache.removeReferences(wordHash, urlHashes) + ", ";
removed += collections.removeReferences(wordHash, urlHashes);
return removed;
}
public void removeEntriesMultiple(final Set<String> wordHashes, final Set<String> urlHashes) {
// remove the same url hashes for multiple words
// this is mainly used when correcting a index after a search
final Iterator<String> i = wordHashes.iterator();
while (i.hasNext()) {
removeReferences(i.next(), urlHashes);
}
}
public int removeWordReferences(final Set<String> words, final String urlhash) {
// sequentially delete all word references
// returns number of deletions
final Iterator<String> iter = words.iterator();
int count = 0;
while (iter.hasNext()) {
// delete the URL reference in this word index
if (removeReference(Word.word2hash(iter.next()), urlhash)) count++;
}
return count;
}
public synchronized TreeSet<ReferenceContainer> indexContainerSet(final String startHash, final boolean ram, final boolean rot, int count) {
// creates a set of indexContainers
// this does not use the cache
final Order<ReferenceContainer> containerOrder = new ReferenceContainerOrder(indexOrder.clone());
containerOrder.rotate(emptyContainer(startHash, 0));
final TreeSet<ReferenceContainer> containers = new TreeSet<ReferenceContainer>(containerOrder);
final Iterator<ReferenceContainer> i = referenceIterator(startHash, rot, ram);
if (ram) count = Math.min(indexCache.size(), count);
ReferenceContainer container;
// this loop does not terminate using the i.hasNex() predicate when rot == true
// because then the underlying iterator is a rotating iterator without termination
// in this case a termination must be ensured with a counter
// It must also be ensured that the counter is in/decreased every loop
while ((count > 0) && (i.hasNext())) {
container = i.next();
if ((container != null) && (container.size() > 0)) {
containers.add(container);
}
count--; // decrease counter even if the container was null or empty to ensure termination
}
return containers; // this may return less containers as demanded
}
public synchronized CloneableIterator<ReferenceContainer> referenceIterator(final String startHash, final boolean rot, final boolean ram) {
final CloneableIterator<ReferenceContainer> i = wordContainers(startHash, ram);
if (rot) {
return new RotateIterator<ReferenceContainer>(i, new String(Base64Order.zero(startHash.length())), indexCache.size() + ((ram) ? 0 : collections.size()));
}
return i;
}
private synchronized CloneableIterator<ReferenceContainer> wordContainers(final String startWordHash, final boolean ram) {
final Order<ReferenceContainer> containerOrder = new ReferenceContainerOrder(indexOrder.clone());
containerOrder.rotate(emptyContainer(startWordHash, 0));
if (ram) {
return indexCache.referenceIterator(startWordHash, false, true);
}
return collections.referenceIterator(startWordHash, false, false);
/*
return new MergeIterator<ReferenceContainer>(
indexCache.referenceIterator(startWordHash, false, true),
collections.referenceIterator(startWordHash, false, false),
containerOrder,
ReferenceContainer.containerMergeMethod,
true);
*/
}
public int countReferences(String key) {
return indexCache.countReferences(key) + collections.countReferences(key);
}
}


@ -38,7 +38,6 @@ import de.anomic.kelondro.index.Row;
import de.anomic.kelondro.index.RowSet;
import de.anomic.kelondro.order.Base64Order;
import de.anomic.kelondro.util.ByteBuffer;
import de.anomic.plasma.plasmaWordIndex;
public class ReferenceContainer extends RowSet {
@ -229,11 +228,11 @@ public class ReferenceContainer extends RowSet {
// join a search result and return the joincount (number of pages after join)
// since this is a conjunction we return an empty entity if any word is not known
if (includeContainers == null) return plasmaWordIndex.emptyContainer(null, 0);
if (includeContainers == null) return CachedIndexCollection.emptyContainer(null, 0);
// join the result
final ReferenceContainer rcLocal = ReferenceContainer.joinContainers(includeContainers, maxDistance);
if (rcLocal == null) return plasmaWordIndex.emptyContainer(null, 0);
if (rcLocal == null) return CachedIndexCollection.emptyContainer(null, 0);
excludeContainers(rcLocal, excludeContainers);
return rcLocal;


@ -36,7 +36,7 @@ public class plasmaDbImporter extends AbstractImporter implements Importer {
super("PLASMADB");
this.homeWordIndex = homeWI;
this.importWordIndex = importWI;
this.importStartSize = this.importWordIndex.size();
this.importStartSize = this.importWordIndex.index().size();
}
/**
@ -93,15 +93,15 @@ public class plasmaDbImporter extends AbstractImporter implements Importer {
try {
this.log.logInfo("Importing DB from '" + this.importWordIndex.getLocation(true).getAbsolutePath() + "'");
this.log.logInfo("Home word index contains " + homeWordIndex.size() + " words and " + homeWordIndex.metadata().size() + " URLs.");
this.log.logInfo("Import word index contains " + this.importWordIndex.size() + " words and " + this.importWordIndex.metadata().size() + " URLs.");
this.log.logInfo("Home word index contains " + homeWordIndex.index().size() + " words and " + homeWordIndex.metadata().size() + " URLs.");
this.log.logInfo("Import word index contains " + this.importWordIndex.index().size() + " words and " + this.importWordIndex.metadata().size() + " URLs.");
final HashSet<String> unknownUrlBuffer = new HashSet<String>();
final HashSet<String> importedUrlBuffer = new HashSet<String>();
// iterate over all words from import db
//Iterator importWordHashIterator = this.importWordIndex.wordHashes(this.wordChunkStartHash, plasmaWordIndex.RL_WORDFILES, false);
Iterator<ReferenceContainer> indexContainerIterator = this.importWordIndex.indexContainerSet(this.wordChunkStartHash, false, false, 100).iterator();
Iterator<ReferenceContainer> indexContainerIterator = this.importWordIndex.index().indexContainerSet(this.wordChunkStartHash, false, false, 100).iterator();
while (!isAborted() && indexContainerIterator.hasNext()) {
final TreeSet<String> entityUrls = new TreeSet<String>();
@ -169,10 +169,10 @@ public class plasmaDbImporter extends AbstractImporter implements Importer {
if (isAborted()) break;
// importing entity container to home db
if (newContainer.size() > 0) { homeWordIndex.addReferences(newContainer); }
if (newContainer.size() > 0) { homeWordIndex.index().addReferences(newContainer); }
// delete complete index entity file
this.importWordIndex.deleteAllReferences(this.wordHash);
this.importWordIndex.index().deleteAllReferences(this.wordHash);
// print out some statistical information
if (this.entryCounter % 500 == 0) {
@ -189,8 +189,8 @@ public class plasmaDbImporter extends AbstractImporter implements Importer {
"Speed: "+ 500*1000/duration + " word entities/s" +
" | Elapsed time: " + DateFormatter.formatInterval(getElapsedTime()) +
" | Estimated time: " + DateFormatter.formatInterval(getEstimatedTime()) + "\n" +
"Home Words = " + homeWordIndex.size() +
" | Import Words = " + this.importWordIndex.size());
"Home Words = " + homeWordIndex.index().size() +
" | Import Words = " + this.importWordIndex.index().size());
this.wordChunkStart = this.wordChunkEnd;
this.wordChunkStartHash = this.wordChunkEndHash;
}
@ -203,7 +203,7 @@ public class plasmaDbImporter extends AbstractImporter implements Importer {
if (!indexContainerIterator.hasNext()) {
// We may not be finished yet, try to get the next chunk of wordHashes
final TreeSet<ReferenceContainer> containers = this.importWordIndex.indexContainerSet(this.wordHash, false, false, 100);
final TreeSet<ReferenceContainer> containers = this.importWordIndex.index().indexContainerSet(this.wordHash, false, false, 100);
indexContainerIterator = containers.iterator();
// Make sure we don't get the same wordhash twice, but don't skip a word
if ((indexContainerIterator.hasNext())&&(!this.wordHash.equals((indexContainerIterator.next()).getWordHash()))) {
@ -212,8 +212,8 @@ public class plasmaDbImporter extends AbstractImporter implements Importer {
}
}
this.log.logInfo("Home word index contains " + homeWordIndex.size() + " words and " + homeWordIndex.metadata().size() + " URLs.");
this.log.logInfo("Import word index contains " + this.importWordIndex.size() + " words and " + this.importWordIndex.metadata().size() + " URLs.");
this.log.logInfo("Home word index contains " + homeWordIndex.index().size() + " words and " + homeWordIndex.metadata().size() + " URLs.");
this.log.logInfo("Import word index contains " + this.importWordIndex.index().size() + " words and " + this.importWordIndex.metadata().size() + " URLs.");
} catch (final Exception e) {
this.log.logSevere("Database import failed.",e);
e.printStackTrace();


@ -248,7 +248,7 @@ public final class plasmaSearchEvent {
if (rw > 0) {
final Set<String> removeWords = cleanEvent.query.queryHashes;
removeWords.addAll(cleanEvent.query.excludeHashes);
cleanEvent.wordIndex.removeEntriesMultiple(removeWords, cleanEvent.failedURLs.keySet());
cleanEvent.wordIndex.index().removeEntriesMultiple(removeWords, cleanEvent.failedURLs.keySet());
Log.logInfo("SearchEvents", "cleaning up event " + cleanEvent.query.id(true) + ", removed " + rw + " URL references on " + removeWords.size() + " words");
}
@ -301,7 +301,7 @@ public final class plasmaSearchEvent {
(query.constraint.get(plasmaCondenser.flag_cat_indexof)) &&
(!(metadata.dc_title().startsWith("Index of")))) {
final Iterator<String> wi = query.queryHashes.iterator();
while (wi.hasNext()) wordIndex.removeReference(wi.next(), page.hash());
while (wi.hasNext()) wordIndex.index().removeReference(wi.next(), page.hash());
registerFailure(page.hash(), "index-of constraint not fullfilled");
return null;
}
@ -824,7 +824,7 @@ public final class plasmaSearchEvent {
String address = null;
if ((seed == null) || ((address = seed.getPublicAddress()) == null)) {
// seed is not known from here
wordIndex.removeWordReferences(
wordIndex.index().removeWordReferences(
plasmaCondenser.getWords(
("yacyshare " +
filename.replace('?', ' ') +


@ -110,7 +110,7 @@ public final class plasmaSearchRankingProcess {
public void execQuery() {
long timer = System.currentTimeMillis();
this.localSearchContainerMaps = wordIndex.localSearchContainers(query.queryHashes, query.excludeHashes, null);
this.localSearchContainerMaps = wordIndex.index().localSearchContainers(query.queryHashes, query.excludeHashes, null);
serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), plasmaSearchEvent.COLLECTION, this.localSearchContainerMaps[0].size(), System.currentTimeMillis() - timer), false);
// join and exclude the local result


@ -952,12 +952,12 @@ public class plasmaSnippetCache {
assert plasmaSwitchboard.getSwitchboard().webIndex != null;
assert event != null : "eventID = " + eventID;
assert event.getQuery() != null;
plasmaSwitchboard.getSwitchboard().webIndex.removeEntryMultiple(event.getQuery().queryHashes, urlHash);
plasmaSwitchboard.getSwitchboard().webIndex.index().removeEntryMultiple(event.getQuery().queryHashes, urlHash);
event.remove(urlHash);
}
if (snippet.getErrorCode() == ERROR_NO_MATCH) {
log.logInfo("error: '" + snippet.getError() + "', remove words '" + querystring + "' for url = " + snippet.getUrl().toNormalform(false, true) + ", cause: " + snippet.getError());
plasmaSwitchboard.getSwitchboard().webIndex.removeEntryMultiple(snippet.remaingHashes, urlHash);
plasmaSwitchboard.getSwitchboard().webIndex.index().removeEntryMultiple(snippet.remaingHashes, urlHash);
plasmaSearchEvent.getEvent(eventID).remove(urlHash);
}
return snippet.getError();


@ -329,7 +329,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
// init a DHT transmission dispatcher
this.dhtDispatcher = new Dispatcher(
webIndex,
webIndex.index(),
webIndex.metadata(),
webIndex.peers(),
true,
@ -1119,12 +1119,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
}
public int rwiCacheSize() {
return webIndex.cacheSize();
return webIndex.index().cacheSize();
}
public boolean rwiCacheFlush() {
if (rwiCacheSize() == 0) return false;
webIndex.flushCacheFor((int) ((this.getConfigLong(plasmaSwitchboardConstants.CACHEFLUSH_BUSYSLEEP, 10000) * this.getConfigLong("performanceIO", 10)) / 100));
webIndex.index().flushCacheFor((int) ((this.getConfigLong(plasmaSwitchboardConstants.CACHEFLUSH_BUSYSLEEP, 10000) * this.getConfigLong("performanceIO", 10)) / 100));
return true;
}
@ -1143,7 +1143,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
public void deQueueFreeMem() {
// flush some entries from the RAM cache
webIndex.flushCacheFor(5000);
webIndex.index().flushCacheFor(5000);
// empty some caches
webIndex.metadata().clearCache();
plasmaSearchEvent.cleanupEvents(true);
@ -1772,7 +1772,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
// delete all word references
int count = 0;
if (words != null) count = webIndex.removeWordReferences(words, urlhash);
if (words != null) count = webIndex.index().removeWordReferences(words, urlhash);
// finally delete the url entry itself
webIndex.metadata().remove(urlhash);
@ -1889,8 +1889,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
if (webIndex.metadata().size() < 10) {
return "no DHT distribution: loadedURL.size() = " + webIndex.metadata().size();
}
if (webIndex.size() < 100) {
return "no DHT distribution: not enough words - wordIndex.size() = " + webIndex.size();
if (webIndex.index().size() < 100) {
return "no DHT distribution: not enough words - wordIndex.size() = " + webIndex.index().size();
}
if ((getConfig(plasmaSwitchboardConstants.INDEX_DIST_ALLOW_WHILE_CRAWLING, "false").equalsIgnoreCase("false")) && (crawlQueues.noticeURL.notEmpty())) {
return "no DHT distribution: crawl in progress: noticeURL.stackSize() = " + crawlQueues.noticeURL.size() + ", sbQueue.size() = " + webIndex.queuePreStack.size();
@ -1992,7 +1992,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
webIndex.peers().mySeed().put(yacySeed.LCOUNT, Integer.toString(webIndex.metadata().size())); // the number of links that the peer has stored (LURL's)
webIndex.peers().mySeed().put(yacySeed.NCOUNT, Integer.toString(crawlQueues.noticeURL.size())); // the number of links that the peer has noticed, but not loaded (NURL's)
webIndex.peers().mySeed().put(yacySeed.RCOUNT, Integer.toString(crawlQueues.noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT))); // the number of links that the peer provides for remote crawling (ZURL's)
webIndex.peers().mySeed().put(yacySeed.ICOUNT, Integer.toString(webIndex.size())); // the minimum number of words that the peer has indexed (as it says)
webIndex.peers().mySeed().put(yacySeed.ICOUNT, Integer.toString(webIndex.index().size())); // the minimum number of words that the peer has indexed (as it says)
webIndex.peers().mySeed().put(yacySeed.SCOUNT, Integer.toString(webIndex.peers().sizeConnected())); // the number of seeds that the peer has stored
webIndex.peers().mySeed().put(yacySeed.CCOUNT, Double.toString(((int) ((webIndex.peers().sizeConnected() + webIndex.peers().sizeDisconnected() + webIndex.peers().sizePotential()) * 60.0 / (uptime + 1.01)) * 100) / 100.0)); // the number of clients that the peer connects (as connects/hour)
webIndex.peers().mySeed().put(yacySeed.VERSION, getConfig("version", ""));


@ -28,55 +28,39 @@ package de.anomic.plasma;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.IndexingStack;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.http.httpdProxyCacheEntry;
import de.anomic.kelondro.index.RowCollection;
import de.anomic.kelondro.order.Base64Order;
import de.anomic.kelondro.order.ByteOrder;
import de.anomic.kelondro.order.CloneableIterator;
import de.anomic.kelondro.order.Order;
import de.anomic.kelondro.order.RotateIterator;
import de.anomic.kelondro.text.Index;
import de.anomic.kelondro.text.IndexCache;
import de.anomic.kelondro.text.IndexCollection;
import de.anomic.kelondro.text.CachedIndexCollection;
import de.anomic.kelondro.text.MetadataRowContainer;
import de.anomic.kelondro.text.ReferenceContainer;
import de.anomic.kelondro.text.ReferenceContainerOrder;
import de.anomic.kelondro.text.ReferenceRow;
import de.anomic.kelondro.text.MetadataRepository;
import de.anomic.kelondro.text.Word;
import de.anomic.kelondro.text.Blacklist;
import de.anomic.kelondro.util.MemoryControl;
import de.anomic.kelondro.util.kelondroException;
import de.anomic.kelondro.util.Log;
import de.anomic.server.serverProfiling;
import de.anomic.tools.iso639;
import de.anomic.xml.RSSFeed;
import de.anomic.xml.RSSMessage;
import de.anomic.yacy.yacySeedDB;
import de.anomic.yacy.yacyURL;
public final class plasmaWordIndex implements Index {
public final class plasmaWordIndex {
// environment constants
public static final long wCacheMaxAge = 1000 * 60 * 30; // milliseconds; 30 minutes
public static final int wCacheMaxChunk = 800; // maximum number of references for each urlhash
public static final int lowcachedivisor = 900;
public static final int maxCollectionPartition = 7; // should be 7
private static final ByteOrder indexOrder = Base64Order.enhancedCoder;
public static final String CRAWL_PROFILE_PROXY = "proxy";
public static final String CRAWL_PROFILE_REMOTE = "remote";
public static final String CRAWL_PROFILE_SNIPPET_LOCAL_TEXT = "snippetLocalText";
@ -93,8 +77,7 @@ public final class plasmaWordIndex implements Index {
public static final long CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE = 60L * 24L * 30L;
private final IndexCache indexCache;
private final IndexCollection collections; // new database structure to replace AssortmentCluster and FileCluster
private final CachedIndexCollection index;
private final Log log;
private MetadataRepository metadata;
private final yacySeedDB peers;
@ -139,34 +122,13 @@ public final class plasmaWordIndex implements Index {
}
}
}
final File textindexcache = new File(indexPrimaryTextLocation, "RICACHE");
if (!(textindexcache.exists())) textindexcache.mkdirs();
if (new File(textindexcache, "index.dhtin.blob").exists()) {
// migration of the both caches into one
this.indexCache = new IndexCache(textindexcache, ReferenceRow.urlEntryRow, entityCacheMaxSize, wCacheMaxChunk, wCacheMaxAge, "index.dhtout.blob", log);
IndexCache dhtInCache = new IndexCache(textindexcache, ReferenceRow.urlEntryRow, entityCacheMaxSize, wCacheMaxChunk, wCacheMaxAge, "index.dhtin.blob", log);
for (ReferenceContainer c: dhtInCache) {
this.indexCache.addReferences(c);
}
new File(textindexcache, "index.dhtin.blob").delete();
} else {
// read in new BLOB
this.indexCache = new IndexCache(textindexcache, ReferenceRow.urlEntryRow, entityCacheMaxSize, wCacheMaxChunk, wCacheMaxAge, "index.dhtout.blob", log);
}
this.index = new CachedIndexCollection(
indexPrimaryTextLocation,
entityCacheMaxSize,
useCommons,
redundancy,
log);
// create collections storage path
final File textindexcollections = new File(indexPrimaryTextLocation, "RICOLLECTION");
if (!(textindexcollections.exists())) textindexcollections.mkdirs();
this.collections = new IndexCollection(
textindexcollections,
"collection",
12,
Base64Order.enhancedCoder,
maxCollectionPartition,
ReferenceRow.urlEntryRow,
useCommons);
// create LURL-db
metadata = new MetadataRepository(new File(this.secondaryRoot, "TEXT"));
@ -249,13 +211,12 @@ public final class plasmaWordIndex implements Index {
return this.peers;
}
public CachedIndexCollection index() {
return this.index;
}
public void clear() {
indexCache.clear();
try {
collections.clear();
} catch (IOException e) {
e.printStackTrace();
}
index.clear();
try {
metadata.clear();
} catch (final IOException e) {
@ -377,111 +338,7 @@ public final class plasmaWordIndex implements Index {
public File getLocation(final boolean primary) {
return (primary) ? this.primaryRoot : this.secondaryRoot;
}
public int minMem() {
return 1024*1024 /* indexing overhead */ + indexCache.minMem() + collections.minMem();
}
public int maxURLinCache() {
return indexCache.maxURLinCache();
}
public long minAgeOfCache() {
return indexCache.minAgeOfCache();
}
public long maxAgeOfCache() {
return indexCache.maxAgeOfCache();
}
public int indexCacheSize() {
return indexCache.size();
}
public long indexCacheSizeBytes() {
// calculate the real size in bytes of the index cache
long cacheBytes = 0;
final long entryBytes = ReferenceRow.urlEntryRow.objectsize;
final IndexCache cache = (indexCache);
synchronized (cache) {
final Iterator<ReferenceContainer> it = cache.referenceIterator(null, false, true);
while (it.hasNext()) cacheBytes += it.next().size() * entryBytes;
}
return cacheBytes;
}
public void setMaxWordCount(final int maxWords) {
indexCache.setMaxWordCount(maxWords);
}
public void cacheFlushControl(final IndexCache theCache) {
// check for forced flush
int cs = cacheSize();
if (cs > 0) {
// flush elements that are too big. This flushing depends on the fact that the flush rule
// selects the biggest elements first for flushing. If it does not for any reason, the following
// loop would not terminate.
serverProfiling.update("wordcache", Long.valueOf(cs), true);
// To ensure termination an additional counter is used
int l = 0;
while (theCache.size() > 0 && (l++ < 100) && (theCache.maxURLinCache() > wCacheMaxChunk)) {
flushCacheOne(theCache);
}
// next flush more entries if the size exceeds the maximum size of the cache
while (theCache.size() > 0 &&
((theCache.size() > theCache.getMaxWordCount()) ||
(MemoryControl.available() < collections.minMem()))) {
flushCacheOne(theCache);
}
if (cacheSize() != cs) serverProfiling.update("wordcache", Long.valueOf(cacheSize()), true);
}
}
public static ReferenceContainer emptyContainer(final String wordHash, final int elementCount) {
return new ReferenceContainer(wordHash, ReferenceRow.urlEntryRow, elementCount);
}
public void addEntry(final String wordHash, final ReferenceRow entry, final long updateTime) {
// add the entry
indexCache.addEntry(wordHash, entry, updateTime, true);
cacheFlushControl(this.indexCache);
}
public void addReferences(final ReferenceContainer entries) {
assert (entries.row().objectsize == ReferenceRow.urlEntryRow.objectsize);
// add the entry
indexCache.addReferences(entries);
cacheFlushControl(this.indexCache);
}
public void flushCacheFor(int time) {
flushCacheUntil(System.currentTimeMillis() + time);
}
private synchronized void flushCacheUntil(long timeout) {
while (System.currentTimeMillis() < timeout && indexCache.size() > 0) {
flushCacheOne(indexCache);
}
}
private synchronized void flushCacheOne(final IndexCache ram) {
if (ram.size() > 0) collections.addReferences(flushContainer(ram));
}
private ReferenceContainer flushContainer(final IndexCache ram) {
String wordHash;
ReferenceContainer c;
wordHash = ram.maxScoreWordHash();
c = ram.getReferences(wordHash, null);
if ((c != null) && (c.size() > wCacheMaxChunk)) {
return ram.deleteAllReferences(wordHash);
} else {
return ram.deleteAllReferences(ram.bestFlushWordHash());
}
}
/**
* this is called by the switchboard to put in a new page into the index
* use all the words in one condenser object to simultanous create index entries
@ -526,221 +383,20 @@ public final class plasmaWordIndex implements Index {
doctype,
outlinksSame, outlinksOther,
wprop.flags);
addEntry(Word.word2hash(word), ientry, System.currentTimeMillis());
this.index.addEntry(Word.word2hash(word), ientry, System.currentTimeMillis());
wordCount++;
}
return wordCount;
}
public boolean hasReferences(final String wordHash) {
if (indexCache.hasReferences(wordHash)) return true;
if (collections.hasReferences(wordHash)) return true;
return false;
}
public ReferenceContainer getReferences(final String wordHash, final Set<String> urlselection) {
if ((wordHash == null) || (wordHash.length() != yacySeedDB.commonHashLength)) {
// wrong input
return null;
}
// get from cache
ReferenceContainer container;
container = indexCache.getReferences(wordHash, urlselection);
// get from collection index
if (container == null) {
container = collections.getReferences(wordHash, urlselection);
} else {
container.addAllUnique(collections.getReferences(wordHash, urlselection));
}
if (container == null) return null;
// check doubles
final int beforeDouble = container.size();
container.sort();
final ArrayList<RowCollection> d = container.removeDoubles();
RowCollection set;
for (int i = 0; i < d.size(); i++) {
// for each element in the double-set, take that one that is the most recent one
set = d.get(i);
ReferenceRow e, elm = null;
long lm = 0;
for (int j = 0; j < set.size(); j++) {
e = new ReferenceRow(set.get(j, true));
if ((elm == null) || (e.lastModified() > lm)) {
elm = e;
lm = e.lastModified();
}
}
if(elm != null) {
container.addUnique(elm.toKelondroEntry());
}
}
if (container.size() < beforeDouble) System.out.println("*** DEBUG DOUBLECHECK - removed " + (beforeDouble - container.size()) + " index entries from word container " + container.getWordHash());
return container;
}
/**
* return map of wordhash:indexContainer
*
* @param wordHashes
* @param urlselection
* @param deleteIfEmpty
* @param interruptIfEmpty
* @return
*/
public HashMap<String, ReferenceContainer> getContainers(final Set<String> wordHashes, final Set<String> urlselection, final boolean interruptIfEmpty) {
// retrieve entities that belong to the hashes
final HashMap<String, ReferenceContainer> containers = new HashMap<String, ReferenceContainer>(wordHashes.size());
String singleHash;
ReferenceContainer singleContainer;
final Iterator<String> i = wordHashes.iterator();
while (i.hasNext()) {
// get next word hash:
singleHash = i.next();
// retrieve index
singleContainer = getReferences(singleHash, urlselection);
// check result
if (((singleContainer == null) || (singleContainer.size() == 0)) && (interruptIfEmpty)) return new HashMap<String, ReferenceContainer>(0);
containers.put(singleHash, singleContainer);
}
return containers;
}
@SuppressWarnings("unchecked")
public HashMap<String, ReferenceContainer>[] localSearchContainers(
final TreeSet<String> queryHashes,
final TreeSet<String> excludeHashes,
final Set<String> urlselection) {
// search for the set of hashes and return a map of of wordhash:indexContainer containing the seach result
// retrieve entities that belong to the hashes
HashMap<String, ReferenceContainer> inclusionContainers = (queryHashes.size() == 0) ? new HashMap<String, ReferenceContainer>(0) : getContainers(
queryHashes,
urlselection,
true);
if ((inclusionContainers.size() != 0) && (inclusionContainers.size() < queryHashes.size())) inclusionContainers = new HashMap<String, ReferenceContainer>(0); // prevent that only a subset is returned
final HashMap<String, ReferenceContainer> exclusionContainers = (inclusionContainers.size() == 0) ? new HashMap<String, ReferenceContainer>(0) : getContainers(
excludeHashes,
urlselection,
true);
return new HashMap[]{inclusionContainers, exclusionContainers};
}
public int size() {
return java.lang.Math.max(collections.size(), indexCache.size());
}
public int collectionsSize() {
return collections.size();
}
public int cacheSize() {
return indexCache.size();
}
public void close() {
indexCache.close();
collections.close();
index.close();
metadata.close();
peers.close();
profilesActiveCrawls.close();
queuePreStack.close();
}
public ReferenceContainer deleteAllReferences(final String wordHash) {
final ReferenceContainer c = new ReferenceContainer(
wordHash,
ReferenceRow.urlEntryRow,
indexCache.countReferences(wordHash));
c.addAllUnique(indexCache.deleteAllReferences(wordHash));
c.addAllUnique(collections.deleteAllReferences(wordHash));
return c;
}
public boolean removeReference(final String wordHash, final String urlHash) {
boolean removed = false;
removed = removed | (indexCache.removeReference(wordHash, urlHash));
removed = removed | (collections.removeReference(wordHash, urlHash));
return removed;
}
public int removeEntryMultiple(final Set<String> wordHashes, final String urlHash) {
// remove the same url hashes for multiple words
// this is mainly used when correcting an index after a search
final Iterator<String> i = wordHashes.iterator();
int count = 0;
while (i.hasNext()) {
if (removeReference(i.next(), urlHash)) count++;
}
return count;
}
public int removeReferences(final String wordHash, final Set<String> urlHashes) {
int removed = 0;
removed += indexCache.removeReferences(wordHash, urlHashes);
removed += collections.removeReferences(wordHash, urlHashes);
return removed;
}
public String removeEntriesExpl(final String wordHash, final Set<String> urlHashes) {
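// "explicit" variant: reports the removal counts of the RAM cache and the collections
// separately, as a comma-separated string, instead of returning a single sum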
String removed = "";
removed += indexCache.removeReferences(wordHash, urlHashes) + ", ";
removed += collections.removeReferences(wordHash, urlHashes);
return removed;
}
public void removeEntriesMultiple(final Set<String> wordHashes, final Set<String> urlHashes) {
// remove the same url hashes for multiple words
// this is mainly used when correcting an index after a search
final Iterator<String> i = wordHashes.iterator();
while (i.hasNext()) {
removeReferences(i.next(), urlHashes);
}
}
public int removeWordReferences(final Set<String> words, final String urlhash) {
// sequentially delete all word references
// returns the number of deletions
final Iterator<String> iter = words.iterator();
int count = 0;
while (iter.hasNext()) {
// delete the URL reference in this word index
if (removeReference(Word.word2hash(iter.next()), urlhash)) count++;
}
return count;
}
public synchronized TreeSet<ReferenceContainer> indexContainerSet(final String startHash, final boolean ram, final boolean rot, int count) {
// creates a set of indexContainers
// this does not use the cache
final Order<ReferenceContainer> containerOrder = new ReferenceContainerOrder(indexOrder.clone());
containerOrder.rotate(emptyContainer(startHash, 0));
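// rotating the order presumably makes startHash the smallest element, so the TreeSet
// below iterates beginning at the requested start hash and wraps around at the end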
final TreeSet<ReferenceContainer> containers = new TreeSet<ReferenceContainer>(containerOrder);
final Iterator<ReferenceContainer> i = referenceIterator(startHash, rot, ram);
if (ram) count = Math.min(indexCache.size(), count);
ReferenceContainer container;
// when rot == true, this loop does not terminate via the i.hasNext() predicate,
// because the underlying iterator is then a rotating iterator without termination;
// in this case termination must be ensured with a counter,
// so the counter has to be decreased in every loop iteration
while ((count > 0) && (i.hasNext())) {
container = i.next();
if ((container != null) && (container.size() > 0)) {
containers.add(container);
}
count--; // decrease counter even if the container was null or empty to ensure termination
}
return containers; // this may return fewer containers than demanded
}
public MetadataRowContainer storeDocument(final IndexingStack.QueueEntry entry, final plasmaParserDocument document, final plasmaCondenser condenser) throws IOException {
final long startTime = System.currentTimeMillis();
@ -856,32 +512,6 @@ public final class plasmaWordIndex implements Index {
return newEntry;
}
public synchronized CloneableIterator<ReferenceContainer> referenceIterator(final String startHash, final boolean rot, final boolean ram) {
final CloneableIterator<ReferenceContainer> i = wordContainers(startHash, ram);
if (rot) {
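// wrap the iterator so it continues from the smallest hash (Base64Order.zero yields the
// all-zero hash of the given length); the size argument presumably limits the rotation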
return new RotateIterator<ReferenceContainer>(i, new String(Base64Order.zero(startHash.length())), indexCache.size() + ((ram) ? 0 : collections.size()));
}
return i;
}
private synchronized CloneableIterator<ReferenceContainer> wordContainers(final String startWordHash, final boolean ram) {
final Order<ReferenceContainer> containerOrder = new ReferenceContainerOrder(indexOrder.clone());
containerOrder.rotate(emptyContainer(startWordHash, 0));
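// note: ram == true iterates only the RAM cache, otherwise only the on-disk collections;
// containerOrder is only needed by the merged iteration below, which is currently disabled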
if (ram) {
return indexCache.referenceIterator(startWordHash, false, true);
}
return collections.referenceIterator(startWordHash, false, false);
/*
return new MergeIterator<ReferenceContainer>(
indexCache.referenceIterator(startWordHash, false, true),
collections.referenceIterator(startWordHash, false, false),
containerOrder,
ReferenceContainer.containerMergeMethod,
true);
*/
}
// The Cleaner class was provided as "UrldbCleaner" by Hydrox
public synchronized ReferenceCleaner getReferenceCleaner(final String startHash) {
return new ReferenceCleaner(startHash);
@ -899,7 +529,7 @@ public final class plasmaWordIndex implements Index {
public ReferenceCleaner(final String startHash) {
this.startHash = startHash;
this.rwiCountAtStart = size();
this.rwiCountAtStart = index().size();
}
public void run() {
@ -908,7 +538,7 @@ public final class plasmaWordIndex implements Index {
ReferenceRow entry = null;
yacyURL url = null;
final HashSet<String> urlHashs = new HashSet<String>();
Iterator<ReferenceContainer> indexContainerIterator = indexContainerSet(startHash, false, false, 100).iterator();
Iterator<ReferenceContainer> indexContainerIterator = index.indexContainerSet(startHash, false, false, 100).iterator();
while (indexContainerIterator.hasNext() && run) {
waiter();
container = indexContainerIterator.next();
@ -930,7 +560,7 @@ public final class plasmaWordIndex implements Index {
}
}
if (urlHashs.size() > 0) {
final int removed = removeReferences(container.getWordHash(), urlHashs);
final int removed = index.removeReferences(container.getWordHash(), urlHashs);
Log.logFine("INDEXCLEANER", container.getWordHash() + ": " + removed + " of " + container.size() + " URL-entries deleted");
lastWordHash = container.getWordHash();
lastDeletionCounter = urlHashs.size();
@ -938,7 +568,7 @@ public final class plasmaWordIndex implements Index {
}
if (!containerIterator.hasNext()) {
// We may not be finished yet, try to get the next chunk of wordHashes
final TreeSet<ReferenceContainer> containers = indexContainerSet(container.getWordHash(), false, false, 100);
final TreeSet<ReferenceContainer> containers = index.indexContainerSet(container.getWordHash(), false, false, 100);
indexContainerIterator = containers.iterator();
// Make sure we don't get the same wordhash twice, but don't skip a word
if ((indexContainerIterator.hasNext()) && (!container.getWordHash().equals(indexContainerIterator.next().getWordHash()))) {
@ -988,9 +618,4 @@ public final class plasmaWordIndex implements Index {
}
}
}
public int countReferences(String key) {
return indexCache.countReferences(key) + collections.countReferences(key);
}
}

View File

@ -69,6 +69,7 @@ import de.anomic.http.httpRequestHeader;
import de.anomic.kelondro.order.Base64Order;
import de.anomic.kelondro.order.Bitfield;
import de.anomic.kelondro.order.Digest;
import de.anomic.kelondro.text.CachedIndexCollection;
import de.anomic.kelondro.text.MetadataRowContainer;
import de.anomic.kelondro.text.Reference;
import de.anomic.kelondro.text.ReferenceContainer;
@ -529,7 +530,7 @@ public final class yacyClient {
final int words = wordhashes.length() / yacySeedDB.commonHashLength;
final ReferenceContainer[] container = new ReferenceContainer[words];
for (int i = 0; i < words; i++) {
container[i] = plasmaWordIndex.emptyContainer(wordhashes.substring(i * yacySeedDB.commonHashLength, (i + 1) * yacySeedDB.commonHashLength), count);
container[i] = CachedIndexCollection.emptyContainer(wordhashes.substring(i * yacySeedDB.commonHashLength, (i + 1) * yacySeedDB.commonHashLength), count);
}
// insert results to containers
@ -638,7 +639,7 @@ public final class yacyClient {
// insert the containers to the index
for (int m = 0; m < words; m++) {
wordIndex.addReferences(container[m]);
wordIndex.index().addReferences(container[m]);
}
// generate statistics

View File

@ -676,7 +676,7 @@ public final class yacy {
if (cacheMem < 2048000) throw new OutOfMemoryError("Not enough memory available to start clean up.");
final plasmaWordIndex wordIndex = new plasmaWordIndex(networkName, log, indexPrimaryRoot, indexSecondaryRoot, 10000, false, 1, 0);
final Iterator<ReferenceContainer> indexContainerIterator = wordIndex.referenceIterator("AAAAAAAAAAAA", false, false);
final Iterator<ReferenceContainer> indexContainerIterator = wordIndex.index().referenceIterator("AAAAAAAAAAAA", false, false);
long urlCounter = 0, wordCounter = 0;
long wordChunkStart = System.currentTimeMillis(), wordChunkEnd = 0;
@ -867,7 +867,7 @@ public final class yacy {
Iterator<ReferenceContainer> indexContainerIterator = null;
if (resource.equals("all")) {
WordIndex = new plasmaWordIndex("freeworld", log, indexPrimaryRoot, indexSecondaryRoot, 10000, false, 1, 0);
indexContainerIterator = WordIndex.referenceIterator(wordChunkStartHash, false, false);
indexContainerIterator = WordIndex.index().referenceIterator(wordChunkStartHash, false, false);
}
int counter = 0;
ReferenceContainer container = null;