Mirror of https://github.com/yacy/yacy_search_server.git (synced 2024-09-19 00:01:41 +02:00)
refactoring of plasmaWordIndex: fewer methods in the class; the index was separated out into CachedIndexCollection
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5710 6c8d7289-2bf4-0310-a012-ef5d649a1542
parent 14a1c33823
commit 7f67238f8b
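The whole commit follows one mechanical pattern: the reverse-word-index (RWI) cache and collection logic moves out of plasmaWordIndex into the new CachedIndexCollection, and every call site changes from sb.webIndex.xxx(...) to sb.webIndex.index().xxx(...). A minimal sketch of that facade extraction, with reduced stand-in bodies (only the index() accessor is verbatim from the diff):

// Sketch of the delegation introduced by this commit; the class bodies are
// illustrative stand-ins, only the index() accessor mirrors the real diff.
final class CachedIndexCollection {
    private int words = 0; // stand-in for the IndexCache + IndexCollection state

    public int size() { return words; }      // was plasmaWordIndex.size()
    public int cacheSize() { return words; } // was plasmaWordIndex.cacheSize()
}

final class plasmaWordIndex {
    private final CachedIndexCollection index = new CachedIndexCollection();

    // the new accessor: call sites change from webIndex.size()
    // to webIndex.index().size()
    public CachedIndexCollection index() { return this.index; }
}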
@@ -94,7 +94,7 @@ public class IndexCleaner_p {
         prop.put("rwidb_threadAlive", indexCleanerThread.isAlive() + "");
         prop.put("rwidb_threadToString", indexCleanerThread.toString());
         prop.putNum("rwidb_RWIcountstart", indexCleanerThread.rwiCountAtStart);
-        prop.putNum("rwidb_RWIcountnow", sb.webIndex.size());
+        prop.putNum("rwidb_RWIcountnow", sb.webIndex.index().size());
         prop.put("rwidb_wordHashNow", indexCleanerThread.wordHashNow);
         prop.put("rwidb_lastWordHash", indexCleanerThread.lastWordHash);
         prop.putNum("rwidb_lastDeletionCounter", indexCleanerThread.lastDeletionCounter);
@@ -124,7 +124,7 @@ public class IndexControlRWIs_p {
         if (delurl || delurlref) {
             // generate an urlx array
             ReferenceContainer index = null;
-            index = sb.webIndex.getReferences(keyhash, null);
+            index = sb.webIndex.index().getReferences(keyhash, null);
             final Iterator<ReferenceRow> en = index.entries();
             int i = 0;
             urlx = new String[index.size()];
@@ -141,7 +141,7 @@ public class IndexControlRWIs_p {
                 sb.urlRemove(urlx[i]);
             }
         }
-        sb.webIndex.deleteAllReferences(keyhash);
+        sb.webIndex.index().deleteAllReferences(keyhash);
         post.remove("keyhashdeleteall");
         post.put("urllist", "generated");
     }
@@ -158,7 +158,7 @@ public class IndexControlRWIs_p {
         }
         final Set<String> urlHashes = new HashSet<String>();
         for (int i = 0; i < urlx.length; i++) urlHashes.add(urlx[i]);
-        sb.webIndex.removeReferences(keyhash, urlHashes);
+        sb.webIndex.index().removeReferences(keyhash, urlHashes);
         // this shall lead to a presentation of the list; so handle that the remaining program
         // thinks that it was called for a list presentation
         post.remove("keyhashdelete");
@@ -200,7 +200,7 @@ public class IndexControlRWIs_p {
         // prepare index
         ReferenceContainer index;
         final long starttime = System.currentTimeMillis();
-        index = sb.webIndex.getReferences(keyhash, null);
+        index = sb.webIndex.index().getReferences(keyhash, null);
         // built urlCache
         final Iterator<ReferenceRow> urlIter = index.entries();
         final HashMap<String, MetadataRowContainer> knownURLs = new HashMap<String, MetadataRowContainer>();
@@ -237,7 +237,7 @@ public class IndexControlRWIs_p {

         // generate list
         if (post.containsKey("keyhashsimilar")) {
-            final Iterator<ReferenceContainer> containerIt = sb.webIndex.indexContainerSet(keyhash, false, true, 256).iterator();
+            final Iterator<ReferenceContainer> containerIt = sb.webIndex.index().indexContainerSet(keyhash, false, true, 256).iterator();
             ReferenceContainer container;
             int i = 0;
             int rows = 0, cols = 0;
@@ -315,7 +315,7 @@ public class IndexControlRWIs_p {
                 } catch (final IOException e) {
                 }
             }
-            sb.webIndex.removeReferences(keyhash, urlHashes);
+            sb.webIndex.index().removeReferences(keyhash, urlHashes);
         }

         if (prop.getInt("searchresult", 0) == 3) plasmaSearchAPI.listHosts(prop, keyhash, sb);
@@ -323,7 +323,7 @@ public class IndexControlRWIs_p {


         // insert constants
-        prop.putNum("wcount", sb.webIndex.size());
+        prop.putNum("wcount", sb.webIndex.index().size());
         // return rewrite properties
         return prop;
     }
@@ -182,7 +182,7 @@ public class IndexControlURLs_p {
         // generate list
         if (post.containsKey("urlhashsimilar")) {
             try {
-                final Iterator<MetadataRowContainer> entryIt = new RotateIterator<MetadataRowContainer>(sb.webIndex.metadata().entries(true, urlhash), new String(Base64Order.zero((urlhash == null ? 0 : urlhash.length()))), sb.webIndex.size());
+                final Iterator<MetadataRowContainer> entryIt = new RotateIterator<MetadataRowContainer>(sb.webIndex.metadata().entries(true, urlhash), new String(Base64Order.zero((urlhash == null ? 0 : urlhash.length()))), sb.webIndex.index().size());
                 final StringBuilder result = new StringBuilder("Sequential List of URL-Hashes:<br />");
                 MetadataRowContainer entry;
                 int i = 0;
@@ -106,7 +106,7 @@ public final class IndexImport_p {
             }
         }

-        prop.putNum("wcount", switchboard.webIndex.size());
+        prop.putNum("wcount", switchboard.webIndex.index().size());
         prop.putNum("ucount", switchboard.webIndex.metadata().size());

         /*
@@ -55,7 +55,7 @@ public class IndexShare_p {
         prop.put("wordfreq", switchboard.getConfigLong("defaultWordReceiveFrequency",10));
         prop.put("dtable", "");
         prop.put("rtable", "");
-        prop.putNum("wcount", switchboard.webIndex.size());
+        prop.putNum("wcount", switchboard.webIndex.index().size());
         prop.putNum("ucount", switchboard.webIndex.metadata().size());
         return prop; // be save
     }
@@ -68,7 +68,7 @@ public class IndexShare_p {
         }

         // insert constants
-        prop.putNum("wcount", switchboard.webIndex.size());
+        prop.putNum("wcount", switchboard.webIndex.index().size());
         prop.putNum("ucount", switchboard.webIndex.metadata().size());

         // return rewrite properties
@@ -41,7 +41,7 @@ public class PerformanceGraph {
         final int width = post.getInt("width", 660);
         final int height = post.getInt("height", 240);

-        return plasmaProfiling.performanceGraph(width, height, sb.webIndex.metadata().size() + " URLS / " + sb.webIndex.collectionsSize() + " WORDS IN COLLECTIONS / " + sb.webIndex.cacheSize() + " WORDS IN CACHE");
+        return plasmaProfiling.performanceGraph(width, height, sb.webIndex.metadata().size() + " URLS / " + sb.webIndex.index().collectionsSize() + " WORDS IN COLLECTIONS / " + sb.webIndex.index().cacheSize() + " WORDS IN CACHE");
     }

 }
@@ -199,7 +199,7 @@ public class PerformanceQueues_p {
             // disallow setting of memprereq for indexer to prevent db from throwing OOMs
             prop.put("table_" + c + "_disabled", /*(threadName.endsWith("_indexing")) ? 1 :*/ "0");
             prop.put("table_" + c + "_recommendation", threadName.endsWith("_indexing") ? "1" : "0");
-            prop.putNum("table_" + c + "_recommendation_value", threadName.endsWith("_indexing") ? (switchboard.webIndex.minMem() / 1024) : 0);
+            prop.putNum("table_" + c + "_recommendation_value", threadName.endsWith("_indexing") ? (switchboard.webIndex.index().minMem() / 1024) : 0);
             c++;
         }
         prop.put("table", c);
@@ -229,7 +229,7 @@ public class PerformanceQueues_p {
         if ((post != null) && (post.containsKey("cacheSizeSubmit"))) {
             final int wordCacheMaxCount = post.getInt("wordCacheMaxCount", 20000);
             switchboard.setConfig(plasmaSwitchboardConstants.WORDCACHE_MAX_COUNT, Integer.toString(wordCacheMaxCount));
-            switchboard.webIndex.setMaxWordCount(wordCacheMaxCount);
+            switchboard.webIndex.index().setMaxWordCount(wordCacheMaxCount);

             final int wordCacheInitCount = post.getInt(plasmaSwitchboardConstants.WORDCACHE_INIT_COUNT, 30000);
             switchboard.setConfig(plasmaSwitchboardConstants.WORDCACHE_INIT_COUNT, Integer.toString(wordCacheInitCount));
@@ -288,11 +288,11 @@ public class PerformanceQueues_p {

         // table cache settings
         prop.putNum("urlCacheSize", switchboard.webIndex.metadata().writeCacheSize());
-        prop.putNum("wordCacheSize", switchboard.webIndex.indexCacheSize());
-        prop.putNum("wordCacheSizeKBytes", switchboard.webIndex.indexCacheSizeBytes()/1024);
-        prop.putNum("maxURLinCache", switchboard.webIndex.maxURLinCache());
-        prop.putNum("maxAgeOfCache", switchboard.webIndex.maxAgeOfCache() / 1000 / 60); // minutes
-        prop.putNum("minAgeOfCache", switchboard.webIndex.minAgeOfCache() / 1000 / 60); // minutes
+        prop.putNum("wordCacheSize", switchboard.webIndex.index().indexCacheSize());
+        prop.putNum("wordCacheSizeKBytes", switchboard.webIndex.index().indexCacheSizeBytes()/1024);
+        prop.putNum("maxURLinCache", switchboard.webIndex.index().maxURLinCache());
+        prop.putNum("maxAgeOfCache", switchboard.webIndex.index().maxAgeOfCache() / 1000 / 60); // minutes
+        prop.putNum("minAgeOfCache", switchboard.webIndex.index().minAgeOfCache() / 1000 / 60); // minutes
         prop.putNum("maxWaitingWordFlush", switchboard.getConfigLong("maxWaitingWordFlush", 180));
         prop.put("wordCacheMaxCount", switchboard.getConfigLong(plasmaSwitchboardConstants.WORDCACHE_MAX_COUNT, 20000));
         prop.put("wordCacheInitCount", switchboard.getConfigLong(plasmaSwitchboardConstants.WORDCACHE_INIT_COUNT, 30000));
@@ -42,7 +42,7 @@ public class queues_p {
         prop.putNum("indexingSize", sb.getThread(plasmaSwitchboardConstants.INDEXER).getJobCount() + sb.webIndex.queuePreStack.getActiveQueueSize());
         prop.putNum("indexingMax", (int) sb.getConfigLong(plasmaSwitchboardConstants.INDEXER_SLOTS, 30));
         prop.putNum("urlpublictextSize", sb.webIndex.metadata().size());
-        prop.putNum("rwipublictextSize", sb.webIndex.size());
+        prop.putNum("rwipublictextSize", sb.webIndex.index().size());
         if ((sb.webIndex.queuePreStack.size() == 0) && (sb.webIndex.queuePreStack.getActiveQueueSize() == 0)) {
             prop.put("list", "0"); //is empty
         } else {
@@ -21,11 +21,11 @@ public class status_p {
         prop.setLocalized(false);
         prop.put("rejected", "0");
         sb.updateMySeed();
-        final int cacheSize = sb.webIndex.indexCacheSize();
+        final int cacheSize = sb.webIndex.index().indexCacheSize();
         final long cacheMaxSize = sb.getConfigLong(plasmaSwitchboardConstants.WORDCACHE_MAX_COUNT, 10000);
         prop.putNum("ppm", sb.currentPPM());
         prop.putNum("qpm", sb.webIndex.peers().mySeed().getQPM());
-        prop.putNum("wordCacheSize", sb.webIndex.indexCacheSize());
+        prop.putNum("wordCacheSize", sb.webIndex.index().indexCacheSize());
         prop.putNum("wordCacheSize", cacheSize);
         prop.putNum("wordCacheMaxSize", cacheMaxSize);
         prop.put("wordCacheCount", cacheSize);
@@ -78,7 +78,7 @@ public final class timeline {
         yacyCore.log.logInfo("INIT TIMELINE SEARCH: " + plasmaSearchQuery.anonymizedQueryHashes(query[0]) + " - " + count + " links");

         // get the index container with the result vector
-        HashMap<String, ReferenceContainer>[] localSearchContainerMaps = sb.webIndex.localSearchContainers(query[0], query[1], null);
+        HashMap<String, ReferenceContainer>[] localSearchContainerMaps = sb.webIndex.index().localSearchContainers(query[0], query[1], null);
         final ReferenceContainer index =
                 ReferenceContainer.joinExcludeContainers(
                         localSearchContainerMaps[0].values(),
@@ -82,13 +82,13 @@ public final class query {
         if (obj.equals("rwiurlcount")) {
             // the total number of different urls in the rwi is returned
             // <env> shall contain a word hash, the number of assigned lurls to this hash is returned
-            prop.put("response", sb.webIndex.getReferences(env, null).size());
+            prop.put("response", sb.webIndex.index().getReferences(env, null).size());
             return prop;
         }

         if (obj.equals("rwicount")) {
             // return the total number of available word indexes
-            prop.put("response", sb.webIndex.size());
+            prop.put("response", sb.webIndex.index().size());
             return prop;
         }

@@ -185,7 +185,7 @@ public final class search {
             yacyCore.log.logInfo("INIT HASH SEARCH (abstracts only): " + plasmaSearchQuery.anonymizedQueryHashes(theQuery.queryHashes) + " - " + theQuery.displayResults() + " links");

             final long timer = System.currentTimeMillis();
-            final Map<String, ReferenceContainer>[] containers = sb.webIndex.localSearchContainers(theQuery.queryHashes, theQuery.excludeHashes, plasmaSearchQuery.hashes2Set(urls));
+            final Map<String, ReferenceContainer>[] containers = sb.webIndex.index().localSearchContainers(theQuery.queryHashes, theQuery.excludeHashes, plasmaSearchQuery.hashes2Set(urls));

             serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(theQuery.id(true), plasmaSearchEvent.COLLECTION, containers[0].size(), System.currentTimeMillis() - timer), false);
             if (containers != null) {
@@ -100,9 +100,9 @@ public final class transferRWI {
             sb.getLog().logInfo("Rejecting RWIs from peer " + otherPeerName + ". Not granted.");
             result = "not_granted";
             pause = 0;
-        } else if (sb.webIndex.indexCacheSize() > cachelimit) {
+        } else if (sb.webIndex.index().indexCacheSize() > cachelimit) {
             // we are too busy to receive indexes
-            sb.getLog().logInfo("Rejecting RWIs from peer " + otherPeerName + ". We are too busy (buffersize=" + sb.webIndex.indexCacheSize() + ").");
+            sb.getLog().logInfo("Rejecting RWIs from peer " + otherPeerName + ". We are too busy (buffersize=" + sb.webIndex.index().indexCacheSize() + ").");
             granted = false; // don't accept more words if there are too many words to flush
             result = "busy";
             pause = 60000;
@@ -157,7 +157,7 @@ public final class transferRWI {
                 }

                 // learn entry
-                sb.webIndex.addEntry(wordHash, iEntry, System.currentTimeMillis());
+                sb.webIndex.index().addEntry(wordHash, iEntry, System.currentTimeMillis());
                 serverCore.checkInterruption();

                 // check if we need to ask for the corresponding URL
@@ -193,7 +193,7 @@ public final class transferRWI {
             }
             result = "ok";

-            pause = (int) (sb.webIndex.indexCacheSize() * 20000 / sb.getConfigLong(plasmaSwitchboardConstants.WORDCACHE_MAX_COUNT, 100000)); // estimation of necessary pause time
+            pause = (int) (sb.webIndex.index().indexCacheSize() * 20000 / sb.getConfigLong(plasmaSwitchboardConstants.WORDCACHE_MAX_COUNT, 100000)); // estimation of necessary pause time
         }

         prop.put("unknownURL", unknownURLs.toString());
@@ -315,7 +315,7 @@ public class yacysearch {

             // delete the index entry locally
             final String delHash = post.get("deleteref", ""); // urlhash
-            sb.webIndex.removeWordReferences(query[0], delHash);
+            sb.webIndex.index().removeWordReferences(query[0], delHash);

             // make new news message with negative voting
             final HashMap<String, String> map = new HashMap<String, String>();
@@ -81,7 +81,7 @@ public class Balancer {
         if (urlFileStack.size() != urlFileIndex.size() || (urlFileIndex.size() < 10000 && urlFileIndex.size() > 0)) {
             // fix the file stack
             Log.logInfo("Balancer", "re-creating the " + stackname + " balancer stack, size = " + urlFileIndex.size() + ((urlFileStack.size() == urlFileIndex.size()) ? "" : " (the old stack size was wrong)" ));
-            urlFileStack = Stack.reset(urlFileStack);
+            urlFileStack.clear();
             try {
                 final Iterator<byte[]> i = urlFileIndex.keys(true, null);
                 byte[] hash;
@@ -130,7 +130,7 @@ public class Balancer {
     }

     public synchronized void clear() {
-        urlFileStack = Stack.reset(urlFileStack);
+        urlFileStack.clear();
         domainStacks.clear();
         urlRAMStack.clear();
         resetFileIndex();
@@ -544,7 +544,7 @@ public class Balancer {
             if (nextentry == null) {
                 // emergency case: this means that something with the stack organization is wrong
                 // the file appears to be broken. We kill the file.
-                Stack.reset(urlFileStack);
+                urlFileStack.clear();
                 Log.logSevere("BALANCER", "get() failed to fetch entry from file stack. reset stack file.");
             } else {
                 final String nexthash = new String(nextentry.getColBytes(0));
@@ -51,19 +51,16 @@ import de.anomic.yacy.yacyURL;

 public class IndexingStack {

-    Stack sbQueueStack;
-    CrawlProfile profiles;
-    plasmaWordIndex wordIndex;
     private final File sbQueueStackPath;
-    ConcurrentHashMap<String, QueueEntry> queueInProcess;
+    private final Stack sbQueueStack;
+    private final CrawlProfile profiles;
+    private final plasmaWordIndex wordIndex;
+    private final ConcurrentHashMap<String, QueueEntry> queueInProcess;

     public IndexingStack(final plasmaWordIndex wordIndex, final File sbQueueStackPath, final CrawlProfile profiles) {
         this.sbQueueStackPath = sbQueueStackPath;
         this.profiles = profiles;
         this.wordIndex = wordIndex;
         this.queueInProcess = new ConcurrentHashMap<String, QueueEntry>();

-        initQueueStack();
+        this.sbQueueStack = Stack.open(sbQueueStackPath, rowdef);
     }

     public static final Row rowdef = new Row(
@@ -77,18 +74,7 @@ public class IndexingStack {
             "String urldescr-80",
             NaturalOrder.naturalOrder,
             0);

-    private void initQueueStack() {
-        sbQueueStack = Stack.open(sbQueueStackPath, rowdef);
-    }
-
-    /*
-    private void resetQueueStack() {
-        try {sbQueueStack.close();} catch (Exception e) {}
-        if (sbQueueStackPath.exists()) sbQueueStackPath.delete();
-        initQueueStack();
-    }
-    */
-
     public int size() {
         return (sbQueueStack == null) ? 0 : sbQueueStack.size();
     }
@@ -131,14 +117,13 @@ public class IndexingStack {
     }

     public void clear() {
-        sbQueueStack = Stack.reset(sbQueueStack);
+        sbQueueStack.clear();
     }

     public void close() {
         if (sbQueueStack != null) {
             sbQueueStack.close();
         }
-        sbQueueStack = null;
     }

     protected void finalize() throws Throwable {
@@ -78,17 +78,14 @@ public final class Stack extends FullRecords {
         }
     }

-    public static Stack reset(final Stack stack) {
-        // memorize settings to this file
-        final File f = new File(stack.filename);
-        final Row row = stack.row();
-
-        // close and delete the file
-        try {stack.close();} catch (final Exception e) {}
-        if (f.exists()) f.delete();
-
-        // re-open a database with same settings as before
-        return open(f, row);
+    public void clear() {
+        try {
+            super.clear();
+            setHandle(root, null);
+            setHandle(toor, null);
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
     }

     public Iterator<Row.Entry> stackIterator(final boolean up) {
source/de/anomic/kelondro/text/CachedIndexCollection.java (new file, 449 lines added)
@@ -0,0 +1,449 @@
// plasmaWordIndex.java
// (C) 2005, 2006 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 2005 on http://www.anomic.de
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2009-03-13 11:34:51 +0100 (Fr, 13 Mrz 2009) $
// $LastChangedRevision: 5709 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

package de.anomic.kelondro.text;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Set;
import java.util.TreeSet;

import de.anomic.kelondro.index.RowCollection;
import de.anomic.kelondro.order.Base64Order;
import de.anomic.kelondro.order.ByteOrder;
import de.anomic.kelondro.order.CloneableIterator;
import de.anomic.kelondro.order.Order;
import de.anomic.kelondro.order.RotateIterator;
import de.anomic.kelondro.text.Index;
import de.anomic.kelondro.text.IndexCache;
import de.anomic.kelondro.text.IndexCollection;
import de.anomic.kelondro.text.ReferenceContainer;
import de.anomic.kelondro.text.ReferenceContainerOrder;
import de.anomic.kelondro.text.ReferenceRow;
import de.anomic.kelondro.text.Word;
import de.anomic.kelondro.util.MemoryControl;
import de.anomic.kelondro.util.Log;
import de.anomic.server.serverProfiling;

public final class CachedIndexCollection implements Index {

    // environment constants
    public static final long wCacheMaxAge = 1000 * 60 * 30; // milliseconds; 30 minutes
    public static final int wCacheMaxChunk = 800; // maximum number of references for each urlhash
    public static final int lowcachedivisor = 900;
    public static final int maxCollectionPartition = 7; // should be 7
    private static final ByteOrder indexOrder = Base64Order.enhancedCoder;


    private final IndexCache indexCache;
    private final IndexCollection collections; // new database structure to replace AssortmentCluster and FileCluster

    public CachedIndexCollection(
            File indexPrimaryTextLocation,
            final int entityCacheMaxSize,
            final boolean useCommons,
            final int redundancy,
            Log log) throws IOException {

        final File textindexcache = new File(indexPrimaryTextLocation, "RICACHE");
        if (!(textindexcache.exists())) textindexcache.mkdirs();
        if (new File(textindexcache, "index.dhtin.blob").exists()) {
            // migration of the both caches into one
            this.indexCache = new IndexCache(textindexcache, ReferenceRow.urlEntryRow, entityCacheMaxSize, wCacheMaxChunk, wCacheMaxAge, "index.dhtout.blob", log);
            IndexCache dhtInCache = new IndexCache(textindexcache, ReferenceRow.urlEntryRow, entityCacheMaxSize, wCacheMaxChunk, wCacheMaxAge, "index.dhtin.blob", log);
            for (ReferenceContainer c: dhtInCache) {
                this.indexCache.addReferences(c);
            }
            new File(textindexcache, "index.dhtin.blob").delete();
        } else {
            // read in new BLOB
            this.indexCache = new IndexCache(textindexcache, ReferenceRow.urlEntryRow, entityCacheMaxSize, wCacheMaxChunk, wCacheMaxAge, "index.dhtout.blob", log);
        }

        // create collections storage path
        final File textindexcollections = new File(indexPrimaryTextLocation, "RICOLLECTION");
        if (!(textindexcollections.exists())) textindexcollections.mkdirs();
        this.collections = new IndexCollection(
                textindexcollections,
                "collection",
                12,
                Base64Order.enhancedCoder,
                maxCollectionPartition,
                ReferenceRow.urlEntryRow,
                useCommons);
    }

    public void clear() {
        indexCache.clear();
        try {
            collections.clear();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public int minMem() {
        return 1024*1024 /* indexing overhead */ + indexCache.minMem() + collections.minMem();
    }

    public int maxURLinCache() {
        return indexCache.maxURLinCache();
    }

    public long minAgeOfCache() {
        return indexCache.minAgeOfCache();
    }

    public long maxAgeOfCache() {
        return indexCache.maxAgeOfCache();
    }

    public int indexCacheSize() {
        return indexCache.size();
    }

    public long indexCacheSizeBytes() {
        // calculate the real size in bytes of the index cache
        long cacheBytes = 0;
        final long entryBytes = ReferenceRow.urlEntryRow.objectsize;
        final IndexCache cache = (indexCache);
        synchronized (cache) {
            final Iterator<ReferenceContainer> it = cache.referenceIterator(null, false, true);
            while (it.hasNext()) cacheBytes += it.next().size() * entryBytes;
        }
        return cacheBytes;
    }

    public void setMaxWordCount(final int maxWords) {
        indexCache.setMaxWordCount(maxWords);
    }

    public void cacheFlushControl(final IndexCache theCache) {
        // check for forced flush
        int cs = cacheSize();
        if (cs > 0) {
            // flush elements that are too big. This flushing depends on the fact that the flush rule
            // selects the biggest elements first for flushing. If it does not for any reason, the following
            // loop would not terminate.
            serverProfiling.update("wordcache", Long.valueOf(cs), true);
            // To ensure termination an additional counter is used
            int l = 0;
            while (theCache.size() > 0 && (l++ < 100) && (theCache.maxURLinCache() > wCacheMaxChunk)) {
                flushCacheOne(theCache);
            }
            // next flush more entries if the size exceeds the maximum size of the cache
            while (theCache.size() > 0 &&
                    ((theCache.size() > theCache.getMaxWordCount()) ||
                     (MemoryControl.available() < collections.minMem()))) {
                flushCacheOne(theCache);
            }
            if (cacheSize() != cs) serverProfiling.update("wordcache", Long.valueOf(cacheSize()), true);
        }
    }

    public static ReferenceContainer emptyContainer(final String wordHash, final int elementCount) {
        return new ReferenceContainer(wordHash, ReferenceRow.urlEntryRow, elementCount);
    }

    public void addEntry(final String wordHash, final ReferenceRow entry, final long updateTime) {
        // add the entry
        indexCache.addEntry(wordHash, entry, updateTime, true);
        cacheFlushControl(this.indexCache);
    }

    public void addReferences(final ReferenceContainer entries) {
        assert (entries.row().objectsize == ReferenceRow.urlEntryRow.objectsize);

        // add the entry
        indexCache.addReferences(entries);
        cacheFlushControl(this.indexCache);
    }

    public void flushCacheFor(int time) {
        flushCacheUntil(System.currentTimeMillis() + time);
    }

    private synchronized void flushCacheUntil(long timeout) {
        while (System.currentTimeMillis() < timeout && indexCache.size() > 0) {
            flushCacheOne(indexCache);
        }
    }

    private synchronized void flushCacheOne(final IndexCache ram) {
        if (ram.size() > 0) collections.addReferences(flushContainer(ram));
    }

    private ReferenceContainer flushContainer(final IndexCache ram) {
        String wordHash;
        ReferenceContainer c;
        wordHash = ram.maxScoreWordHash();
        c = ram.getReferences(wordHash, null);
        if ((c != null) && (c.size() > wCacheMaxChunk)) {
            return ram.deleteAllReferences(wordHash);
        } else {
            return ram.deleteAllReferences(ram.bestFlushWordHash());
        }
    }

    public boolean hasReferences(final String wordHash) {
        if (indexCache.hasReferences(wordHash)) return true;
        if (collections.hasReferences(wordHash)) return true;
        return false;
    }

    public ReferenceContainer getReferences(final String wordHash, final Set<String> urlselection) {
        if (wordHash == null) {
            // wrong input
            return null;
        }

        // get from cache
        ReferenceContainer container;
        container = indexCache.getReferences(wordHash, urlselection);

        // get from collection index
        if (container == null) {
            container = collections.getReferences(wordHash, urlselection);
        } else {
            container.addAllUnique(collections.getReferences(wordHash, urlselection));
        }

        if (container == null) return null;

        // check doubles
        final int beforeDouble = container.size();
        container.sort();
        final ArrayList<RowCollection> d = container.removeDoubles();
        RowCollection set;
        for (int i = 0; i < d.size(); i++) {
            // for each element in the double-set, take that one that is the most recent one
            set = d.get(i);
            ReferenceRow e, elm = null;
            long lm = 0;
            for (int j = 0; j < set.size(); j++) {
                e = new ReferenceRow(set.get(j, true));
                if ((elm == null) || (e.lastModified() > lm)) {
                    elm = e;
                    lm = e.lastModified();
                }
            }
            if(elm != null) {
                container.addUnique(elm.toKelondroEntry());
            }
        }
        if (container.size() < beforeDouble) System.out.println("*** DEBUG DOUBLECHECK - removed " + (beforeDouble - container.size()) + " index entries from word container " + container.getWordHash());

        return container;
    }

    /**
     * return map of wordhash:indexContainer
     *
     * @param wordHashes
     * @param urlselection
     * @param deleteIfEmpty
     * @param interruptIfEmpty
     * @return
     */
    public HashMap<String, ReferenceContainer> getContainers(final Set<String> wordHashes, final Set<String> urlselection, final boolean interruptIfEmpty) {
        // retrieve entities that belong to the hashes
        final HashMap<String, ReferenceContainer> containers = new HashMap<String, ReferenceContainer>(wordHashes.size());
        String singleHash;
        ReferenceContainer singleContainer;
        final Iterator<String> i = wordHashes.iterator();
        while (i.hasNext()) {

            // get next word hash:
            singleHash = i.next();

            // retrieve index
            singleContainer = getReferences(singleHash, urlselection);

            // check result
            if (((singleContainer == null) || (singleContainer.size() == 0)) && (interruptIfEmpty)) return new HashMap<String, ReferenceContainer>(0);

            containers.put(singleHash, singleContainer);
        }
        return containers;
    }

    @SuppressWarnings("unchecked")
    public HashMap<String, ReferenceContainer>[] localSearchContainers(
            final TreeSet<String> queryHashes,
            final TreeSet<String> excludeHashes,
            final Set<String> urlselection) {
        // search for the set of hashes and return a map of of wordhash:indexContainer containing the seach result

        // retrieve entities that belong to the hashes
        HashMap<String, ReferenceContainer> inclusionContainers = (queryHashes.size() == 0) ? new HashMap<String, ReferenceContainer>(0) : getContainers(
                queryHashes,
                urlselection,
                true);
        if ((inclusionContainers.size() != 0) && (inclusionContainers.size() < queryHashes.size())) inclusionContainers = new HashMap<String, ReferenceContainer>(0); // prevent that only a subset is returned
        final HashMap<String, ReferenceContainer> exclusionContainers = (inclusionContainers.size() == 0) ? new HashMap<String, ReferenceContainer>(0) : getContainers(
                excludeHashes,
                urlselection,
                true);
        return new HashMap[]{inclusionContainers, exclusionContainers};
    }

    public int size() {
        return java.lang.Math.max(collections.size(), indexCache.size());
    }

    public int collectionsSize() {
        return collections.size();
    }

    public int cacheSize() {
        return indexCache.size();
    }

    public void close() {
        indexCache.close();
        collections.close();
    }

    public ReferenceContainer deleteAllReferences(final String wordHash) {
        final ReferenceContainer c = new ReferenceContainer(
                wordHash,
                ReferenceRow.urlEntryRow,
                indexCache.countReferences(wordHash));
        c.addAllUnique(indexCache.deleteAllReferences(wordHash));
        c.addAllUnique(collections.deleteAllReferences(wordHash));
        return c;
    }

    public boolean removeReference(final String wordHash, final String urlHash) {
        boolean removed = false;
        removed = removed | (indexCache.removeReference(wordHash, urlHash));
        removed = removed | (collections.removeReference(wordHash, urlHash));
        return removed;
    }

    public int removeEntryMultiple(final Set<String> wordHashes, final String urlHash) {
        // remove the same url hashes for multiple words
        // this is mainly used when correcting a index after a search
        final Iterator<String> i = wordHashes.iterator();
        int count = 0;
        while (i.hasNext()) {
            if (removeReference(i.next(), urlHash)) count++;
        }
        return count;
    }

    public int removeReferences(final String wordHash, final Set<String> urlHashes) {
        int removed = 0;
        removed += indexCache.removeReferences(wordHash, urlHashes);
        removed += collections.removeReferences(wordHash, urlHashes);
        return removed;
    }

    public String removeEntriesExpl(final String wordHash, final Set<String> urlHashes) {
        String removed = "";
        removed += indexCache.removeReferences(wordHash, urlHashes) + ", ";
        removed += collections.removeReferences(wordHash, urlHashes);
        return removed;
    }

    public void removeEntriesMultiple(final Set<String> wordHashes, final Set<String> urlHashes) {
        // remove the same url hashes for multiple words
        // this is mainly used when correcting a index after a search
        final Iterator<String> i = wordHashes.iterator();
        while (i.hasNext()) {
            removeReferences(i.next(), urlHashes);
        }
    }

    public int removeWordReferences(final Set<String> words, final String urlhash) {
        // sequentially delete all word references
        // returns number of deletions
        final Iterator<String> iter = words.iterator();
        int count = 0;
        while (iter.hasNext()) {
            // delete the URL reference in this word index
            if (removeReference(Word.word2hash(iter.next()), urlhash)) count++;
        }
        return count;
    }

    public synchronized TreeSet<ReferenceContainer> indexContainerSet(final String startHash, final boolean ram, final boolean rot, int count) {
        // creates a set of indexContainers
        // this does not use the cache
        final Order<ReferenceContainer> containerOrder = new ReferenceContainerOrder(indexOrder.clone());
        containerOrder.rotate(emptyContainer(startHash, 0));
        final TreeSet<ReferenceContainer> containers = new TreeSet<ReferenceContainer>(containerOrder);
        final Iterator<ReferenceContainer> i = referenceIterator(startHash, rot, ram);
        if (ram) count = Math.min(indexCache.size(), count);
        ReferenceContainer container;
        // this loop does not terminate using the i.hasNex() predicate when rot == true
        // because then the underlying iterator is a rotating iterator without termination
        // in this case a termination must be ensured with a counter
        // It must also be ensured that the counter is in/decreased every loop
        while ((count > 0) && (i.hasNext())) {
            container = i.next();
            if ((container != null) && (container.size() > 0)) {
                containers.add(container);
            }
            count--; // decrease counter even if the container was null or empty to ensure termination
        }
        return containers; // this may return less containers as demanded
    }

    public synchronized CloneableIterator<ReferenceContainer> referenceIterator(final String startHash, final boolean rot, final boolean ram) {
        final CloneableIterator<ReferenceContainer> i = wordContainers(startHash, ram);
        if (rot) {
            return new RotateIterator<ReferenceContainer>(i, new String(Base64Order.zero(startHash.length())), indexCache.size() + ((ram) ? 0 : collections.size()));
        }
        return i;
    }

    private synchronized CloneableIterator<ReferenceContainer> wordContainers(final String startWordHash, final boolean ram) {
        final Order<ReferenceContainer> containerOrder = new ReferenceContainerOrder(indexOrder.clone());
        containerOrder.rotate(emptyContainer(startWordHash, 0));
        if (ram) {
            return indexCache.referenceIterator(startWordHash, false, true);
        }
        return collections.referenceIterator(startWordHash, false, false);
        /*
        return new MergeIterator<ReferenceContainer>(
                indexCache.referenceIterator(startWordHash, false, true),
                collections.referenceIterator(startWordHash, false, false),
                containerOrder,
                ReferenceContainer.containerMergeMethod,
                true);
        */
    }

    public int countReferences(String key) {
        return indexCache.countReferences(key) + collections.countReferences(key);
    }

}
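Two things are worth noting about the class above before the remaining hunks: reads (getReferences) consult the RAM cache first and then merge in the on-disk collections, and size() reports the maximum of the two tiers rather than their sum, because a word hash may be present in both and adding the counts would double-count. A hypothetical usage sketch; the directory path, cache parameters and the in-scope arguments are illustrative assumptions, not values taken from the YaCy configuration:

import java.io.File;
import java.io.IOException;

import de.anomic.kelondro.text.CachedIndexCollection;
import de.anomic.kelondro.text.ReferenceContainer;
import de.anomic.kelondro.text.ReferenceRow;
import de.anomic.kelondro.util.Log;

class CachedIndexCollectionDemo {
    // log, wordHash and entry are assumed to be supplied by the caller
    static void demo(Log log, String wordHash, ReferenceRow entry) throws IOException {
        final CachedIndexCollection index = new CachedIndexCollection(
                new File("DATA/INDEX"), // assumed primary text index location
                20000,                  // entityCacheMaxSize (illustrative)
                false,                  // useCommons
                0,                      // redundancy
                log);

        // writes land in the RAM cache; cacheFlushControl() migrates oversized
        // or surplus containers into the on-disk collections
        index.addEntry(wordHash, entry, System.currentTimeMillis());

        // reads hit the cache first, then merge in the on-disk references
        final ReferenceContainer refs = index.getReferences(wordHash, null);
        System.out.println("references for " + wordHash + ": " + (refs == null ? 0 : refs.size()));

        // size() is max(cache, collections), not the sum
        System.out.println("word hashes in index: " + index.size());

        index.close();
    }
}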
@@ -38,7 +38,6 @@ import de.anomic.kelondro.index.Row;
 import de.anomic.kelondro.index.RowSet;
 import de.anomic.kelondro.order.Base64Order;
 import de.anomic.kelondro.util.ByteBuffer;
-import de.anomic.plasma.plasmaWordIndex;

 public class ReferenceContainer extends RowSet {

@@ -229,11 +228,11 @@ public class ReferenceContainer extends RowSet {
         // join a search result and return the joincount (number of pages after join)

         // since this is a conjunction we return an empty entity if any word is not known
-        if (includeContainers == null) return plasmaWordIndex.emptyContainer(null, 0);
+        if (includeContainers == null) return CachedIndexCollection.emptyContainer(null, 0);

         // join the result
         final ReferenceContainer rcLocal = ReferenceContainer.joinContainers(includeContainers, maxDistance);
-        if (rcLocal == null) return plasmaWordIndex.emptyContainer(null, 0);
+        if (rcLocal == null) return CachedIndexCollection.emptyContainer(null, 0);
         excludeContainers(rcLocal, excludeContainers);

         return rcLocal;
@@ -36,7 +36,7 @@ public class plasmaDbImporter extends AbstractImporter implements Importer {
         super("PLASMADB");
         this.homeWordIndex = homeWI;
         this.importWordIndex = importWI;
-        this.importStartSize = this.importWordIndex.size();
+        this.importStartSize = this.importWordIndex.index().size();
     }

     /**
@@ -93,15 +93,15 @@ public class plasmaDbImporter extends AbstractImporter implements Importer {

         try {
             this.log.logInfo("Importing DB from '" + this.importWordIndex.getLocation(true).getAbsolutePath() + "'");
-            this.log.logInfo("Home word index contains " + homeWordIndex.size() + " words and " + homeWordIndex.metadata().size() + " URLs.");
-            this.log.logInfo("Import word index contains " + this.importWordIndex.size() + " words and " + this.importWordIndex.metadata().size() + " URLs.");
+            this.log.logInfo("Home word index contains " + homeWordIndex.index().size() + " words and " + homeWordIndex.metadata().size() + " URLs.");
+            this.log.logInfo("Import word index contains " + this.importWordIndex.index().size() + " words and " + this.importWordIndex.metadata().size() + " URLs.");

             final HashSet<String> unknownUrlBuffer = new HashSet<String>();
             final HashSet<String> importedUrlBuffer = new HashSet<String>();

             // iterate over all words from import db
             //Iterator importWordHashIterator = this.importWordIndex.wordHashes(this.wordChunkStartHash, plasmaWordIndex.RL_WORDFILES, false);
-            Iterator<ReferenceContainer> indexContainerIterator = this.importWordIndex.indexContainerSet(this.wordChunkStartHash, false, false, 100).iterator();
+            Iterator<ReferenceContainer> indexContainerIterator = this.importWordIndex.index().indexContainerSet(this.wordChunkStartHash, false, false, 100).iterator();
             while (!isAborted() && indexContainerIterator.hasNext()) {

                 final TreeSet<String> entityUrls = new TreeSet<String>();
@@ -169,10 +169,10 @@ public class plasmaDbImporter extends AbstractImporter implements Importer {
                 if (isAborted()) break;

                 // importing entity container to home db
-                if (newContainer.size() > 0) { homeWordIndex.addReferences(newContainer); }
+                if (newContainer.size() > 0) { homeWordIndex.index().addReferences(newContainer); }

                 // delete complete index entity file
-                this.importWordIndex.deleteAllReferences(this.wordHash);
+                this.importWordIndex.index().deleteAllReferences(this.wordHash);

                 // print out some statistical information
                 if (this.entryCounter % 500 == 0) {
@@ -189,8 +189,8 @@ public class plasmaDbImporter extends AbstractImporter implements Importer {
                             "Speed: "+ 500*1000/duration + " word entities/s" +
                             " | Elapsed time: " + DateFormatter.formatInterval(getElapsedTime()) +
                             " | Estimated time: " + DateFormatter.formatInterval(getEstimatedTime()) + "\n" +
-                            "Home Words = " + homeWordIndex.size() +
-                            " | Import Words = " + this.importWordIndex.size());
+                            "Home Words = " + homeWordIndex.index().size() +
+                            " | Import Words = " + this.importWordIndex.index().size());
                     this.wordChunkStart = this.wordChunkEnd;
                     this.wordChunkStartHash = this.wordChunkEndHash;
                 }
@@ -203,7 +203,7 @@ public class plasmaDbImporter extends AbstractImporter implements Importer {

                 if (!indexContainerIterator.hasNext()) {
                     // We may not be finished yet, try to get the next chunk of wordHashes
-                    final TreeSet<ReferenceContainer> containers = this.importWordIndex.indexContainerSet(this.wordHash, false, false, 100);
+                    final TreeSet<ReferenceContainer> containers = this.importWordIndex.index().indexContainerSet(this.wordHash, false, false, 100);
                     indexContainerIterator = containers.iterator();
                     // Make sure we don't get the same wordhash twice, but don't skip a word
                     if ((indexContainerIterator.hasNext())&&(!this.wordHash.equals((indexContainerIterator.next()).getWordHash()))) {
@@ -212,8 +212,8 @@ public class plasmaDbImporter extends AbstractImporter implements Importer {
                 }
             }

-            this.log.logInfo("Home word index contains " + homeWordIndex.size() + " words and " + homeWordIndex.metadata().size() + " URLs.");
-            this.log.logInfo("Import word index contains " + this.importWordIndex.size() + " words and " + this.importWordIndex.metadata().size() + " URLs.");
+            this.log.logInfo("Home word index contains " + homeWordIndex.index().size() + " words and " + homeWordIndex.metadata().size() + " URLs.");
+            this.log.logInfo("Import word index contains " + this.importWordIndex.index().size() + " words and " + this.importWordIndex.metadata().size() + " URLs.");
         } catch (final Exception e) {
             this.log.logSevere("Database import failed.",e);
             e.printStackTrace();
@@ -248,7 +248,7 @@ public final class plasmaSearchEvent {
                 if (rw > 0) {
                     final Set<String> removeWords = cleanEvent.query.queryHashes;
                     removeWords.addAll(cleanEvent.query.excludeHashes);
-                    cleanEvent.wordIndex.removeEntriesMultiple(removeWords, cleanEvent.failedURLs.keySet());
+                    cleanEvent.wordIndex.index().removeEntriesMultiple(removeWords, cleanEvent.failedURLs.keySet());
                     Log.logInfo("SearchEvents", "cleaning up event " + cleanEvent.query.id(true) + ", removed " + rw + " URL references on " + removeWords.size() + " words");
                 }

@@ -301,7 +301,7 @@ public final class plasmaSearchEvent {
             (query.constraint.get(plasmaCondenser.flag_cat_indexof)) &&
             (!(metadata.dc_title().startsWith("Index of")))) {
             final Iterator<String> wi = query.queryHashes.iterator();
-            while (wi.hasNext()) wordIndex.removeReference(wi.next(), page.hash());
+            while (wi.hasNext()) wordIndex.index().removeReference(wi.next(), page.hash());
             registerFailure(page.hash(), "index-of constraint not fullfilled");
             return null;
         }
@@ -824,7 +824,7 @@ public final class plasmaSearchEvent {
         String address = null;
         if ((seed == null) || ((address = seed.getPublicAddress()) == null)) {
             // seed is not known from here
-            wordIndex.removeWordReferences(
+            wordIndex.index().removeWordReferences(
                     plasmaCondenser.getWords(
                             ("yacyshare " +
                              filename.replace('?', ' ') +
@@ -110,7 +110,7 @@ public final class plasmaSearchRankingProcess {
     public void execQuery() {

         long timer = System.currentTimeMillis();
-        this.localSearchContainerMaps = wordIndex.localSearchContainers(query.queryHashes, query.excludeHashes, null);
+        this.localSearchContainerMaps = wordIndex.index().localSearchContainers(query.queryHashes, query.excludeHashes, null);
         serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), plasmaSearchEvent.COLLECTION, this.localSearchContainerMaps[0].size(), System.currentTimeMillis() - timer), false);

         // join and exclude the local result
@@ -952,12 +952,12 @@ public class plasmaSnippetCache {
             assert plasmaSwitchboard.getSwitchboard().webIndex != null;
             assert event != null : "eventID = " + eventID;
             assert event.getQuery() != null;
-            plasmaSwitchboard.getSwitchboard().webIndex.removeEntryMultiple(event.getQuery().queryHashes, urlHash);
+            plasmaSwitchboard.getSwitchboard().webIndex.index().removeEntryMultiple(event.getQuery().queryHashes, urlHash);
             event.remove(urlHash);
         }
         if (snippet.getErrorCode() == ERROR_NO_MATCH) {
             log.logInfo("error: '" + snippet.getError() + "', remove words '" + querystring + "' for url = " + snippet.getUrl().toNormalform(false, true) + ", cause: " + snippet.getError());
-            plasmaSwitchboard.getSwitchboard().webIndex.removeEntryMultiple(snippet.remaingHashes, urlHash);
+            plasmaSwitchboard.getSwitchboard().webIndex.index().removeEntryMultiple(snippet.remaingHashes, urlHash);
             plasmaSearchEvent.getEvent(eventID).remove(urlHash);
         }
         return snippet.getError();
@@ -329,7 +329,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.

         // init a DHT transmission dispatcher
         this.dhtDispatcher = new Dispatcher(
-                webIndex,
+                webIndex.index(),
                 webIndex.metadata(),
                 webIndex.peers(),
                 true,
@@ -1119,12 +1119,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
     }

     public int rwiCacheSize() {
-        return webIndex.cacheSize();
+        return webIndex.index().cacheSize();
     }

     public boolean rwiCacheFlush() {
         if (rwiCacheSize() == 0) return false;
-        webIndex.flushCacheFor((int) ((this.getConfigLong(plasmaSwitchboardConstants.CACHEFLUSH_BUSYSLEEP, 10000) * this.getConfigLong("performanceIO", 10)) / 100));
+        webIndex.index().flushCacheFor((int) ((this.getConfigLong(plasmaSwitchboardConstants.CACHEFLUSH_BUSYSLEEP, 10000) * this.getConfigLong("performanceIO", 10)) / 100));
         return true;
     }

@@ -1143,7 +1143,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.

     public void deQueueFreeMem() {
         // flush some entries from the RAM cache
-        webIndex.flushCacheFor(5000);
+        webIndex.index().flushCacheFor(5000);
         // empty some caches
         webIndex.metadata().clearCache();
         plasmaSearchEvent.cleanupEvents(true);
@@ -1772,7 +1772,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.

         // delete all word references
         int count = 0;
-        if (words != null) count = webIndex.removeWordReferences(words, urlhash);
+        if (words != null) count = webIndex.index().removeWordReferences(words, urlhash);

         // finally delete the url entry itself
         webIndex.metadata().remove(urlhash);
@@ -1889,8 +1889,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
         if (webIndex.metadata().size() < 10) {
             return "no DHT distribution: loadedURL.size() = " + webIndex.metadata().size();
         }
-        if (webIndex.size() < 100) {
-            return "no DHT distribution: not enough words - wordIndex.size() = " + webIndex.size();
+        if (webIndex.index().size() < 100) {
+            return "no DHT distribution: not enough words - wordIndex.size() = " + webIndex.index().size();
         }
         if ((getConfig(plasmaSwitchboardConstants.INDEX_DIST_ALLOW_WHILE_CRAWLING, "false").equalsIgnoreCase("false")) && (crawlQueues.noticeURL.notEmpty())) {
             return "no DHT distribution: crawl in progress: noticeURL.stackSize() = " + crawlQueues.noticeURL.size() + ", sbQueue.size() = " + webIndex.queuePreStack.size();
@@ -1992,7 +1992,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
         webIndex.peers().mySeed().put(yacySeed.LCOUNT, Integer.toString(webIndex.metadata().size())); // the number of links that the peer has stored (LURL's)
         webIndex.peers().mySeed().put(yacySeed.NCOUNT, Integer.toString(crawlQueues.noticeURL.size())); // the number of links that the peer has noticed, but not loaded (NURL's)
         webIndex.peers().mySeed().put(yacySeed.RCOUNT, Integer.toString(crawlQueues.noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT))); // the number of links that the peer provides for remote crawling (ZURL's)
-        webIndex.peers().mySeed().put(yacySeed.ICOUNT, Integer.toString(webIndex.size())); // the minimum number of words that the peer has indexed (as it says)
+        webIndex.peers().mySeed().put(yacySeed.ICOUNT, Integer.toString(webIndex.index().size())); // the minimum number of words that the peer has indexed (as it says)
         webIndex.peers().mySeed().put(yacySeed.SCOUNT, Integer.toString(webIndex.peers().sizeConnected())); // the number of seeds that the peer has stored
         webIndex.peers().mySeed().put(yacySeed.CCOUNT, Double.toString(((int) ((webIndex.peers().sizeConnected() + webIndex.peers().sizeDisconnected() + webIndex.peers().sizePotential()) * 60.0 / (uptime + 1.01)) * 100) / 100.0)); // the number of clients that the peer connects (as connects/hour)
         webIndex.peers().mySeed().put(yacySeed.VERSION, getConfig("version", ""));
@ -28,55 +28,39 @@ package de.anomic.plasma;
|
|||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Date;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import de.anomic.crawler.CrawlProfile;
|
||||
import de.anomic.crawler.IndexingStack;
|
||||
import de.anomic.htmlFilter.htmlFilterContentScraper;
|
||||
import de.anomic.http.httpdProxyCacheEntry;
|
||||
import de.anomic.kelondro.index.RowCollection;
|
||||
import de.anomic.kelondro.order.Base64Order;
|
||||
import de.anomic.kelondro.order.ByteOrder;
|
||||
import de.anomic.kelondro.order.CloneableIterator;
|
||||
import de.anomic.kelondro.order.Order;
|
||||
import de.anomic.kelondro.order.RotateIterator;
|
||||
import de.anomic.kelondro.text.Index;
|
||||
import de.anomic.kelondro.text.IndexCache;
|
||||
import de.anomic.kelondro.text.IndexCollection;
|
||||
import de.anomic.kelondro.text.CachedIndexCollection;
|
||||
import de.anomic.kelondro.text.MetadataRowContainer;
|
||||
import de.anomic.kelondro.text.ReferenceContainer;
|
||||
import de.anomic.kelondro.text.ReferenceContainerOrder;
|
||||
import de.anomic.kelondro.text.ReferenceRow;
|
||||
import de.anomic.kelondro.text.MetadataRepository;
|
||||
import de.anomic.kelondro.text.Word;
|
||||
import de.anomic.kelondro.text.Blacklist;
|
||||
import de.anomic.kelondro.util.MemoryControl;
|
||||
import de.anomic.kelondro.util.kelondroException;
|
||||
import de.anomic.kelondro.util.Log;
|
||||
import de.anomic.server.serverProfiling;
|
||||
import de.anomic.tools.iso639;
|
||||
import de.anomic.xml.RSSFeed;
|
||||
import de.anomic.xml.RSSMessage;
|
||||
import de.anomic.yacy.yacySeedDB;
|
||||
import de.anomic.yacy.yacyURL;
|
||||
|
||||
public final class plasmaWordIndex implements Index {
|
||||
public final class plasmaWordIndex {
|
||||
|
||||
// environment constants
|
||||
public static final long wCacheMaxAge = 1000 * 60 * 30; // milliseconds; 30 minutes
|
||||
public static final int wCacheMaxChunk = 800; // maximum number of references for each urlhash
|
||||
public static final int lowcachedivisor = 900;
|
||||
public static final int maxCollectionPartition = 7; // should be 7
|
||||
private static final ByteOrder indexOrder = Base64Order.enhancedCoder;
|
||||
|
||||
|
||||
public static final String CRAWL_PROFILE_PROXY = "proxy";
|
||||
public static final String CRAWL_PROFILE_REMOTE = "remote";
|
||||
public static final String CRAWL_PROFILE_SNIPPET_LOCAL_TEXT = "snippetLocalText";
|
||||
|
@ -93,8 +77,7 @@ public final class plasmaWordIndex implements Index {
|
|||
public static final long CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE = 60L * 24L * 30L;
|
||||
|
||||
|
||||
private final IndexCache indexCache;
|
||||
private final IndexCollection collections; // new database structure to replace AssortmentCluster and FileCluster
|
||||
private final CachedIndexCollection index;
|
||||
private final Log log;
|
||||
private MetadataRepository metadata;
|
||||
private final yacySeedDB peers;
|
||||
|
@ -139,34 +122,13 @@ public final class plasmaWordIndex implements Index {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
final File textindexcache = new File(indexPrimaryTextLocation, "RICACHE");
|
||||
if (!(textindexcache.exists())) textindexcache.mkdirs();
|
||||
if (new File(textindexcache, "index.dhtin.blob").exists()) {
|
||||
// migration of the both caches into one
|
||||
this.indexCache = new IndexCache(textindexcache, ReferenceRow.urlEntryRow, entityCacheMaxSize, wCacheMaxChunk, wCacheMaxAge, "index.dhtout.blob", log);
|
||||
IndexCache dhtInCache = new IndexCache(textindexcache, ReferenceRow.urlEntryRow, entityCacheMaxSize, wCacheMaxChunk, wCacheMaxAge, "index.dhtin.blob", log);
|
||||
for (ReferenceContainer c: dhtInCache) {
|
||||
this.indexCache.addReferences(c);
|
||||
}
|
||||
new File(textindexcache, "index.dhtin.blob").delete();
|
||||
} else {
|
||||
// read in new BLOB
|
||||
this.indexCache = new IndexCache(textindexcache, ReferenceRow.urlEntryRow, entityCacheMaxSize, wCacheMaxChunk, wCacheMaxAge, "index.dhtout.blob", log);
|
||||
}
|
||||
this.index = new CachedIndexCollection(
|
||||
indexPrimaryTextLocation,
|
||||
entityCacheMaxSize,
|
||||
useCommons,
|
||||
redundancy,
|
||||
log);
|
||||
|
||||
// create collections storage path
|
||||
final File textindexcollections = new File(indexPrimaryTextLocation, "RICOLLECTION");
|
||||
if (!(textindexcollections.exists())) textindexcollections.mkdirs();
|
||||
this.collections = new IndexCollection(
|
||||
textindexcollections,
|
||||
"collection",
|
||||
12,
|
||||
Base64Order.enhancedCoder,
|
||||
maxCollectionPartition,
|
||||
ReferenceRow.urlEntryRow,
|
||||
useCommons);
|
||||
|
||||
// create LURL-db
|
||||
metadata = new MetadataRepository(new File(this.secondaryRoot, "TEXT"));
|
||||
|
||||
|
@ -249,13 +211,12 @@ public final class plasmaWordIndex implements Index {
|
|||
return this.peers;
|
||||
}
|
||||
|
||||
public CachedIndexCollection index() {
|
||||
return this.index;
|
||||
}
|
||||
|
||||
public void clear() {
|
||||
indexCache.clear();
|
||||
try {
|
||||
collections.clear();
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
index.clear();
|
||||
try {
|
||||
metadata.clear();
|
||||
} catch (final IOException e) {
|
||||
|
@@ -377,111 +338,7 @@ public final class plasmaWordIndex implements Index {
    public File getLocation(final boolean primary) {
        return (primary) ? this.primaryRoot : this.secondaryRoot;
    }

    public int minMem() {
        return 1024*1024 /* indexing overhead */ + indexCache.minMem() + collections.minMem();
    }

    public int maxURLinCache() {
        return indexCache.maxURLinCache();
    }

    public long minAgeOfCache() {
        return indexCache.minAgeOfCache();
    }

    public long maxAgeOfCache() {
        return indexCache.maxAgeOfCache();
    }

    public int indexCacheSize() {
        return indexCache.size();
    }

    public long indexCacheSizeBytes() {
        // calculate the real size in bytes of the index cache
        long cacheBytes = 0;
        final long entryBytes = ReferenceRow.urlEntryRow.objectsize;
        final IndexCache cache = (indexCache);
        synchronized (cache) {
            final Iterator<ReferenceContainer> it = cache.referenceIterator(null, false, true);
            while (it.hasNext()) cacheBytes += it.next().size() * entryBytes;
        }
        return cacheBytes;
    }

    public void setMaxWordCount(final int maxWords) {
        indexCache.setMaxWordCount(maxWords);
    }

    public void cacheFlushControl(final IndexCache theCache) {
        // check for forced flush
        int cs = cacheSize();
        if (cs > 0) {
            // flush elements that are too big. This flushing depends on the fact that the flush rule
            // selects the biggest elements first for flushing. If it does not for any reason, the following
            // loop would not terminate.
            serverProfiling.update("wordcache", Long.valueOf(cs), true);
            // To ensure termination an additional counter is used
            int l = 0;
            while (theCache.size() > 0 && (l++ < 100) && (theCache.maxURLinCache() > wCacheMaxChunk)) {
                flushCacheOne(theCache);
            }
            // next flush more entries if the size exceeds the maximum size of the cache
            while (theCache.size() > 0 &&
                   ((theCache.size() > theCache.getMaxWordCount()) ||
                    (MemoryControl.available() < collections.minMem()))) {
                flushCacheOne(theCache);
            }
            if (cacheSize() != cs) serverProfiling.update("wordcache", Long.valueOf(cacheSize()), true);
        }
    }
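The flush policy above has two phases: a counter-bounded loop that evicts oversized entries (the counter guards against a flush rule that stops selecting the biggest element first), then a loop driven by the size bound and free memory. A self-contained sketch of that termination pattern, with a toy cache and constants standing in for IndexCache, wCacheMaxChunk and getMaxWordCount():

import java.util.ArrayDeque;
import java.util.Deque;

final class FlushControlSketch {
    private static final int MAX_CHUNK = 800;   // plays the role of wCacheMaxChunk
    private static final int MAX_WORDS = 10000; // plays the role of getMaxWordCount()

    // toy cache: each entry is represented only by its reference count
    private final Deque<Integer> cache = new ArrayDeque<Integer>();

    void flushControl() {
        // phase 1: evict oversized entries, bounded by a counter in case the
        // eviction rule ever stops returning the biggest entry first
        int l = 0;
        while (!cache.isEmpty() && l++ < 100 && biggest() > MAX_CHUNK) {
            flushOne();
        }
        // phase 2: keep evicting while the size bound or the memory bound is violated
        while (!cache.isEmpty()
                && (cache.size() > MAX_WORDS
                    || Runtime.getRuntime().freeMemory() < 1024L * 1024L)) {
            flushOne();
        }
    }

    private int biggest() {
        int max = 0;
        for (final int size : cache) max = Math.max(max, size);
        return max;
    }

    private void flushOne() {
        cache.poll(); // a real implementation would move the entry to the disk layer
    }
}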
    public static ReferenceContainer emptyContainer(final String wordHash, final int elementCount) {
        return new ReferenceContainer(wordHash, ReferenceRow.urlEntryRow, elementCount);
    }

    public void addEntry(final String wordHash, final ReferenceRow entry, final long updateTime) {
        // add the entry
        indexCache.addEntry(wordHash, entry, updateTime, true);
        cacheFlushControl(this.indexCache);
    }

    public void addReferences(final ReferenceContainer entries) {
        assert (entries.row().objectsize == ReferenceRow.urlEntryRow.objectsize);

        // add the entry
        indexCache.addReferences(entries);
        cacheFlushControl(this.indexCache);
    }

    public void flushCacheFor(int time) {
        flushCacheUntil(System.currentTimeMillis() + time);
    }

    private synchronized void flushCacheUntil(long timeout) {
        while (System.currentTimeMillis() < timeout && indexCache.size() > 0) {
            flushCacheOne(indexCache);
        }
    }

    private synchronized void flushCacheOne(final IndexCache ram) {
        if (ram.size() > 0) collections.addReferences(flushContainer(ram));
    }

    private ReferenceContainer flushContainer(final IndexCache ram) {
        String wordHash;
        ReferenceContainer c;
        wordHash = ram.maxScoreWordHash();
        c = ram.getReferences(wordHash, null);
        if ((c != null) && (c.size() > wCacheMaxChunk)) {
            return ram.deleteAllReferences(wordHash);
        } else {
            return ram.deleteAllReferences(ram.bestFlushWordHash());
        }
    }
    /**
     * this is called by the switchboard to put a new page into the index
     * use all the words in one condenser object to simultaneously create index entries
@@ -526,221 +383,20 @@ public final class plasmaWordIndex implements Index {
                        doctype,
                        outlinksSame, outlinksOther,
                        wprop.flags);
                addEntry(Word.word2hash(word), ientry, System.currentTimeMillis());
                this.index.addEntry(Word.word2hash(word), ientry, System.currentTimeMillis());
                wordCount++;
            }

        return wordCount;
    }
    public boolean hasReferences(final String wordHash) {
        if (indexCache.hasReferences(wordHash)) return true;
        if (collections.hasReferences(wordHash)) return true;
        return false;
    }

    public ReferenceContainer getReferences(final String wordHash, final Set<String> urlselection) {
        if ((wordHash == null) || (wordHash.length() != yacySeedDB.commonHashLength)) {
            // wrong input
            return null;
        }

        // get from cache
        ReferenceContainer container;
        container = indexCache.getReferences(wordHash, urlselection);

        // get from collection index
        if (container == null) {
            container = collections.getReferences(wordHash, urlselection);
        } else {
            container.addAllUnique(collections.getReferences(wordHash, urlselection));
        }

        if (container == null) return null;

        // check doubles
        final int beforeDouble = container.size();
        container.sort();
        final ArrayList<RowCollection> d = container.removeDoubles();
        RowCollection set;
        for (int i = 0; i < d.size(); i++) {
            // for each element in the double-set, take that one that is the most recent one
            set = d.get(i);
            ReferenceRow e, elm = null;
            long lm = 0;
            for (int j = 0; j < set.size(); j++) {
                e = new ReferenceRow(set.get(j, true));
                if ((elm == null) || (e.lastModified() > lm)) {
                    elm = e;
                    lm = e.lastModified();
                }
            }
            if (elm != null) {
                container.addUnique(elm.toKelondroEntry());
            }
        }
        if (container.size() < beforeDouble) System.out.println("*** DEBUG DOUBLECHECK - removed " + (beforeDouble - container.size()) + " index entries from word container " + container.getWordHash());

        return container;
    }
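Merging the RAM cache with the on-disk collection can yield two entries for the same URL, and the loop above resolves each duplicate group by keeping the most recently modified entry. The same resolution strategy in self-contained form (the Entry class here is a hypothetical stand-in, not YaCy's ReferenceRow):

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

final class MostRecentWins {
    static final class Entry {
        final String urlHash;
        final long lastModified;
        Entry(final String urlHash, final long lastModified) {
            this.urlHash = urlHash;
            this.lastModified = lastModified;
        }
    }

    // keep, per urlHash, only the entry with the newest lastModified stamp
    static List<Entry> dedupe(final List<Entry> merged) {
        final Map<String, Entry> newest = new HashMap<String, Entry>();
        for (final Entry e : merged) {
            final Entry seen = newest.get(e.urlHash);
            if (seen == null || e.lastModified > seen.lastModified) newest.put(e.urlHash, e);
        }
        return new ArrayList<Entry>(newest.values());
    }
}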
    /**
     * return map of wordhash:indexContainer
     *
     * @param wordHashes
     * @param urlselection
     * @param interruptIfEmpty
     * @return
     */
    public HashMap<String, ReferenceContainer> getContainers(final Set<String> wordHashes, final Set<String> urlselection, final boolean interruptIfEmpty) {
        // retrieve entities that belong to the hashes
        final HashMap<String, ReferenceContainer> containers = new HashMap<String, ReferenceContainer>(wordHashes.size());
        String singleHash;
        ReferenceContainer singleContainer;
        final Iterator<String> i = wordHashes.iterator();
        while (i.hasNext()) {

            // get next word hash:
            singleHash = i.next();

            // retrieve index
            singleContainer = getReferences(singleHash, urlselection);

            // check result
            if (((singleContainer == null) || (singleContainer.size() == 0)) && (interruptIfEmpty)) return new HashMap<String, ReferenceContainer>(0);

            containers.put(singleHash, singleContainer);
        }
        return containers;
    }
    @SuppressWarnings("unchecked")
    public HashMap<String, ReferenceContainer>[] localSearchContainers(
                            final TreeSet<String> queryHashes,
                            final TreeSet<String> excludeHashes,
                            final Set<String> urlselection) {
        // search for the set of hashes and return a map of wordhash:indexContainer containing the search result

        // retrieve entities that belong to the hashes
        HashMap<String, ReferenceContainer> inclusionContainers = (queryHashes.size() == 0) ? new HashMap<String, ReferenceContainer>(0) : getContainers(
                        queryHashes,
                        urlselection,
                        true);
        if ((inclusionContainers.size() != 0) && (inclusionContainers.size() < queryHashes.size())) inclusionContainers = new HashMap<String, ReferenceContainer>(0); // prevent that only a subset is returned
        final HashMap<String, ReferenceContainer> exclusionContainers = (inclusionContainers.size() == 0) ? new HashMap<String, ReferenceContainer>(0) : getContainers(
                        excludeHashes,
                        urlselection,
                        true);
        return new HashMap[]{inclusionContainers, exclusionContainers};
    }
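localSearchContainers applies an all-or-nothing rule: if any query hash resolves to nothing, the whole inclusion set collapses to empty, and exclusion containers are only fetched when inclusions exist. A compact model of that control flow, under the assumption that plain string sets may stand in for the container map:

import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;

final class InclusionExclusionSketch {
    private final Map<String, Set<String>> index = new HashMap<String, Set<String>>();

    // all-or-nothing retrieval: an empty result for any hash empties the whole map
    Map<String, Set<String>> containers(final Set<String> hashes) {
        final Map<String, Set<String>> out = new HashMap<String, Set<String>>(hashes.size());
        for (final String h : hashes) {
            final Set<String> hits = index.get(h);
            if (hits == null || hits.isEmpty()) return new HashMap<String, Set<String>>(0);
            out.put(h, hits);
        }
        return out;
    }

    Map<String, Set<String>>[] search(final TreeSet<String> query, final TreeSet<String> exclude) {
        Map<String, Set<String>> inclusions = query.isEmpty()
                ? new HashMap<String, Set<String>>(0) : containers(query);
        // prevent that only a subset is returned
        if (!inclusions.isEmpty() && inclusions.size() < query.size()) {
            inclusions = new HashMap<String, Set<String>>(0);
        }
        final Map<String, Set<String>> exclusions = inclusions.isEmpty()
                ? new HashMap<String, Set<String>>(0) : containers(exclude);
        @SuppressWarnings("unchecked")
        final Map<String, Set<String>>[] result = new Map[] { inclusions, exclusions };
        return result;
    }
}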
    public int size() {
        return java.lang.Math.max(collections.size(), indexCache.size());
    }

    public int collectionsSize() {
        return collections.size();
    }

    public int cacheSize() {
        return indexCache.size();
    }

    public void close() {
        indexCache.close();
        collections.close();
        index.close();
        metadata.close();
        peers.close();
        profilesActiveCrawls.close();
        queuePreStack.close();
    }
    public ReferenceContainer deleteAllReferences(final String wordHash) {
        final ReferenceContainer c = new ReferenceContainer(
                wordHash,
                ReferenceRow.urlEntryRow,
                indexCache.countReferences(wordHash));
        c.addAllUnique(indexCache.deleteAllReferences(wordHash));
        c.addAllUnique(collections.deleteAllReferences(wordHash));
        return c;
    }

    public boolean removeReference(final String wordHash, final String urlHash) {
        boolean removed = false;
        removed = removed | (indexCache.removeReference(wordHash, urlHash));
        removed = removed | (collections.removeReference(wordHash, urlHash));
        return removed;
    }

    public int removeEntryMultiple(final Set<String> wordHashes, final String urlHash) {
        // remove the same url hashes for multiple words
        // this is mainly used when correcting an index after a search
        final Iterator<String> i = wordHashes.iterator();
        int count = 0;
        while (i.hasNext()) {
            if (removeReference(i.next(), urlHash)) count++;
        }
        return count;
    }

    public int removeReferences(final String wordHash, final Set<String> urlHashes) {
        int removed = 0;
        removed += indexCache.removeReferences(wordHash, urlHashes);
        removed += collections.removeReferences(wordHash, urlHashes);
        return removed;
    }

    public String removeEntriesExpl(final String wordHash, final Set<String> urlHashes) {
        String removed = "";
        removed += indexCache.removeReferences(wordHash, urlHashes) + ", ";
        removed += collections.removeReferences(wordHash, urlHashes);
        return removed;
    }
    public void removeEntriesMultiple(final Set<String> wordHashes, final Set<String> urlHashes) {
        // remove the same url hashes for multiple words
        // this is mainly used when correcting an index after a search
        final Iterator<String> i = wordHashes.iterator();
        while (i.hasNext()) {
            removeReferences(i.next(), urlHashes);
        }
    }

    public int removeWordReferences(final Set<String> words, final String urlhash) {
        // sequentially delete all word references
        // returns number of deletions
        final Iterator<String> iter = words.iterator();
        int count = 0;
        while (iter.hasNext()) {
            // delete the URL reference in this word index
            if (removeReference(Word.word2hash(iter.next()), urlhash)) count++;
        }
        return count;
    }
    public synchronized TreeSet<ReferenceContainer> indexContainerSet(final String startHash, final boolean ram, final boolean rot, int count) {
        // creates a set of indexContainers
        // this does not use the cache
        final Order<ReferenceContainer> containerOrder = new ReferenceContainerOrder(indexOrder.clone());
        containerOrder.rotate(emptyContainer(startHash, 0));
        final TreeSet<ReferenceContainer> containers = new TreeSet<ReferenceContainer>(containerOrder);
        final Iterator<ReferenceContainer> i = referenceIterator(startHash, rot, ram);
        if (ram) count = Math.min(indexCache.size(), count);
        ReferenceContainer container;
        // this loop cannot rely on the i.hasNext() predicate alone when rot == true,
        // because then the underlying iterator is a rotating iterator without termination.
        // In that case termination must be ensured with a counter,
        // and the counter must be decreased in every loop iteration.
        while ((count > 0) && (i.hasNext())) {
            container = i.next();
            if ((container != null) && (container.size() > 0)) {
                containers.add(container);
            }
            count--; // decrease counter even if the container was null or empty to ensure termination
        }
        return containers; // this may return fewer containers than demanded
    }
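The counter-bounded loop is the standard guard for consuming a rotating (wrap-around) iterator whose hasNext() never goes false. A self-contained illustration of the pattern over a plain list (this toy iterator only mimics the behavior of YaCy's RotateIterator):

import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

final class RotatingIteratorSketch {
    // endless iterator that wraps around its backing list
    static <T> Iterator<T> rotating(final List<T> list) {
        return new Iterator<T>() {
            private int pos = 0;
            public boolean hasNext() { return !list.isEmpty(); } // never false for non-empty lists
            public T next() { return list.get(pos++ % list.size()); }
            public void remove() { throw new UnsupportedOperationException(); }
        };
    }

    public static void main(final String[] args) {
        final Iterator<String> it = rotating(Arrays.asList("a", "b", "c"));
        int count = 5; // the counter, not hasNext(), guarantees termination
        while (count > 0 && it.hasNext()) {
            System.out.println(it.next());
            count--; // decrease on every pass, even when an element is skipped
        }
    }
}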
    public MetadataRowContainer storeDocument(final IndexingStack.QueueEntry entry, final plasmaParserDocument document, final plasmaCondenser condenser) throws IOException {
        final long startTime = System.currentTimeMillis();

@@ -856,32 +512,6 @@ public final class plasmaWordIndex implements Index {
        return newEntry;
    }
    public synchronized CloneableIterator<ReferenceContainer> referenceIterator(final String startHash, final boolean rot, final boolean ram) {
        final CloneableIterator<ReferenceContainer> i = wordContainers(startHash, ram);
        if (rot) {
            return new RotateIterator<ReferenceContainer>(i, new String(Base64Order.zero(startHash.length())), indexCache.size() + ((ram) ? 0 : collections.size()));
        }
        return i;
    }

    private synchronized CloneableIterator<ReferenceContainer> wordContainers(final String startWordHash, final boolean ram) {
        final Order<ReferenceContainer> containerOrder = new ReferenceContainerOrder(indexOrder.clone());
        containerOrder.rotate(emptyContainer(startWordHash, 0));
        if (ram) {
            return indexCache.referenceIterator(startWordHash, false, true);
        }
        return collections.referenceIterator(startWordHash, false, false);
        /*
        return new MergeIterator<ReferenceContainer>(
                indexCache.referenceIterator(startWordHash, false, true),
                collections.referenceIterator(startWordHash, false, false),
                containerOrder,
                ReferenceContainer.containerMergeMethod,
                true);
        */
    }
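The commented-out branch would merge the cache iterator and the collections iterator into one ordered stream instead of picking a single source. The underlying technique is an ordered two-way merge of sorted iterators; here is a minimal generic sketch of that idea (not YaCy's MergeIterator, whose signature is only assumed from the commented call):

import java.util.Comparator;
import java.util.Iterator;
import java.util.NoSuchElementException;

// emits elements from two sorted iterators in sorted order; a full MergeIterator
// would additionally combine containers that share the same key
final class TwoWayMerge<T> implements Iterator<T> {
    private final Iterator<T> a, b;
    private final Comparator<T> order;
    private T nextA, nextB;

    TwoWayMerge(final Iterator<T> a, final Iterator<T> b, final Comparator<T> order) {
        this.a = a; this.b = b; this.order = order;
        nextA = a.hasNext() ? a.next() : null;
        nextB = b.hasNext() ? b.next() : null;
    }

    public boolean hasNext() { return nextA != null || nextB != null; }

    public T next() {
        if (!hasNext()) throw new NoSuchElementException();
        final T out;
        if (nextB == null || (nextA != null && order.compare(nextA, nextB) <= 0)) {
            out = nextA;
            nextA = a.hasNext() ? a.next() : null;
        } else {
            out = nextB;
            nextB = b.hasNext() ? b.next() : null;
        }
        return out;
    }

    public void remove() { throw new UnsupportedOperationException(); }
}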
    // The Cleaner class was provided as "UrldbCleaner" by Hydrox
    public synchronized ReferenceCleaner getReferenceCleaner(final String startHash) {
        return new ReferenceCleaner(startHash);

@@ -899,7 +529,7 @@ public final class plasmaWordIndex implements Index {

        public ReferenceCleaner(final String startHash) {
            this.startHash = startHash;
            this.rwiCountAtStart = size();
            this.rwiCountAtStart = index().size();
        }

        public void run() {

@@ -908,7 +538,7 @@ public final class plasmaWordIndex implements Index {
            ReferenceRow entry = null;
            yacyURL url = null;
            final HashSet<String> urlHashs = new HashSet<String>();
            Iterator<ReferenceContainer> indexContainerIterator = indexContainerSet(startHash, false, false, 100).iterator();
            Iterator<ReferenceContainer> indexContainerIterator = index.indexContainerSet(startHash, false, false, 100).iterator();
            while (indexContainerIterator.hasNext() && run) {
                waiter();
                container = indexContainerIterator.next();

@@ -930,7 +560,7 @@ public final class plasmaWordIndex implements Index {
                    }
                }
                if (urlHashs.size() > 0) {
                    final int removed = removeReferences(container.getWordHash(), urlHashs);
                    final int removed = index.removeReferences(container.getWordHash(), urlHashs);
                    Log.logFine("INDEXCLEANER", container.getWordHash() + ": " + removed + " of " + container.size() + " URL-entries deleted");
                    lastWordHash = container.getWordHash();
                    lastDeletionCounter = urlHashs.size();

@@ -938,7 +568,7 @@ public final class plasmaWordIndex implements Index {
                }
                if (!containerIterator.hasNext()) {
                    // We may not be finished yet, try to get the next chunk of wordHashes
                    final TreeSet<ReferenceContainer> containers = indexContainerSet(container.getWordHash(), false, false, 100);
                    final TreeSet<ReferenceContainer> containers = index.indexContainerSet(container.getWordHash(), false, false, 100);
                    indexContainerIterator = containers.iterator();
                    // Make sure we don't get the same wordhash twice, but don't skip a word
                    if ((indexContainerIterator.hasNext()) && (!container.getWordHash().equals(indexContainerIterator.next().getWordHash()))) {

@@ -988,9 +618,4 @@ public final class plasmaWordIndex implements Index {
            }
        }
    }

    public int countReferences(String key) {
        return indexCache.countReferences(key) + collections.countReferences(key);
    }

}

@@ -69,6 +69,7 @@ import de.anomic.http.httpRequestHeader;
import de.anomic.kelondro.order.Base64Order;
import de.anomic.kelondro.order.Bitfield;
import de.anomic.kelondro.order.Digest;
import de.anomic.kelondro.text.CachedIndexCollection;
import de.anomic.kelondro.text.MetadataRowContainer;
import de.anomic.kelondro.text.Reference;
import de.anomic.kelondro.text.ReferenceContainer;

@@ -529,7 +530,7 @@ public final class yacyClient {
        final int words = wordhashes.length() / yacySeedDB.commonHashLength;
        final ReferenceContainer[] container = new ReferenceContainer[words];
        for (int i = 0; i < words; i++) {
            container[i] = plasmaWordIndex.emptyContainer(wordhashes.substring(i * yacySeedDB.commonHashLength, (i + 1) * yacySeedDB.commonHashLength), count);
            container[i] = CachedIndexCollection.emptyContainer(wordhashes.substring(i * yacySeedDB.commonHashLength, (i + 1) * yacySeedDB.commonHashLength), count);
        }

        // insert results to containers
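The loop above slices one concatenated string of fixed-width word hashes into per-word chunks; with a hash length of 12 characters, a 36-character string yields three hashes. A standalone sketch of that slicing arithmetic (the constant is assumed to match yacySeedDB.commonHashLength):

final class HashChunking {
    private static final int HASH_LENGTH = 12; // assumed value of yacySeedDB.commonHashLength

    static String[] splitHashes(final String wordhashes) {
        final int words = wordhashes.length() / HASH_LENGTH;
        final String[] hashes = new String[words];
        for (int i = 0; i < words; i++) {
            hashes[i] = wordhashes.substring(i * HASH_LENGTH, (i + 1) * HASH_LENGTH);
        }
        return hashes;
    }

    public static void main(final String[] args) {
        // three 12-character hashes concatenated into one request string
        for (final String h : splitHashes("AAAAAAAAAAAABBBBBBBBBBBBCCCCCCCCCCCC")) {
            System.out.println(h);
        }
    }
}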
@@ -638,7 +639,7 @@ public final class yacyClient {

        // insert the containers to the index
        for (int m = 0; m < words; m++) {
            wordIndex.addReferences(container[m]);
            wordIndex.index().addReferences(container[m]);
        }

        // generate statistics

@@ -676,7 +676,7 @@ public final class yacy {
        if (cacheMem < 2048000) throw new OutOfMemoryError("Not enough memory available to start clean up.");

        final plasmaWordIndex wordIndex = new plasmaWordIndex(networkName, log, indexPrimaryRoot, indexSecondaryRoot, 10000, false, 1, 0);
        final Iterator<ReferenceContainer> indexContainerIterator = wordIndex.referenceIterator("AAAAAAAAAAAA", false, false);
        final Iterator<ReferenceContainer> indexContainerIterator = wordIndex.index().referenceIterator("AAAAAAAAAAAA", false, false);

        long urlCounter = 0, wordCounter = 0;
        long wordChunkStart = System.currentTimeMillis(), wordChunkEnd = 0;

@@ -867,7 +867,7 @@ public final class yacy {
            Iterator<ReferenceContainer> indexContainerIterator = null;
            if (resource.equals("all")) {
                WordIndex = new plasmaWordIndex("freeworld", log, indexPrimaryRoot, indexSecondaryRoot, 10000, false, 1, 0);
                indexContainerIterator = WordIndex.referenceIterator(wordChunkStartHash, false, false);
                indexContainerIterator = WordIndex.index().referenceIterator(wordChunkStartHash, false, false);
            }
            int counter = 0;
            ReferenceContainer container = null;