yacy_search_server/source/de/anomic/plasma/plasmaWordIndex.java
orbiter 30888e7a2f implementation of search constraints
Such constraints may formulate specific restrictions on web searches.
This is implemented by scraping constraint information from a web
page during parsing and storing flags for the pages within the web index.

In this first step, only information for index pages ("index of", directory listings)
is scraped and stored in flags.
- added new flag class kelondroBitfield
- added scraper method in condenser
- added bitfield structure for all scrape types (see also condenser)
- added bitfield structure for appearance locations (see RWIEntry)
- added handover protocol for remote search and index distribution
- extended kelondroColumn class to hold bitfield types
- added another search attribute on search page (index.html)
- extended search-filter to enable filtering of non-matching constraints
- set all new database types to be default
- refactoring: moved word hash generation to condenser class

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2999 6c8d7289-2bf4-0310-a012-ef5d649a1542
2006-11-23 02:16:30 +00:00


// plasmaWordIndex.java
// (C) 2005, 2006 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany
// first published 2005 on http://www.anomic.de
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.plasma;
import java.io.File;
import java.io.IOException;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.index.indexCollectionRI;
import de.anomic.index.indexContainer;
import de.anomic.index.indexContainerOrder;
import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexRAMRI;
import de.anomic.index.indexRI;
import de.anomic.index.indexRWIEntryNew;
import de.anomic.index.indexRWIEntryOld;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroMergeIterator;
import de.anomic.kelondro.kelondroOrder;
import de.anomic.kelondro.kelondroRow;
import de.anomic.net.URL;
import de.anomic.plasma.urlPattern.plasmaURLPattern;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacyDHTAction;
public final class plasmaWordIndex implements indexRI {
private static final String indexAssortmentClusterPath = "ACLUSTER";
private static final int assortmentCount = 64;
private static final kelondroRow payloadrowold = indexRWIEntryOld.urlEntryRow;
private static final kelondroRow payloadrownew = indexRWIEntryNew.urlEntryRow;
private final File oldDatabaseRoot;
private final kelondroOrder indexOrder = kelondroBase64Order.enhancedCoder;
private final indexRAMRI dhtOutCache, dhtInCache;
private final indexCollectionRI collections; // new database structure to replace AssortmentCluster and FileCluster
private int assortmentBufferSize; // kb
private final plasmaWordIndexAssortmentCluster assortmentCluster; // old database structure, to be replaced by CollectionRI
private final plasmaWordIndexFileCluster backend; // old database structure, to be replaced by CollectionRI
public boolean busyCacheFlush; // indicates whether a cache flush is currently being performed
public boolean useCollectionIndex; // flag for usage of new collectionIndex db
private int idleDivisor, busyDivisor;
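// constructor overview (descriptive note): depending on useCollectionIndex, either the new
// index layout (PUBLIC/TEXT/RICACHE and PUBLIC/TEXT/RICOLLECTION under newIndexRoot) or the
// legacy layout (dump files and the ACLUSTER assortment cluster under oldDatabaseRoot) is set up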
public plasmaWordIndex(File oldDatabaseRoot, File newIndexRoot, boolean dummy, int bufferkb, long preloadTime, serverLog log, boolean useCollectionIndex) throws IOException {
this.oldDatabaseRoot = oldDatabaseRoot;
this.backend = new plasmaWordIndexFileCluster(oldDatabaseRoot, payloadrowold, log);
File textindexcache = new File(newIndexRoot, "PUBLIC/TEXT/RICACHE");
if (!(textindexcache.exists())) textindexcache.mkdirs();
if (useCollectionIndex) {
this.dhtOutCache = new indexRAMRI(textindexcache, payloadrownew, 1024, "dump1.array", log, true);
this.dhtInCache = new indexRAMRI(textindexcache, payloadrownew, 1024, "dump2.array", log, true);
} else {
this.dhtOutCache = new indexRAMRI(oldDatabaseRoot, payloadrowold, 64, "indexDump1.array", log, false);
this.dhtInCache = new indexRAMRI(oldDatabaseRoot, payloadrowold, 64, "indexDump2.array", log, false);
}
// create assortment cluster path
File assortmentClusterPath = new File(oldDatabaseRoot, indexAssortmentClusterPath);
this.assortmentBufferSize = bufferkb;
// create collections storage path
File textindexcollections = new File(newIndexRoot, "PUBLIC/TEXT/RICOLLECTION");
if (!(textindexcollections.exists())) textindexcollections.mkdirs();
if (useCollectionIndex) {
this.collections = new indexCollectionRI(textindexcollections, "collection", bufferkb * 1024, preloadTime, payloadrownew);
this.assortmentCluster = null;
} else {
this.collections = null;
if (!(assortmentClusterPath.exists())) assortmentClusterPath.mkdirs();
this.assortmentCluster = new plasmaWordIndexAssortmentCluster(assortmentClusterPath, assortmentCount, payloadrowold, assortmentBufferSize, preloadTime, log);
}
busyCacheFlush = false;
this.useCollectionIndex = useCollectionIndex;
this.busyDivisor = 5000;
this.idleDivisor = 420;
}
public kelondroRow payloadrow() {
if (useCollectionIndex) return payloadrownew; else return payloadrowold;
}
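// factory for reverse-word-index entries (descriptive note): while the migration to the
// collection index is in progress both entry formats are supported; the flags bitfield
// (see the commit notes above) carries the scraped appearance/constraint information,
// e.g. the "index of"/directory-listing flag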
public indexRWIEntry newRWIEntry(
String urlHash,
int urlLength,
int urlComps,
int titleLength,
int hitcount,
int wordcount,
int phrasecount,
int posintext,
int posinphrase,
int posofphrase,
int worddistance,
int sizeOfPage,
long lastmodified,
long updatetime,
int quality,
String language,
char doctype,
int outlinksSame,
int outlinksOther,
kelondroBitfield flags ) {
if (useCollectionIndex)
return new indexRWIEntryNew(urlHash, urlLength, urlComps, titleLength, hitcount, wordcount, phrasecount,
posintext, posinphrase, posofphrase, worddistance, sizeOfPage, lastmodified, updatetime, quality, language, doctype,
outlinksSame, outlinksOther, flags);
else
return new indexRWIEntryOld(urlHash, urlLength, urlComps, titleLength, hitcount, wordcount, phrasecount,
posintext, posinphrase, posofphrase, worddistance, sizeOfPage, lastmodified, updatetime, quality, language, doctype,
outlinksSame, outlinksOther, false);
}
public File getRoot() {
return oldDatabaseRoot;
}
public int maxURLinDHTOutCache() {
return dhtOutCache.maxURLinCache();
}
public long minAgeOfDHTOutCache() {
return dhtOutCache.minAgeOfCache();
}
public long maxAgeOfDHTOutCache() {
return dhtOutCache.maxAgeOfCache();
}
public int maxURLinDHTInCache() {
return dhtInCache.maxURLinCache();
}
public long minAgeOfDHTInCache() {
return dhtInCache.minAgeOfCache();
}
public long maxAgeOfDHTInCache() {
return dhtInCache.maxAgeOfCache();
}
public int dhtOutCacheSize() {
return dhtOutCache.size();
}
public int dhtInCacheSize() {
return dhtInCache.size();
}
public int[] assortmentsSizes() {
return (assortmentCluster == null) ? null : assortmentCluster.sizes();
}
public int assortmentsCacheChunkSizeAvg() {
return (assortmentCluster == null) ? 0 : assortmentCluster.cacheChunkSizeAvg();
}
public int assortmentsCacheObjectSizeAvg() {
return (assortmentCluster == null) ? 0 : assortmentCluster.cacheObjectSizeAvg();
}
public int[] assortmentsCacheNodeStatus() {
if (assortmentCluster != null) return assortmentCluster.cacheNodeStatus();
return new int[]{0,0,0,0,0,0,0,0,0,0};
}
public long[] assortmentsCacheObjectStatus() {
if (assortmentCluster != null) return assortmentCluster.cacheObjectStatus();
return new long[]{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
}
public void setMaxWordCount(int maxWords) {
dhtOutCache.setMaxWordCount(maxWords);
}
public void setInMaxWordCount(int maxWords) {
dhtInCache.setMaxWordCount(maxWords);
}
public void setWordFlushDivisor(int idleDivisor, int busyDivisor) {
this.idleDivisor = idleDivisor;
this.busyDivisor = busyDivisor;
}
public void flushControl() {
// check for forced flush
synchronized (this) {
if (dhtOutCache.size() > dhtOutCache.getMaxWordCount()) {
flushCache(dhtOutCache, dhtOutCache.size() + 500 - dhtOutCache.getMaxWordCount());
}
if (dhtInCache.size() > dhtInCache.getMaxWordCount()) {
flushCache(dhtInCache, dhtInCache.size() + 500 - dhtInCache.getMaxWordCount());
}
}
}
public long getUpdateTime(String wordHash) {
indexContainer entries = getContainer(wordHash, null, false, -1);
if (entries == null) return 0;
return entries.updated();
}
public indexContainer emptyContainer(String wordHash) {
return new indexContainer(wordHash, payloadrow(), useCollectionIndex);
}
public indexContainer addEntry(String wordHash, indexRWIEntry entry, long updateTime, boolean dhtInCase) {
if ((useCollectionIndex) && (entry instanceof indexRWIEntryOld)) entry = new indexRWIEntryNew((indexRWIEntryOld) entry);
// set dhtInCase depending on wordHash
if ((!dhtInCase) && (yacyDHTAction.shallBeOwnWord(wordHash))) dhtInCase = true;
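// presumably: words for which this peer is the DHT owner are treated like incoming DHT
// entries and kept in dhtInCache; all other words go to dhtOutCache, which is subject to
// flush control (and, judging by its name, to later DHT distribution)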
// add the entry
if (dhtInCase) {
dhtInCache.addEntry(wordHash, entry, updateTime, true);
} else {
dhtOutCache.addEntry(wordHash, entry, updateTime, false);
flushControl();
}
return null;
}
private indexContainer convertOld2New(indexContainer entries) {
// convert old entries to new entries
indexContainer newentries = new indexContainer(entries.getWordHash(), payloadrownew, useCollectionIndex);
Iterator i = entries.entries();
indexRWIEntryOld old;
while (i.hasNext()) {
old = (indexRWIEntryOld) i.next();
newentries.add(new indexRWIEntryNew(old));
}
return newentries;
}
public indexContainer addEntries(indexContainer entries, long updateTime, boolean dhtInCase) {
if ((useCollectionIndex) && (entries.row().objectsize() == payloadrowold.objectsize())) entries = convertOld2New(entries);
// set dhtInCase depending on wordHash
if ((!dhtInCase) && (yacyDHTAction.shallBeOwnWord(entries.getWordHash()))) dhtInCase = true;
// add the entry
if (dhtInCase) {
dhtInCache.addEntries(entries, updateTime, true);
} else {
dhtOutCache.addEntries(entries, updateTime, false);
flushControl();
}
return null;
}
public void flushCacheSome(boolean busy) {
flushCacheSome(dhtOutCache, busy);
flushCacheSome(dhtInCache, busy);
}
private void flushCacheSome(indexRAMRI ram, boolean busy) {
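// flush ram.size()/busyDivisor containers when busy, ram.size()/idleDivisor when idle,
// clamped to at most 100 per call; example (illustrative): with 20000 cached words and
// the default busyDivisor of 5000, 4 containers are flushed per busy cycle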
int flushCount = (busy) ? ram.size() / busyDivisor : ram.size() / idleDivisor;
if (flushCount > 100) flushCount = 100;
if (flushCount < 1) flushCount = Math.min(1, ram.size());
flushCache(ram, flushCount);
while (ram.maxURLinCache() > ((useCollectionIndex) ? 1024 : 64)) flushCache(ram, 1);
}
private void flushCache(indexRAMRI ram, int count) {
if (count <= 0) return;
if (count > 1000) count = 1000;
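// move up to 'count' word containers from the RAM cache to the on-disk index: the
// collection index must accept everything (feedback is treated as an error), whereas the
// assortment cluster may hand back containers that do not fit, which are then appended to
// the file-based backend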
busyCacheFlush = true;
String wordHash;
//System.out.println("DEBUG-Started flush of " + count + " entries from RAM to DB");
//long start = System.currentTimeMillis();
for (int i = 0; i < count; i++) { // possible position of outOfMemoryError ?
if (ram.size() == 0) break;
synchronized (this) {
wordHash = ram.bestFlushWordHash();
// flush the wordHash
indexContainer c = ram.deleteContainer(wordHash);
if (c != null) {
if (useCollectionIndex) {
indexContainer feedback = collections.addEntries(c, c.updated(), false);
if (feedback != null) {
throw new RuntimeException("indexCollectionRI shall not return feedback entries; feedback = " + feedback.toString());
}
} else {
indexContainer feedback = assortmentCluster.addEntries(c, c.updated(), false);
if (feedback != null) {
backend.addEntries(feedback, System.currentTimeMillis(), true);
}
}
}
// pause to next loop to give other processes a chance to use IO
//try {this.wait(8);} catch (InterruptedException e) {}
}
}
//System.out.println("DEBUG-Finished flush of " + count + " entries from RAM to DB in " + (System.currentTimeMillis() - start) + " milliseconds");
busyCacheFlush = false;
}
private static final int hour = 3600000;
private static final int day = 86400000;
public static int microDateDays(Date modified) {
return microDateDays(modified.getTime());
}
public static int microDateDays(long modified) {
// this calculates a virtual age from a given date
// the purpose is to have an age in days of a given modified date
// from a fixed standpoint in the past
// one day has 60*60*24 seconds = 86400 seconds
// we take mod 64**3 = 262144, this is the mask of the storage
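// example (illustrative): a date in late November 2006 is roughly day 13475 since the
// epoch; 13475 < 262144, so the stored value equals the day count and fits into three
// base64 characters (64^3 = 262144)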
return (int) ((modified / day) % 262144);
}
public static String microDateHoursStr(long time) {
return kelondroBase64Order.enhancedCoder.encodeLong(microDateHoursInt(time), 3);
}
public static int microDateHoursInt(long time) {
return (int) ((time / hour) % 262144);
}
public static int microDateHoursAge(String mdhs) {
return microDateHoursInt(System.currentTimeMillis()) - (int) kelondroBase64Order.enhancedCoder.decodeLong(mdhs);
}
public static long reverseMicroDateDays(int microDateDays) {
return ((long) microDateDays) * ((long) day);
}
public int addPageIndex(URL url, String urlHash, Date urlModified, int size, plasmaParserDocument document, plasmaCondenser condenser, String language, char doctype, int outlinksSame, int outlinksOther) {
// this is called by the switchboard to put a new page into the index
// use all the words in one condenser object to create index entries simultaneously
// iterate over all words
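// each word becomes one indexRWIEntry recording counts, positions, document metadata and
// the condenser's RESULT_FLAGS bitfield, i.e. the appearance/constraint flags scraped
// during parsing (see the commit notes above)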
Iterator i = condenser.words();
Map.Entry wentry;
String word;
indexRWIEntry ientry;
plasmaCondenser.wordStatProp wprop;
String wordHash;
int urlLength = url.toString().length();
int urlComps = htmlFilterContentScraper.urlComps(url.toString()).length;
while (i.hasNext()) {
wentry = (Map.Entry) i.next();
word = (String) wentry.getKey();
wprop = (plasmaCondenser.wordStatProp) wentry.getValue();
// if ((s.length() > 4) && (c > 1)) System.out.println("# " + s + ":" + c);
wordHash = plasmaCondenser.word2hash(word);
ientry = newRWIEntry(urlHash,
urlLength, urlComps, (document == null) ? urlLength : document.getMainLongTitle().length(),
wprop.count,
condenser.RESULT_SIMI_WORDS,
condenser.RESULT_SIMI_SENTENCES,
wprop.posInText,
wprop.posInPhrase,
wprop.numOfPhrase,
0,
size,
urlModified.getTime(),
System.currentTimeMillis(),
condenser.RESULT_WORD_ENTROPHY,
language,
doctype,
outlinksSame, outlinksOther,
condenser.RESULT_FLAGS);
addEntry(wordHash, ientry, System.currentTimeMillis(), false);
}
// System.out.println("DEBUG: plasmaSearch.addPageIndex: added " +
// condenser.getWords().size() + " words, flushed " + c + " entries");
return condenser.RESULT_SIMI_WORDS;
}
public indexContainer getContainer(String wordHash, Set urlselection, boolean deleteIfEmpty, long maxTime) {
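// lookup cascade (descriptive note): the two RAM caches are merged first, then either the
// collection index (new layout) or the assortment cluster plus the file-based backend
// (old layout), with maxTime serving as an overall time budget for the on-disk lookups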
long start = System.currentTimeMillis();
// get from cache
indexContainer container = dhtOutCache.getContainer(wordHash, urlselection, true, -1);
if (container == null) {
container = dhtInCache.getContainer(wordHash, urlselection, true, -1);
} else {
container.add(dhtInCache.getContainer(wordHash, urlselection, true, -1), -1);
}
// get from collection index
if (useCollectionIndex) {
if (container == null) {
container = collections.getContainer(wordHash, urlselection, true, (maxTime < 0) ? -1 : maxTime);
} else {
container.add(collections.getContainer(wordHash, urlselection, true, (maxTime < 0) ? -1 : maxTime), -1);
}
} else {
// get from assortments
if (assortmentCluster != null) {
if (container == null) {
container = assortmentCluster.getContainer(wordHash, urlselection, true, (maxTime < 0) ? -1 : maxTime);
} else {
// add containers from assortment cluster
container.add(assortmentCluster.getContainer(wordHash, urlselection, true, (maxTime < 0) ? -1 : maxTime), -1);
}
}
// get from backend
if (maxTime > 0) {
maxTime = maxTime - (System.currentTimeMillis() - start);
if (maxTime < 0) maxTime = 100;
}
if (container == null) {
container = backend.getContainer(wordHash, urlselection, deleteIfEmpty, (maxTime < 0) ? -1 : maxTime);
} else {
container.add(backend.getContainer(wordHash, urlselection, deleteIfEmpty, (maxTime < 0) ? -1 : maxTime), -1);
}
}
return container;
}
public Map getContainers(Set wordHashes, Set urlselection, boolean deleteIfEmpty, boolean interruptIfEmpty, long maxTime) {
// return map of wordhash:indexContainer
// retrieve entities that belong to the hashes
HashMap containers = new HashMap();
String singleHash;
indexContainer singleContainer;
Iterator i = wordHashes.iterator();
long start = System.currentTimeMillis();
long remaining;
while (i.hasNext()) {
// check time
remaining = maxTime - (System.currentTimeMillis() - start);
//if ((maxTime > 0) && (remaining <= 0)) break;
if ((maxTime >= 0) && (remaining <= 0)) remaining = 100;
// get next word hash:
singleHash = (String) i.next();
// retrieve index
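// the remaining time budget is divided evenly among the word hashes still to be processed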
singleContainer = getContainer(singleHash, urlselection, deleteIfEmpty, (maxTime < 0) ? -1 : remaining / (wordHashes.size() - containers.size()));
// check result
if (((singleContainer == null) || (singleContainer.size() == 0)) && (interruptIfEmpty)) return new HashMap();
containers.put(singleHash, singleContainer);
}
return containers;
}
public int size() {
if (useCollectionIndex)
return java.lang.Math.max(collections.size(), java.lang.Math.max(dhtInCache.size(), dhtOutCache.size()));
else
return java.lang.Math.max((assortmentCluster == null) ? 0 : assortmentCluster.size(),
java.lang.Math.max(backend.size(),
java.lang.Math.max(dhtInCache.size(), dhtOutCache.size())));
}
public int indexSize(String wordHash) {
int size = 0;
size += dhtInCache.indexSize(wordHash);
size += dhtOutCache.indexSize(wordHash);
if (useCollectionIndex) {
size += collections.indexSize(wordHash);
} else try {
size += (assortmentCluster == null) ? 0 : assortmentCluster.indexSize(wordHash);
plasmaWordIndexFile entity = backend.getEntity(wordHash, true, -1);
if (entity != null) {
size += entity.size();
entity.close();
}
} catch (IOException e) {}
return size;
}
public void close(int waitingBoundSeconds) {
synchronized (this) {
dhtInCache.close(waitingBoundSeconds);
dhtOutCache.close(waitingBoundSeconds);
if (useCollectionIndex) {
collections.close(-1);
} else {
if (assortmentCluster != null) assortmentCluster.close(-1);
backend.close(10);
}
}
}
public indexContainer deleteContainer(String wordHash) {
indexContainer c = new indexContainer(wordHash, payloadrow(), useCollectionIndex);
c.add(dhtInCache.deleteContainer(wordHash), -1);
c.add(dhtOutCache.deleteContainer(wordHash), -1);
if (useCollectionIndex) {
c.add(collections.deleteContainer(wordHash), -1);
} else {
if (assortmentCluster != null) c.add(assortmentCluster.deleteContainer(wordHash), -1);
c.add(backend.deleteContainer(wordHash), -1);
}
return c;
}
public boolean removeEntry(String wordHash, String urlHash, boolean deleteComplete) {
boolean removed = false;
removed = removed | (dhtInCache.removeEntry(wordHash, urlHash, deleteComplete));
removed = removed | (dhtOutCache.removeEntry(wordHash, urlHash, deleteComplete));
if (useCollectionIndex) {
removed = removed | (collections.removeEntry(wordHash, urlHash, deleteComplete));
} else {
if (assortmentCluster != null) removed = removed | (assortmentCluster.removeEntry(wordHash, urlHash, deleteComplete));
removed = removed | backend.removeEntry(wordHash, urlHash, deleteComplete);
}
return removed;
}
public int removeEntries(String wordHash, Set urlHashes, boolean deleteComplete) {
int removed = 0;
removed += dhtInCache.removeEntries(wordHash, urlHashes, deleteComplete);
removed += dhtOutCache.removeEntries(wordHash, urlHashes, deleteComplete);
if (useCollectionIndex) {
removed += collections.removeEntries(wordHash, urlHashes, deleteComplete);
} else if (assortmentCluster != null) {
removed += assortmentCluster.removeEntries(wordHash, urlHashes, deleteComplete);
removed += backend.removeEntries(wordHash, urlHashes, deleteComplete);
}
return removed;
}
public String removeEntriesExpl(String wordHash, Set urlHashes, boolean deleteComplete) {
String removed = "";
removed += dhtInCache.removeEntries(wordHash, urlHashes, deleteComplete) + ", ";
removed += dhtOutCache.removeEntries(wordHash, urlHashes, deleteComplete) + ", ";
if (useCollectionIndex) {
removed += collections.removeEntries(wordHash, urlHashes, deleteComplete);
} else {
if (assortmentCluster != null) removed += assortmentCluster.removeEntries(wordHash, urlHashes, deleteComplete) + ", ";
removed += backend.removeEntries(wordHash, urlHashes, deleteComplete);
}
return removed;
}
public static final int RL_RAMCACHE = 0;
public static final int RL_COLLECTIONS = 1; // the new index structure
public static final int RL_ASSORTMENTS = 2; // (to be) outdated structure
public static final int RL_WORDFILES = 3; // (to be) outdated structure
public int tryRemoveURLs(String urlHash) {
// this tries to delete an index from the cache that has this
// urlHash assigned. This can only work if the entry is really fresh
// and can be found in the RAM cache
// this returns the number of deletions that were possible
return dhtInCache.tryRemoveURLs(urlHash) | dhtOutCache.tryRemoveURLs(urlHash);
}
public TreeSet indexContainerSet(String startHash, int resourceLevel, boolean rot, int count) throws IOException {
// creates a set of indexContainers
// this does not use the dhtInCache
kelondroOrder containerOrder = new indexContainerOrder((kelondroOrder) indexOrder.clone());
containerOrder.rotate(startHash.getBytes());
TreeSet containers = new TreeSet(containerOrder);
Iterator i = wordContainers(startHash, resourceLevel, rot);
if (resourceLevel == plasmaWordIndex.RL_RAMCACHE) count = Math.min(dhtOutCache.size(), count);
indexContainer container;
while ((count > 0) && (i.hasNext())) {
container = (indexContainer) i.next();
if ((container != null) && (container.size() > 0)) {
containers.add(container);
count--;
}
}
return containers;
}
public Iterator wordContainers(String startHash, boolean rot) {
// returns an iteration of indexContainers
try {
return wordContainers(startHash, RL_WORDFILES, rot);
} catch (IOException e) {
return new HashSet().iterator();
}
}
public Iterator wordContainers(String startHash, int resourceLevel, boolean rot) throws IOException {
if (rot) return new rotatingContainerIterator(startHash, resourceLevel);
else return wordContainers(startHash, resourceLevel);
}
private Iterator wordContainers(String startWordHash, int resourceLevel) throws IOException {
kelondroOrder containerOrder = new indexContainerOrder((kelondroOrder) indexOrder.clone());
containerOrder.rotate(startWordHash.getBytes());
if (resourceLevel == plasmaWordIndex.RL_RAMCACHE) {
return dhtOutCache.wordContainers(startWordHash, false);
}
if (useCollectionIndex) {
return new kelondroMergeIterator(
dhtOutCache.wordContainers(startWordHash, false),
collections.wordContainers(startWordHash, false),
containerOrder,
indexContainer.containerMergeMethod,
true);
} else {
if (resourceLevel == plasmaWordIndex.RL_ASSORTMENTS) {
return new kelondroMergeIterator(
dhtOutCache.wordContainers(startWordHash, false),
(assortmentCluster == null) ? null : assortmentCluster.wordContainers(startWordHash, true, false),
containerOrder,
indexContainer.containerMergeMethod,
true);
}
if (resourceLevel == plasmaWordIndex.RL_WORDFILES) {
return new kelondroMergeIterator(
new kelondroMergeIterator(
dhtOutCache.wordContainers(startWordHash, false),
(assortmentCluster == null) ? null : assortmentCluster.wordContainers(startWordHash, true, false),
containerOrder,
indexContainer.containerMergeMethod,
true),
backend.wordContainers(startWordHash, false),
containerOrder,
indexContainer.containerMergeMethod,
true);
}
}
return null;
}
public class rotatingContainerIterator implements Iterator {
Iterator i;
int resourceLevel;
public rotatingContainerIterator(String startWordHash, int resourceLevel) throws IOException {
this.resourceLevel = resourceLevel;
i = wordContainers(startWordHash, resourceLevel);
}
public void finalize() {
i = null;
}
public boolean hasNext() {
if (i.hasNext()) return true;
else try {
i = wordContainers("------------", resourceLevel);
return i.hasNext();
} catch (IOException e) {
return false;
}
}
public Object next() {
return i.next();
}
public void remove() {
throw new java.lang.UnsupportedOperationException("rotatingContainerIterator does not support remove");
}
} // class rotatingContainerIterator
public Object migrateWords2Assortment(String wordhash) throws IOException {
// returns the number of entries that had been added to the assortments
// can be negative if some assortments have been moved to the backend
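// procedure (descriptive note): the word file ("entity") is only migrated if its entries
// plus the entries already held in the assortment cluster fit into the cluster capacity;
// otherwise the extracted assortment container is written back into the word file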
File db = plasmaWordIndexFile.wordHash2path(oldDatabaseRoot, wordhash);
if (!(db.exists())) return "not available";
plasmaWordIndexFile entity = null;
try {
entity = new plasmaWordIndexFile(oldDatabaseRoot, wordhash, true);
int size = entity.size();
if (size > assortmentCluster.clusterCapacity) {
// this will be too big to integrate
entity.close(); entity = null;
return "too big";
} else {
// take out all words from the assortment to see if it fits
// together with the extracted assortment
indexContainer container = assortmentCluster.deleteContainer(wordhash, -1);
if (size + container.size() > assortmentCluster.clusterCapacity) {
// this will also be too big to integrate, add to entity
entity.addEntries(container);
entity.close(); entity = null;
return new Integer(-container.size());
} else {
// the combined container will fit, read the container
try {
Iterator entries = entity.elements(true);
indexRWIEntry entry;
while (entries.hasNext()) {
entry = (indexRWIEntry) entries.next();
// System.out.println("ENTRY = " + entry.getUrlHash());
container.add(new indexRWIEntry[]{entry}, System.currentTimeMillis());
}
// we have read all elements, now delete the entity
entity.deleteComplete();
entity.close(); entity = null;
// integrate the container into the assortments; this will work
assortmentCluster.addEntries(container, container.updated(), false);
return new Integer(size);
} catch (kelondroException e) {
// database corrupted, we simply give up the database and delete it
try {entity.close();} catch (Exception ee) {} entity = null;
try {db.delete();} catch (Exception ee) {}
return "database corrupted; deleted";
}
}
}
} finally {
if (entity != null) try {entity.close();}catch(Exception e){}
}
}
public Object migrateWords2index(String wordhash) throws IOException {
// returns the number of entries that have been added to the collection index
File db = plasmaWordIndexFile.wordHash2path(oldDatabaseRoot, wordhash);
if (!(db.exists())) return "not available";
plasmaWordIndexFile entity = null;
try {
entity = new plasmaWordIndexFile(oldDatabaseRoot, wordhash, true);
int size = entity.size();
indexContainer container = new indexContainer(wordhash, payloadrow(), useCollectionIndex);
try {
Iterator entries = entity.elements(true);
indexRWIEntry entry;
while (entries.hasNext()) {
entry = (indexRWIEntry) entries.next();
// System.out.println("ENTRY = " + entry.getUrlHash());
container.add(new indexRWIEntry[] { entry }, System.currentTimeMillis());
}
// we have read all elements, now delete the entity
entity.deleteComplete();
entity.close();
entity = null;
indexContainer feedback = collections.addEntries(container, container.updated(), false);
if (feedback != null) return feedback;
return new Integer(size);
} catch (kelondroException e) {
// database corrupted, we simply give up the database and delete it
try { entity.close(); } catch (Exception ee) { }
entity = null;
try { db.delete(); } catch (Exception ee) { }
return "database corrupted; deleted";
}
} finally {
if (entity != null) try {entity.close();}catch(Exception e){}
}
}
// The Cleaner class was provided as "UrldbCleaner" by Hydrox
// see http://www.yacy-forum.de/viewtopic.php?p=18093#18093
public Cleaner makeCleaner(plasmaCrawlLURL lurl, String startHash) {
return new Cleaner(lurl, startHash);
}
public class Cleaner extends Thread {
private String startHash;
private boolean run = true;
private boolean pause = false;
public int rwiCountAtStart = 0;
public String wordHashNow = "";
public String lastWordHash = "";
public int lastDeletionCounter = 0;
private plasmaCrawlLURL lurl;
public Cleaner(plasmaCrawlLURL lurl, String startHash) {
this.lurl = lurl;
this.startHash = startHash;
this.rwiCountAtStart = size();
}
public void run() {
serverLog.logInfo("INDEXCLEANER", "IndexCleaner-Thread started");
indexContainer container = null;
indexRWIEntry entry = null;
URL url = null;
HashSet urlHashs = new HashSet();
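// walk the index word by word; every RWI entry whose URL cannot be loaded from the URL
// database or whose URL is blacklisted for the crawler is collected in urlHashs and then
// removed from the word's container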
try {
Iterator indexContainerIterator = indexContainerSet(startHash, plasmaWordIndex.RL_WORDFILES, false, 100).iterator();
while (indexContainerIterator.hasNext() && run) {
waiter();
container = (indexContainer) indexContainerIterator.next();
Iterator containerIterator = container.entries();
wordHashNow = container.getWordHash();
while (containerIterator.hasNext() && run) {
waiter();
entry = (indexRWIEntry) containerIterator.next();
// System.out.println("Wordhash: "+wordHash+" UrlHash: "+entry.getUrlHash());
indexURLEntry ue = lurl.load(entry.urlHash(), null);
if (ue == null) {
urlHashs.add(entry.urlHash());
} else {
url = ue.comp().url();
if ((url == null) || (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_CRAWLER, url) == true)) {
urlHashs.add(entry.urlHash());
}
}
}
if (urlHashs.size() > 0) {
int removed = removeEntries(container.getWordHash(), urlHashs, true);
serverLog.logFine("INDEXCLEANER", container.getWordHash() + ": " + removed + " of " + container.size() + " URL-entries deleted");
lastWordHash = container.getWordHash();
lastDeletionCounter = urlHashs.size();
urlHashs.clear();
}
if (!containerIterator.hasNext()) {
// We may not be finished yet, try to get the next chunk of wordHashes
TreeSet containers = indexContainerSet(container.getWordHash(), plasmaWordIndex.RL_WORDFILES, false, 100);
indexContainerIterator = containers.iterator();
// Make sure we don't get the same wordhash twice, but don't skip a word
if ((indexContainerIterator.hasNext())&&(!container.getWordHash().equals(((indexContainer) indexContainerIterator.next()).getWordHash()))) {
indexContainerIterator = containers.iterator();
}
}
}
} catch (IOException e) {
serverLog.logSevere("INDEXCLEANER",
"IndexCleaner-Thread: unable to start: "
+ e.getMessage());
}
serverLog.logInfo("INDEXCLEANER", "IndexCleaner-Thread stopped");
}
public void abort() {
synchronized(this) {
run = false;
this.notifyAll();
}
}
public void pause() {
synchronized(this) {
if(pause == false) {
pause = true;
serverLog.logInfo("INDEXCLEANER", "IndexCleaner-Thread paused");
}
}
}
public void endPause() {
synchronized(this) {
if (pause == true) {
pause = false;
this.notifyAll();
serverLog.logInfo("INDEXCLEANER", "IndexCleaner-Thread resumed");
}
}
}
public void waiter() {
synchronized(this) {
if (this.pause) {
try {
this.wait();
} catch (InterruptedException e) {
this.run = false;
return;
}
}
}
}
}
public static void main(String[] args) {
// System.out.println(kelondroMSetTools.fastStringComparator(true).compare("RwGeoUdyDQ0Y", "rwGeoUdyDQ0Y"));
// System.out.println(new Date(reverseMicroDateDays(microDateDays(System.currentTimeMillis()))));
File plasmadb = new File("D:\\dev\\proxy\\DATA\\PLASMADB");
File indexdb = new File("D:\\dev\\proxy\\DATA\\INDEX");
try {
plasmaWordIndex index = new plasmaWordIndex(plasmadb, indexdb, true, 555, 1000, new serverLog("TESTAPP"), false);
Iterator containerIter = index.wordContainers("5A8yhZMh_Kmv", plasmaWordIndex.RL_WORDFILES, true);
while (containerIter.hasNext()) {
System.out.println("File: " + (indexContainer) containerIter.next());
}
} catch (IOException e) {
e.printStackTrace();
}
}
}