yacy_search_server/source/de/anomic/kelondro/text/BufferedIndexCollection.java
orbiter a9cea419ef Integration of the new index data structure IndexCell
This is the start of a testing phase for the IndexCell data structure, which will replace
the collections and caching strategy. IndexCell creation and maintenance is fast, has
no caching overhead and very low IO load, and is the basis for the next data structure,
index segments.

IndexCell files are stored at DATA/<network>/TEXT/RICELL
With this commit the old data structures are still used, until a flag in yacy.conf is set.
To switch to the new data structure, set
useCell = true
in yacy.conf. You will then no longer have access to TEXT/RICACHE and TEXT/RICOLLECTION.

This code is still bleeding-edge development; please do not use the new data structure in
production yet. Future versions may change the data types or storage locations.
The next main release will have a migration feature for old data structures.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5724 6c8d7289-2bf4-0310-a012-ef5d649a1542
2009-03-17 13:03:27 +00:00

344 lines
13 KiB
Java

// BufferedIndexCollection.java
// (C) 2005, 2006 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 2005 on http://www.anomic.de
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2009-03-13 11:34:51 +0100 (Fr, 13 Mrz 2009) $
// $LastChangedRevision: 5709 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.kelondro.text;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Set;

import de.anomic.kelondro.index.Row;
import de.anomic.kelondro.index.RowCollection;
import de.anomic.kelondro.order.Base64Order;
import de.anomic.kelondro.order.ByteOrder;
import de.anomic.kelondro.order.CloneableIterator;
import de.anomic.kelondro.order.MergeIterator;
import de.anomic.kelondro.order.Order;
import de.anomic.kelondro.order.RotateIterator;
import de.anomic.kelondro.text.Index;
import de.anomic.kelondro.text.IndexBuffer;
import de.anomic.kelondro.text.IndexCollection;
import de.anomic.kelondro.text.ReferenceContainer;
import de.anomic.kelondro.text.ReferenceContainerOrder;
import de.anomic.kelondro.text.ReferenceRow;
import de.anomic.kelondro.util.Log;
import de.anomic.kelondro.util.MemoryControl;
import de.anomic.server.serverProfiling;
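
/*
 * The class below keeps a RAM write buffer (IndexBuffer) in front of the
 * on-disk IndexCollection: writes go to the buffer first and are moved into
 * the collections by the cache-flush heuristics further down.
 *
 * A minimal usage sketch; the path, cache size and log name are illustrative
 * values, not taken from this commit:
 *
 *   BufferedIndexCollection index = new BufferedIndexCollection(
 *           new File("DATA/INDEX/freeworld/TEXT"), // example location
 *           Base64Order.enhancedCoder,
 *           ReferenceRow.urlEntryRow,
 *           10000,                                 // entityCacheMaxSize (example)
 *           false,                                 // useCommons
 *           0,                                     // redundancy (not used in this constructor)
 *           new Log("TEXT-INDEX"));                // example log name
 *   index.add(wordHash, entry);                    // buffered write
 *   ReferenceContainer c = index.get(wordHash, null);
 *   index.close();
 */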
public final class BufferedIndexCollection extends AbstractBufferedIndex implements Index, BufferedIndex {

    // environment constants
    public static final long wCacheMaxAge = 1000 * 60 * 30; // milliseconds; 30 minutes
    public static final int  wCacheMaxChunk = 800;          // maximum number of references for each urlhash
    public static final int  lowcachedivisor = 900;
    public static final int  maxCollectionPartition = 7;    // should be 7

    private final IndexBuffer     buffer;
    private final IndexCollection collections;
    public BufferedIndexCollection(
            File indexPrimaryTextLocation,
            final ByteOrder wordOrdering,
            final Row payloadrow,
            final int entityCacheMaxSize,
            final boolean useCommons,
            final int redundancy,
            Log log) throws IOException {

        final File textindexcache = new File(indexPrimaryTextLocation, "RICACHE");
        if (!(textindexcache.exists())) textindexcache.mkdirs();
        if (new File(textindexcache, "index.dhtin.blob").exists()) {
            // migrate the two caches (dhtin/dhtout) into one buffer
            this.buffer = new IndexBuffer(textindexcache, wordOrdering, payloadrow, entityCacheMaxSize, wCacheMaxChunk, wCacheMaxAge, "index.dhtout.blob", log);
            IndexBuffer dhtInCache = new IndexBuffer(textindexcache, wordOrdering, payloadrow, entityCacheMaxSize, wCacheMaxChunk, wCacheMaxAge, "index.dhtin.blob", log);
            for (ReferenceContainer c: dhtInCache) {
                this.buffer.add(c);
            }
            new File(textindexcache, "index.dhtin.blob").delete();
        } else {
            // read in new BLOB
            this.buffer = new IndexBuffer(textindexcache, wordOrdering, payloadrow, entityCacheMaxSize, wCacheMaxChunk, wCacheMaxAge, "index.dhtout.blob", log);
        }

        // create collections storage path
        final File textindexcollections = new File(indexPrimaryTextLocation, "RICOLLECTION");
        if (!(textindexcollections.exists())) textindexcollections.mkdirs();
        this.collections = new IndexCollection(
                textindexcollections,
                "collection",
                12,
                Base64Order.enhancedCoder,
                maxCollectionPartition,
                ReferenceRow.urlEntryRow,
                useCommons);
    }
    /* methods for interface Index */

    public void add(final ReferenceContainer entries) {
        assert (entries.row().objectsize == ReferenceRow.urlEntryRow.objectsize);

        // add the entry
        buffer.add(entries);
        cacheFlushControl();
    }

    public void add(final String wordHash, final ReferenceRow entry) throws IOException {
        // add the entry
        buffer.add(wordHash, entry);
        cacheFlushControl();
    }

    public boolean has(final String wordHash) {
        if (buffer.has(wordHash)) return true;
        if (collections.has(wordHash)) return true;
        return false;
    }

    public int count(String key) {
        return buffer.count(key) + collections.count(key);
    }
    public ReferenceContainer get(final String wordHash, final Set<String> urlselection) {
        if (wordHash == null) {
            // wrong input
            return null;
        }

        // get from cache
        ReferenceContainer container;
        container = buffer.get(wordHash, urlselection);

        // get from collection index
        if (container == null) {
            container = collections.get(wordHash, urlselection);
        } else {
            container.addAllUnique(collections.get(wordHash, urlselection));
        }
        if (container == null) return null;
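
        // The merge of buffer and collections may have produced two entries
        // for the same URL (one from each source); the de-duplication below
        // keeps, for each set of doubles, the entry with the most recent
        // last-modified date.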
        // check doubles
        final int beforeDouble = container.size();
        container.sort();
        final ArrayList<RowCollection> d = container.removeDoubles();
        RowCollection set;
        for (int i = 0; i < d.size(); i++) {
            // for each element in the double-set, take the one that is the most recent
            set = d.get(i);
            ReferenceRow e, elm = null;
            long lm = 0;
            for (int j = 0; j < set.size(); j++) {
                e = new ReferenceRow(set.get(j, true));
                if ((elm == null) || (e.lastModified() > lm)) {
                    elm = e;
                    lm = e.lastModified();
                }
            }
            if (elm != null) {
                container.addUnique(elm.toKelondroEntry());
            }
        }
        if (container.size() < beforeDouble) System.out.println("*** DEBUG DOUBLECHECK - removed " + (beforeDouble - container.size()) + " index entries from word container " + container.getWordHash());
        return container;
    }
    public ReferenceContainer delete(final String wordHash) {
        final ReferenceContainer c = new ReferenceContainer(
                wordHash,
                ReferenceRow.urlEntryRow,
                buffer.count(wordHash));
        c.addAllUnique(buffer.delete(wordHash));
        c.addAllUnique(collections.delete(wordHash));
        return c;
    }

    public boolean remove(final String wordHash, final String urlHash) {
        boolean removed = false;
        removed = removed | (buffer.remove(wordHash, urlHash));
        removed = removed | (collections.remove(wordHash, urlHash));
        return removed;
    }

    public int remove(final String wordHash, final Set<String> urlHashes) {
        int removed = 0;
        removed += buffer.remove(wordHash, urlHashes);
        removed += collections.remove(wordHash, urlHashes);
        return removed;
    }
    public synchronized CloneableIterator<ReferenceContainer> references(final String startHash, final boolean rot, final boolean ram) throws IOException {
        final CloneableIterator<ReferenceContainer> i = wordContainers(startHash, ram);
        if (rot) {
            return new RotateIterator<ReferenceContainer>(i, new String(Base64Order.zero(startHash.length())), buffer.size() + ((ram) ? 0 : collections.size()));
        }
        return i;
    }
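
    // Note: wordContainers currently iterates either the RAM buffer or the
    // collections, never both; the merged iteration over both sources is kept
    // below as a commented-out MergeIterator, so containerOrder is prepared
    // but only used by that disabled code path.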
    private synchronized CloneableIterator<ReferenceContainer> wordContainers(final String startWordHash, final boolean ram) throws IOException {
        final Order<ReferenceContainer> containerOrder = new ReferenceContainerOrder(buffer.ordering().clone());
        containerOrder.rotate(ReferenceContainer.emptyContainer(startWordHash, 0));
        if (ram) {
            return buffer.references(startWordHash, false);
        }
        return collections.references(startWordHash, false);
        /*
        return new MergeIterator<ReferenceContainer>(
                indexCache.referenceIterator(startWordHash, false, true),
                collections.referenceIterator(startWordHash, false, false),
                containerOrder,
                ReferenceContainer.containerMergeMethod,
                true);
        */
    }
    public void clear() {
        buffer.clear();
        try {
            collections.clear();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public void close() {
        buffer.close();
        collections.close();
    }
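
    // A word may be present in the buffer and in the collections at the same
    // time, so summing both sizes would double-count shared containers; the
    // maximum of the two is used as a lower-bound estimate instead.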
    public int size() {
        return Math.max(collections.size(), buffer.size());
    }

    public int minMem() {
        return 1024 * 1024 /* indexing overhead */ + buffer.minMem() + collections.minMem();
    }
    /*
     * methods for cache management
     */

    public int getBufferMaxReferences() {
        return buffer.getBufferMaxReferences();
    }

    public long getBufferMinAge() {
        return buffer.getBufferMinAge();
    }

    public long getBufferMaxAge() {
        return buffer.getBufferMaxAge();
    }

    public long getBufferSizeBytes() {
        return buffer.getBufferSizeBytes();
    }

    public void setBufferMaxWordCount(final int maxWords) {
        buffer.setMaxWordCount(maxWords);
    }
    private void cacheFlushControl() {
        // check for forced flush
        int cs = getBufferSize();
        if (cs > 0) {
            // flush elements that are too big. This flushing depends on the fact that the flush rule
            // selects the biggest elements first for flushing. If it does not for any reason, the
            // following loop would not terminate.
            serverProfiling.update("wordcache", Long.valueOf(cs), true);
            // to ensure termination, an additional counter is used
            int l = 0;
            while (this.buffer.size() > 0 && (l++ < 100) && (this.buffer.getBufferMaxReferences() > wCacheMaxChunk)) {
                flushCacheOne(this.buffer);
            }
            // then flush more entries if the size exceeds the maximum size of the cache
            while (this.buffer.size() > 0 &&
                   ((this.buffer.size() > this.buffer.getMaxWordCount()) ||
                    (MemoryControl.available() < collections.minMem()))) {
                flushCacheOne(this.buffer);
            }
            if (getBufferSize() != cs) serverProfiling.update("wordcache", Long.valueOf(getBufferSize()), true);
        }
    }
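
    // cleanupBuffer gives the flush a time budget: it keeps flushing single
    // containers from the buffer until either 'time' milliseconds have passed
    // or the buffer is empty.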
    public void cleanupBuffer(int time) {
        flushCacheUntil(System.currentTimeMillis() + time);
    }

    private synchronized void flushCacheUntil(long timeout) {
        while (System.currentTimeMillis() < timeout && buffer.size() > 0) {
            flushCacheOne(buffer);
        }
    }
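
    // Flush strategy: look at the container of the word with the highest
    // score; if it is oversized (more than wCacheMaxChunk references) flush
    // that one, otherwise flush the buffer's own best flush candidate.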
    private synchronized void flushCacheOne(final IndexBuffer ram) {
        if (ram.size() > 0) collections.add(flushContainer(ram));
    }

    private ReferenceContainer flushContainer(final IndexBuffer ram) {
        String wordHash;
        ReferenceContainer c;
        wordHash = ram.maxScoreWordHash();
        c = ram.get(wordHash, null);
        if ((c != null) && (c.size() > wCacheMaxChunk)) {
            return ram.delete(wordHash);
        } else {
            return ram.delete(ram.bestFlushWordHash());
        }
    }
    public int getBackendSize() {
        return collections.size();
    }

    public int getBufferSize() {
        return buffer.size();
    }

    public ByteOrder ordering() {
        return collections.ordering();
    }
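
    // Merged iteration over both sources: the outer MergeIterator combines the
    // (possibly rotating) buffer iterator with an inner merge of the
    // non-rotating buffer and collections iterators; containers with the same
    // word hash are joined via ReferenceContainer.containerMergeMethod.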
    public CloneableIterator<ReferenceContainer> references(String startWordHash, boolean rot) {
        final Order<ReferenceContainer> containerOrder = new ReferenceContainerOrder(this.buffer.ordering().clone());
        return new MergeIterator<ReferenceContainer>(
                this.buffer.references(startWordHash, rot),
                new MergeIterator<ReferenceContainer>(
                        this.buffer.references(startWordHash, false),
                        this.collections.references(startWordHash, false),
                        containerOrder,
                        ReferenceContainer.containerMergeMethod,
                        true),
                containerOrder,
                ReferenceContainer.containerMergeMethod,
                true);
    }
}