yacy_search_server/source/de/anomic/kelondro/text/IndexCollectionMigration.java
orbiter c8624903c6 full redesign of index access data model:
terms (words) are no longer retrieved by their word hash string, but by a byte[] containing the word hash.
this has strong advantages when RWIs are sorted in the ReferenceContainer Cache and compared with the sun.java TreeMap method, which needed getBytes() and new String() transformations before.
Many thousands of such conversions are now omitted every second, which increases the indexing speed by a factor of two.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5812 6c8d7289-2bf4-0310-a012-ef5d649a1542
2009-04-16 15:29:00 +00:00

344 lines
13 KiB
Java

// IndexCollectionMigration.java
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 30.03.2009 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2009-03-13 11:34:51 +0100 (Fr, 13 Mrz 2009) $
// $LastChangedRevision: 5709 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.kelondro.text;
import java.io.File;
import java.io.IOException;
import java.util.Set;
import de.anomic.kelondro.index.Row;
import de.anomic.kelondro.order.Base64Order;
import de.anomic.kelondro.order.ByteOrder;
import de.anomic.kelondro.order.CloneableIterator;
import de.anomic.kelondro.order.MergeIterator;
import de.anomic.kelondro.order.Order;
import de.anomic.kelondro.order.RotateIterator;
import de.anomic.kelondro.text.Index;
import de.anomic.kelondro.text.IndexCollection;
import de.anomic.kelondro.text.ReferenceContainer;
import de.anomic.kelondro.text.ReferenceContainerOrder;
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
import de.anomic.kelondro.util.FileUtils;
import de.anomic.kelondro.util.Log;
public final class IndexCollectionMigration<ReferenceType extends Reference> extends AbstractBufferedIndex<ReferenceType> implements Index<ReferenceType>, BufferedIndex<ReferenceType> {
// Target index: new entries and every container taken out of the legacy
// collection index are written here.
private final IndexCell<ReferenceType> cell;
// Legacy collection index, opened for migration only. It is null when no
// RICOLLECTION directory exists or when the opened collection index is empty.
private IndexCollection<ReferenceType> collections;
// Dispatcher instance that is handed to the IndexCell at construction time.
private final IODispatcher<ReferenceType> merger;
/**
 * Opens the cell index in the RICELL sub-directory of the given location and
 * prepares the migration of the two legacy structures RICACHE and RICOLLECTION.
 *
 * <ul>
 * <li>RICACHE: the files "index.dhtout.blob" and "index.dhtin.blob" are moved
 *     into the cell and mounted there; afterwards the cache directory is
 *     deleted. If a blob cannot be moved, nothing is mounted for it and the
 *     cache directory is left untouched, so no un-migrated data is deleted.</li>
 * <li>RICOLLECTION: the collection index is opened. If it is empty it is
 *     closed and its directory is deleted; otherwise it is kept open so that
 *     its containers can be moved into the cell when they are accessed.</li>
 * </ul>
 *
 * @param indexPrimaryTextLocation directory that contains RICELL, RICACHE and RICOLLECTION
 * @param factory reference factory, passed to the super class, the cell and the collection index
 * @param wordOrdering ordering passed to the IndexCell
 * @param payloadrow not used here; cell and collection index are always opened with WordReferenceRow.urlEntryRow
 * @param entityCacheMaxSize passed to the IndexCell constructor
 * @param targetFileSize passed to the IndexCell constructor
 * @param maxFileSize passed to the IndexCell constructor
 * @param merger dispatcher passed to the IndexCell constructor
 * @param log not used by this constructor
 * @throws IOException if opening the cell, mounting a blob or opening the collection index fails
 */
public IndexCollectionMigration (
        final File indexPrimaryTextLocation,
        final ReferenceFactory<ReferenceType> factory,
        final ByteOrder wordOrdering,
        final Row payloadrow,
        final int entityCacheMaxSize,
        final long targetFileSize,
        final long maxFileSize,
        final IODispatcher<ReferenceType> merger,
        final Log log) throws IOException {
    super(factory);
    this.merger = merger;
    final File celldir = new File(indexPrimaryTextLocation, "RICELL");
    this.cell = new IndexCell<ReferenceType>(
            celldir,
            factory,
            wordOrdering,
            WordReferenceRow.urlEntryRow,
            entityCacheMaxSize,
            targetFileSize,
            maxFileSize,
            this.merger);

    final File textindexcache = new File(indexPrimaryTextLocation, "RICACHE");
    if (textindexcache.exists()) {
        // migrate both cache blobs into the RICELL directory; always attempt both
        boolean allMigrated = migrateCacheBlob(textindexcache, "index.dhtout.blob");
        allMigrated = migrateCacheBlob(textindexcache, "index.dhtin.blob") && allMigrated;
        // delete everything else, but only if no blob was left behind
        if (allMigrated) {
            deleteDirectory(textindexcache);
        }
    }

    // open collections, this is for migration only.
    final File textindexcollections = new File(indexPrimaryTextLocation, "RICOLLECTION");
    IndexCollection<ReferenceType> legacy = null;
    if (textindexcollections.exists()) {
        legacy = new IndexCollection<ReferenceType>(
                textindexcollections,
                "collection",
                factory,
                12,
                Base64Order.enhancedCoder,
                BufferedIndexCollection.maxCollectionPartition,
                WordReferenceRow.urlEntryRow,
                false);
        if (legacy.size() == 0) {
            // nothing left to migrate: close the index and delete everything here
            legacy.close();
            legacy = null;
            deleteDirectory(textindexcollections);
        }
    }
    this.collections = legacy;
}

/**
 * Moves one blob file of the legacy cache directory into the cell and mounts it.
 *
 * @return true if the file does not exist or was moved and mounted;
 *         false if the rename failed (the file then stays where it is)
 */
private boolean migrateCacheBlob(final File cacheDir, final String blobName) throws IOException {
    final File source = new File(cacheDir, blobName);
    if (!source.exists()) {
        return true;
    }
    final File target = this.cell.newContainerBLOBFile();
    if (!source.renameTo(target)) {
        // renameTo reports failure through its return value only
        return false;
    }
    this.cell.mountBLOBFile(target);
    return true;
}

/**
 * Deletes all direct children of the given directory and then the directory itself.
 * File.list() returns null for non-directories and on I/O errors; that case is skipped.
 */
private static void deleteDirectory(final File dir) {
    final String[] names = dir.list();
    if (names != null) {
        for (final String name : names) {
            FileUtils.deletedelete(new File(dir, name));
        }
    }
    FileUtils.deletedelete(dir);
}
/* methods for interface Index */
/**
 * Stores a container in the cell. If the legacy collection index still holds
 * a container for the same term hash, that container is removed from there,
 * the given entries are merged into it and the merged container is stored.
 *
 * @param entries the container to add
 * @throws IOException if writing to the cell fails
 */
public void add(final ReferenceContainer<ReferenceType> entries) throws IOException {
    assert (entries.row().objectsize == WordReferenceRow.urlEntryRow.objectsize);
    ReferenceContainer<ReferenceType> toStore = entries;
    if (this.collections != null) {
        final ReferenceContainer<ReferenceType> legacy = this.collections.delete(entries.getTermHash());
        if (legacy != null) {
            legacy.merge(entries);
            toStore = legacy;
        }
    }
    this.cell.add(toStore);
}
/**
 * Adds a single reference for a term. If the legacy collection index still
 * holds a container for the term, that container is removed from there, the
 * entry is added to it and the whole container is stored in the cell.
 *
 * @param wordHash the term hash
 * @param entry the reference to add
 * @throws IOException if writing to the cell fails
 */
public void add(final byte[] wordHash, final ReferenceType entry) throws IOException {
    final ReferenceContainer<ReferenceType> legacy =
            (this.collections == null) ? null : this.collections.delete(wordHash);
    if (legacy == null) {
        this.cell.add(wordHash, entry);
        return;
    }
    legacy.add(entry);
    this.cell.add(legacy);
}
/**
 * Tests whether the index knows the given term. A container found in the
 * legacy collection index is moved into the cell as a side effect; in that
 * case true is returned even if writing to the cell failed (the failure is
 * only printed).
 *
 * @param wordHash the term hash
 * @return true if the legacy index held the term, otherwise the answer of the cell
 */
public boolean has(final byte[] wordHash) {
    if (this.collections == null) {
        return this.cell.has(wordHash);
    }
    final ReferenceContainer<ReferenceType> legacy = this.collections.delete(wordHash);
    if (legacy == null) {
        return this.cell.has(wordHash);
    }
    try {
        this.cell.add(legacy);
    } catch (final IOException failure) {
        failure.printStackTrace();
    }
    return true;
}
/**
 * Returns the count the cell reports for the given term. A container still
 * present in the legacy collection index is moved into the cell first; a
 * failure while writing it is only printed.
 *
 * @param wordHash the term hash
 * @return the result of cell.count(wordHash) after the migration attempt
 */
public int count(byte[] wordHash) {
    if (this.collections != null) {
        final ReferenceContainer<ReferenceType> legacy = this.collections.delete(wordHash);
        if (legacy != null) {
            try {
                this.cell.add(legacy);
            } catch (final IOException failure) {
                failure.printStackTrace();
            }
        }
    }
    // every path of the migration attempt ends with the same query
    return this.cell.count(wordHash);
}
/**
 * Reads the container of a term from the cell. A container still present in
 * the legacy collection index is moved into the cell before the read.
 *
 * @param wordHash the term hash; null is treated as wrong input
 * @param urlselection passed unchanged to the cell
 * @return the result of the cell lookup, or null if wordHash is null
 * @throws IOException if writing to or reading from the cell fails
 */
public ReferenceContainer<ReferenceType> get(final byte[] wordHash, final Set<String> urlselection) throws IOException {
    // wrong input
    if (wordHash == null) return null;

    final ReferenceContainer<ReferenceType> legacy =
            (this.collections == null) ? null : this.collections.delete(wordHash);
    if (legacy != null) {
        this.cell.add(legacy);
    }
    return this.cell.get(wordHash, urlselection);
}
/**
 * Removes the container of a term from the cell and, if present, from the
 * legacy collection index.
 *
 * @param wordHash the term hash
 * @return the container removed from the cell, the one removed from the
 *         legacy index, the merge of both if both existed, or null if neither existed
 * @throws IOException if the deletion in the cell fails
 */
public ReferenceContainer<ReferenceType> delete(final byte[] wordHash) throws IOException {
    final ReferenceContainer<ReferenceType> fromCell = this.cell.delete(wordHash);
    if (this.collections == null) {
        return fromCell;
    }
    final ReferenceContainer<ReferenceType> fromCollections = this.collections.delete(wordHash);
    if (fromCell == null) {
        return fromCollections;
    }
    if (fromCollections == null) {
        return fromCell;
    }
    return fromCell.merge(fromCollections);
}
/**
 * Removes one url reference from a term. A container still present in the
 * legacy collection index is moved into the cell first, then the removal is
 * executed on the cell.
 *
 * @param wordHash the term hash
 * @param urlHash the url hash to remove
 * @return the result of cell.remove(wordHash, urlHash)
 * @throws IOException if writing to the cell fails
 */
public boolean remove(final byte[] wordHash, final String urlHash) throws IOException {
    final ReferenceContainer<ReferenceType> legacy =
            (this.collections == null) ? null : this.collections.delete(wordHash);
    if (legacy != null) {
        this.cell.add(legacy);
    }
    return this.cell.remove(wordHash, urlHash);
}
/**
 * Removes a set of url references from a term. A container still present in
 * the legacy collection index is moved into the cell first, then the removal
 * is executed on the cell.
 *
 * @param wordHash the term hash
 * @param urlHashes the url hashes to remove
 * @return the result of cell.remove(wordHash, urlHashes)
 * @throws IOException if writing to the cell fails
 */
public int remove(final byte[] wordHash, final Set<String> urlHashes) throws IOException {
    final ReferenceContainer<ReferenceType> legacy =
            (this.collections == null) ? null : this.collections.delete(wordHash);
    if (legacy != null) {
        this.cell.add(legacy);
    }
    return this.cell.remove(wordHash, urlHashes);
}
/**
 * Iterates over the containers starting at the given hash.
 *
 * @param startHash the hash where the iteration starts; its length is used to build the rotation start key
 * @param rot if true the iterator is wrapped into a RotateIterator
 * @param ram passed through to wordContainers; if true the size of the legacy
 *        collection index is not added to the size given to the RotateIterator
 * @return the container iterator, wrapped into a RotateIterator if rot is set
 * @throws IOException if the underlying iterators cannot be created
 */
public synchronized CloneableIterator<ReferenceContainer<ReferenceType>> references(final byte[] startHash, final boolean rot, final boolean ram) throws IOException {
    final CloneableIterator<ReferenceContainer<ReferenceType>> i = wordContainers(startHash, ram);
    if (!rot) {
        return i;
    }
    // collections is null once the migration is finished (or was never needed);
    // it must not be dereferenced in that case
    final int legacySize = (ram || this.collections == null) ? 0 : this.collections.size();
    return new RotateIterator<ReferenceContainer<ReferenceType>>(i, Base64Order.zero(startHash.length), this.cell.size() + legacySize);
}
/**
 * Creates the container iterator used by references(startHash, rot, ram).
 *
 * <ul>
 * <li>ram == true: cell.references(startWordHash, true)</li>
 * <li>no legacy collection index: cell.references(startWordHash, false)</li>
 * <li>otherwise: a merge of the cell iterator and the legacy collection iterator</li>
 * </ul>
 *
 * The container order is needed by the merge only, so it is built on that path only.
 *
 * @param startWordHash the hash where the iteration starts
 * @param ram selects the first case above
 * @return the iterator as described
 * @throws IOException declared for callers; not thrown by the visible code paths
 */
private synchronized CloneableIterator<ReferenceContainer<ReferenceType>> wordContainers(final byte[] startWordHash, final boolean ram) throws IOException {
    if (ram) {
        return cell.references(startWordHash, true);
    }
    if (collections == null) {
        return cell.references(startWordHash, false);
    }
    // merge path: build the order and rotate it to the start hash
    final Order<ReferenceContainer<ReferenceType>> containerOrder = new ReferenceContainerOrder<ReferenceType>(factory, cell.ordering().clone());
    final ReferenceContainer<ReferenceType> emptyContainer = ReferenceContainer.emptyContainer(factory, startWordHash, 0);
    containerOrder.rotate(emptyContainer);
    return new MergeIterator<ReferenceContainer<ReferenceType>>(
            cell.references(startWordHash, false),
            collections.references(startWordHash, false),
            containerOrder,
            ReferenceContainer.containerMergeMethod,
            true);
}
/**
 * Clears the cell and, if it is open, the legacy collection index.
 * An IOException of either step is printed and does not stop the other step.
 */
public void clear() {
    try {
        this.cell.clear();
    } catch (final IOException cellFailure) {
        cellFailure.printStackTrace();
    }
    if (this.collections == null) {
        return;
    }
    try {
        this.collections.clear();
    } catch (final IOException collectionFailure) {
        collectionFailure.printStackTrace();
    }
}
/**
 * Closes the cell and, if it is open, the legacy collection index.
 */
public void close() {
    this.cell.close();
    if (this.collections == null) {
        return;
    }
    this.collections.close();
}
/**
 * Returns the size of the cell, or the larger of the cell size and the legacy
 * collection size while the legacy collection index is still open.
 */
public int size() {
    if (this.collections == null) {
        return this.cell.size();
    }
    return java.lang.Math.max(this.collections.size(), this.cell.size());
}
/**
 * Sums up a fixed 1 MiB indexing overhead, the minMem value of the cell and,
 * if it is open, the minMem value of the legacy collection index.
 */
public int minMem() {
    int required = 1024 * 1024; // indexing overhead
    required += this.cell.minMem();
    if (this.collections != null) {
        required += this.collections.minMem();
    }
    return required;
}
/*
* methods for cache management
*/
/**
 * Delegates to the cell.
 *
 * @return the value of cell.getBufferMaxReferences()
 */
public int getBufferMaxReferences() {
    final int maxReferences = this.cell.getBufferMaxReferences();
    return maxReferences;
}
/**
 * Delegates to the cell.
 *
 * @return the value of cell.getBufferMinAge()
 */
public long getBufferMinAge() {
    final long minAge = this.cell.getBufferMinAge();
    return minAge;
}
/**
 * Delegates to the cell.
 *
 * @return the value of cell.getBufferMaxAge()
 */
public long getBufferMaxAge() {
    final long maxAge = this.cell.getBufferMaxAge();
    return maxAge;
}
/**
 * Delegates to the cell.
 *
 * @return the value of cell.getBufferSizeBytes()
 */
public long getBufferSizeBytes() {
    final long sizeBytes = this.cell.getBufferSizeBytes();
    return sizeBytes;
}
/**
 * Forwards the setting to the cell; the legacy collection index is not involved.
 *
 * @param maxWords value passed to cell.setBufferMaxWordCount
 */
public void setBufferMaxWordCount(final int maxWords) {
    this.cell.setBufferMaxWordCount(maxWords);
}
/**
 * Returns the size of the legacy collection index while it is open,
 * otherwise the backend size reported by the cell.
 */
public int getBackendSize() {
    if (this.collections != null) {
        return this.collections.size();
    }
    return this.cell.getBackendSize();
}
/**
 * Delegates to the cell.
 *
 * @return the value of cell.getBufferSize()
 */
public int getBufferSize() {
    final int bufferSize = this.cell.getBufferSize();
    return bufferSize;
}
/**
 * Returns the ordering object of the cell.
 */
public ByteOrder ordering() {
    final ByteOrder cellOrdering = this.cell.ordering();
    return cellOrdering;
}
/**
 * Iterates over the containers starting at the given hash. Without a legacy
 * collection index the request is answered by the cell alone; otherwise the
 * iterators of the cell and of the legacy collection index are merged.
 *
 * @param startWordHash the hash where the iteration starts
 * @param rot passed to the cell when no legacy collection index is open
 * @return the cell iterator or the merged iterator
 */
public CloneableIterator<ReferenceContainer<ReferenceType>> references(byte[] startWordHash, boolean rot) {
    final Order<ReferenceContainer<ReferenceType>> mergeOrder =
            new ReferenceContainerOrder<ReferenceType>(factory, this.cell.ordering().clone());
    if (this.collections != null) {
        // NOTE(review): on this path both source iterators are requested with
        // rot == false and the result is not wrapped, so the rot argument has
        // no effect here - confirm that this is intended.
        return new MergeIterator<ReferenceContainer<ReferenceType>>(
                this.cell.references(startWordHash, false),
                this.collections.references(startWordHash, false),
                mergeOrder,
                ReferenceContainer.containerMergeMethod,
                true);
    }
    return this.cell.references(startWordHash, rot);
}
/**
 * Forwards the cleanup request to the cell; the legacy collection index is not involved.
 *
 * @param time value passed to cell.cleanupBuffer
 */
public void cleanupBuffer(int time) {
    cell.cleanupBuffer(time);
}
}