orbiter 8444357291 added a new row iterator to the kelondro table files that enumerates rows
without ordering them by the primary key. The result is a very fast enumeration of the Eco table data structure. Other table data types are not affected.
The new enumerator is used for the URL export function that can be accessed from the online interface (Index Administration -> URL References -> Export). This export should now be much faster if all URL database files are of type Eco.
The new enumeration is also used by other functions in YaCy, e.g. the initialization of the crawl balancer and the initialization of YaCy News.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5647 6c8d7289-2bf4-0310-a012-ef5d649a1542
2009-02-24 10:40:20 +00:00

627 lines
24 KiB

// kelondroRowSet.java
// (C) 2006 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 20.06.2006 on http://www.anomic.de
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.kelondro.index;
import java.io.DataInput;
import java.io.IOException;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Random;
import de.anomic.kelondro.order.Base64Order;
import de.anomic.kelondro.order.ByteOrder;
import de.anomic.kelondro.order.CloneableIterator;
import de.anomic.kelondro.order.NaturalOrder;
public class RowSet extends RowCollection implements ObjectIndex, Iterable<Row.Entry> {
private static final int collectionReSortLimit = 400;
// Constructor that takes another RowSet as its only argument.
// NOTE(review): neither a constructor body nor a closing brace is visible after this line in this
// view — confirm against the complete file before relying on this declaration.
public RowSet(final RowSet rs) {
/**
 * Creates a row set on top of an existing chunk cache.
 * All arguments are handed to the super constructor unchanged.
 *
 * @param rowdef      row definition; its objectOrder must not be null (asserted)
 * @param objectCount object count passed to the super constructor
 * @param cache       byte array passed to the super constructor
 * @param sortBound   sort bound passed to the super constructor
 */
public RowSet(final Row rowdef, final int objectCount, final byte[] cache, final int sortBound) {
    super(rowdef, objectCount, cache, sortBound);
    assert (rowdef.objectOrder != null);
}
/**
 * Creates a row set from a row definition and an object count.
 * Both arguments are handed to the super constructor unchanged.
 *
 * @param rowdef      row definition; its objectOrder must not be null (asserted)
 * @param objectCount object count passed to the super constructor
 */
public RowSet(final Row rowdef, final int objectCount) {
    super(rowdef, objectCount);
    assert (rowdef.objectOrder != null);
}
/**
 * Imports an exported collection.
 *
 * @param rowdef                           row definition; its objectOrder must not be null (asserted)
 * @param exportedCollectionRowEnvironment row entry that is handed to the super constructor
 */
public RowSet(final Row rowdef, final Row.Entry exportedCollectionRowEnvironment) {
    super(rowdef, exportedCollectionRowEnvironment);
    assert (rowdef.objectOrder != null);
}
public void setOrdering(final ByteOrder newOrder, final int newColumn) {
assert newOrder != null;
if ((rowdef.objectOrder == null) ||
(!(rowdef.objectOrder.signature().equals(newOrder.signature()))) ||
(newColumn != rowdef.primaryKeyIndex)) {
rowdef.setOrdering(newOrder, newColumn);
this.sortBound = 0;
public static RowSet importRowSet(final DataInput is, final Row rowdef) throws IOException {
final byte[] byte6 = new byte[6];
final int size = is.readInt();
//short lastread = (short) kelondroNaturalOrder.decodeLong(byte2);
//short lastwrote = (short) kelondroNaturalOrder.decodeLong(byte2);
//String orderkey = new String(byte2);
final int orderbound = is.readInt();
final byte[] chunkcache = new byte[size * rowdef.objectsize];
return new RowSet(rowdef, size, chunkcache, orderbound);
public static RowSet importRowSet(byte[] b, final Row rowdef) {
final int size = (int) NaturalOrder.decodeLong(b, 0, 4);
final int orderbound = (int) NaturalOrder.decodeLong(b, 10, 4);
final byte[] chunkcache = new byte[size * rowdef.objectsize];
assert b.length - exportOverheadSize == size * rowdef.objectsize;
System.arraycopy(b, 14, chunkcache, 0, chunkcache.length);
return new RowSet(rowdef, size, chunkcache, orderbound);
// Resets this row set.
// NOTE(review): the method body and closing brace are not visible in this view — confirm what
// the reset actually does against the complete file.
public void reset() {
public synchronized boolean has(final byte[] key) {
final int index = find(key, 0, key.length);
return index >= 0;
public synchronized Row.Entry get(final byte[] key) {
return get(key, 0, key.length);
private Row.Entry get(final byte[] key, final int astart, final int alength) {
final int index = find(key, astart, alength);
final Row.Entry entry = (index >= 0) ? get(index, true) : null;
return entry;
public synchronized void putMultiple(final List<Row.Entry> rows) {
final Iterator<Row.Entry> i = rows.iterator();
while (i.hasNext()) put(i.next());
public Row.Entry put(final Row.Entry row, final Date entryDate) {
return put(row);
// Puts an entry into the set, replacing an existing entry that has the same primary key.
// Returns the entry that was replaced; oldentry stays null when no existing entry was found.
// NOTE(review): this view appears to have lost closing braces and at least the bodies of the
// re-sort check and of the "index < 0" branch — confirm against the complete file.
public synchronized Row.Entry put(final Row.Entry entry) {
assert (entry != null);
assert (entry.getPrimaryKeyBytes() != null);
int index = -1;
Row.Entry oldentry = null;
// when reaching a specific amount of un-sorted entries, re-sort all
if ((this.chunkcount - this.sortBound) > collectionReSortLimit) {
// search for the primary key inside the entry's bytes; the key offset is 0 when primaryKeyIndex is negative
index = find(entry.bytes(), (rowdef.primaryKeyIndex < 0) ? 0 :super.rowdef.colstart[rowdef.primaryKeyIndex], super.rowdef.primaryKeyLength);
if (index < 0) {
} else {
// key already present: remember the old entry and overwrite it in place
oldentry = get(index, true);
int sb = this.sortBound; // save the sortBound, because it is not altered (we replace at the same place)
set(index, entry); // this may alter the sortBound, which we will revert in the next step
this.sortBound = sb; // revert a sortBound altering
return oldentry;
private synchronized Row.Entry remove(final byte[] a, final int start, final int length) {
final int index = find(a, start, length);
if (index < 0) return null;
final Row.Entry entry = super.get(index, true);
super.removeRow(index, true); // keep order of collection!
int findagainindex = 0;
assert (findagainindex = find(a, start, length)) < 0 : "remove: chunk found again at index position (after remove) " + findagainindex + ", index(before) = " + index + ", inset=" + NaturalOrder.arrayList(super.chunkcache, super.rowdef.objectsize * findagainindex, length) + ", searchkey=" + NaturalOrder.arrayList(a, start, length); // check if the remove worked
return entry;
* remove a byte[] from the set.
* if the entry was found, return the entry, but delete the entry from the set
* if the entry was not found, return null.
public Row.Entry remove(final byte[] a) {
return remove(a, 0, a.length);
// Locates the chunk whose key equals the given slice of a byte array.
// Without an object order the whole collection is scanned; otherwise the sorted area
// (below sortBound) is searched by binary search and the unsorted area by a linear scan.
// NOTE(review): closing braces and the body of the "too many unsorted entries" branch are not
// visible in this view — confirm against the complete file.
private int find(final byte[] a, final int astart, final int alength) {
// returns the chunknumber; -1 if not found
if (rowdef.objectOrder == null) return iterativeSearch(a, astart, alength, 0, this.chunkcount);
if ((this.chunkcount - this.sortBound) > (collectionReSortLimit << 1)) {
// for a Base64Order with a sorted area above 4000 entries the key is compiled into a pivot first
if ((this.rowdef.objectOrder != null) && (this.rowdef.objectOrder instanceof Base64Order) && (this.sortBound > 4000)) {
// first try to find in sorted area
assert this.rowdef.objectOrder.wellformed(a, astart, alength) : "not wellformed: " + new String(a, astart, alength);
final byte[] compiledPivot = compilePivot(a, astart, alength);
final int p = binarySearchCompiledPivot(compiledPivot);
if (p >= 0) return p;
// then find in unsorted area
return iterativeSearchCompiledPivot(compiledPivot, this.sortBound, this.chunkcount);
} else {
// first try to find in sorted area
final int p = binarySearch(a, astart, alength);
if (p >= 0) return p;
// then find in unsorted area
return iterativeSearch(a, astart, alength, this.sortBound, this.chunkcount);
private int iterativeSearch(final byte[] key, final int astart, final int alength, final int leftBorder, final int rightBound) {
// returns the chunknumber
if (rowdef.objectOrder == null) {
for (int i = leftBorder; i < rightBound; i++) {
if (match(key, astart, alength, i)) return i;
return -1;
// we dont do a special handling of kelondroBase64Order here, because tests showed that this produces too much overhead
for (int i = leftBorder; i < rightBound; i++) {
if (compare(key, astart, alength, i) == 0) return i;
return -1;
private int iterativeSearchCompiledPivot(final byte[] compiledPivot, final int leftBorder, final int rightBound) {
// returns the chunknumber
assert (rowdef.objectOrder != null);
assert (rowdef.objectOrder instanceof Base64Order);
for (int i = leftBorder; i < rightBound; i++) {
if (comparePivot(compiledPivot, i) == 0) return i;
return -1;
private int binarySearch(final byte[] key, final int astart, final int alength) {
// returns the exact position of the key if the key exists,
// or -1 if the key does not exist
assert (rowdef.objectOrder != null);
int l = 0;
int rbound = this.sortBound;
int p = 0;
int d;
while (l < rbound) {
p = l + ((rbound - l) >> 1);
d = compare(key, astart, alength, p);
if (d == 0) return p;
if (d < 0) rbound = p; else l = p + 1;
return -1;
private int binarySearchCompiledPivot(final byte[] compiledPivot) {
// returns the exact position of the key if the key exists,
// or -1 if the key does not exist
assert (rowdef.objectOrder != null);
assert (rowdef.objectOrder instanceof Base64Order);
int l = 0;
int rbound = this.sortBound;
int p = 0;
int d;
while (l < rbound) {
p = l + ((rbound - l) >> 1);
d = comparePivot(compiledPivot, p);
if (d == 0) return p;
if (d < 0) rbound = p; else l = p + 1;
return -1;
private int binaryPosition(final byte[] key, final int astart, final int alength) {
// returns the exact position of the key if the key exists,
// or a position of an entry that is greater than the key if the
// key does not exist
assert (rowdef.objectOrder != null);
int l = 0;
int rbound = this.sortBound;
int p = 0;
int d;
while (l < rbound) {
p = l + ((rbound - l) >> 1);
d = compare(key, astart, alength, p);
if (d == 0) return p;
if (d < 0) rbound = p; else l = p + 1;
return l;
public synchronized Iterator<byte[]> keys() {
return super.keys(true);
public synchronized CloneableIterator<byte[]> keys(final boolean up, final byte[] firstKey) {
return new keyIterator(up, firstKey);
public class keyIterator implements CloneableIterator<byte[]> {
private final boolean up;
private final byte[] first;
private int p;
final int bound;
public keyIterator(final boolean up, final byte[] firstKey) {
// see that all elements are sorted
this.up = up;
this.first = firstKey;
this.bound = sortBound;
if (first == null) {
p = 0;
} else {
p = binaryPosition(first, 0, first.length); // check this to find bug in DHT selection enumeration
//System.out.println("binaryposition for key " + new String(firstKey) + " is " + p);
public keyIterator clone(final Object second) {
return new keyIterator(up, (byte[]) second);
public boolean hasNext() {
if (p < 0) return false;
if (p >= size()) return false;
if (up) {
return p < bound;
} else {
return p >= 0;
public byte[] next() {
final byte[] key = getKey(p);
if (up) p++; else p--;
return key;
public void remove() {
throw new UnsupportedOperationException();
public synchronized Iterator<Row.Entry> iterator() {
// iterates kelondroRow.Entry - type entries
return super.iterator();
public synchronized CloneableIterator<Row.Entry> rows(final boolean up, final byte[] firstKey) {
return new rowIterator(up, firstKey);
public synchronized CloneableIterator<Row.Entry> rows() {
return new rowIterator(true, null);
public class rowIterator implements CloneableIterator<Row.Entry> {
private final boolean up;
private final byte[] first;
private int p;
final int bound;
public rowIterator(final boolean up, final byte[] firstKey) {
// see that all elements are sorted
this.up = up;
this.first = firstKey;
this.bound = sortBound;
if (first == null) {
p = 0;
} else {
p = binaryPosition(first, 0, first.length); // check this to find bug in DHT selection enumeration
//System.out.println("binaryposition for key " + new String(firstKey) + " is " + p);
public rowIterator clone(final Object second) {
return new rowIterator(up, (byte[]) second);
public boolean hasNext() {
if (p < 0) return false;
if (p >= size()) return false;
if (up) {
return p < bound;
} else {
return p >= 0;
public Row.Entry next() {
final Row.Entry entry = get(p, true);
if (up) p++; else p--;
return entry;
public void remove() {
throw new UnsupportedOperationException();
* merge this row collection with another row collection.
* The resulting collection is sorted and does not contain any doubles, which are also removed during the merge.
* The new collection may be a copy of one of the old one, or can be an alteration of one of the input collections
* After this merge, none of the input collections should be used, because they can be altered
* @param c
* @return
public RowSet merge(RowSet c) {
if (this.isSorted() && this.size() >= c.size()) {
return mergeInsert(this, c);
return mergeEnum(this, c);
// Merge variant that is meant to add the rows of a smaller collection to an already sorted one
// and return the sorted collection.
// NOTE(review): the signature uses the type names kelondroRowSet/kelondroRowCollection while the
// rest of this file uses RowSet/RowCollection, and the loop body and closing braces are not visible
// in this view — presumably legacy or disabled code; confirm against the complete file.
private static kelondroRowSet mergeInsert(kelondroRowSet sorted, kelondroRowCollection small) {
assert sorted.rowdef == small.rowdef;
assert sorted.isSorted();
assert small.size() <= sorted.size();
// make room for all rows of the smaller collection
sorted.ensureSize(sorted.size() + small.size());
for (int i = 0; i < small.size(); i++) {
return sorted;
* merge this row collection with another row collection using an simultanous iteration of the input collections
* the current collection is not altered in any way, the returned collection is a new collection with copied content.
* @param c
* @return
protected static RowSet mergeEnum(RowCollection c0, RowCollection c1) {
assert c0.rowdef == c1.rowdef;
RowSet r = new RowSet(c0.rowdef, c0.size() + c1.size());
int c0i = 0, c1i = 0;
int c0p, c1p;
int o;
final int objectsize = c0.rowdef.objectsize;
while (c0i < c0.size() && c1i < c1.size()) {
c0p = c0i * objectsize;
c1p = c1i * objectsize;
o = c0.rowdef.objectOrder.compare(
c0.chunkcache, c0p, c0.rowdef.primaryKeyLength,
c1.chunkcache, c1p, c0.rowdef.primaryKeyLength);
if (o == 0) {
r.addSorted(c0.chunkcache, c0p, objectsize);
if (o < 0) {
r.addSorted(c0.chunkcache, c0p, objectsize);
if (o > 0) {
r.addSorted(c1.chunkcache, c1p, objectsize);
while (c0i < c0.size()) {
r.addSorted(c0.chunkcache, c0i * objectsize, objectsize);
while (c1i < c1.size()) {
r.addSorted(c1.chunkcache, c1i * objectsize, objectsize);
return r;
// Ad-hoc command-line test driver for this class; results are printed to System.out.
// NOTE(review): this view of main mixes the current type names (RowSet, Row, NaturalOrder) with
// older ones (kelondroRowSet, kelondroRow, kelondroNaturalOrder) and declares locals such as
// entry, start, c, random and key more than once. It looks like block-comment delimiters, closing
// braces and other single-token lines were lost, so parts of this body are presumably disabled
// legacy tests — confirm against the complete file before changing anything here.
public static void main(final String[] args) {
// sort/uniq-test
kelondroRow rowdef = new kelondroRow("Cardinal key-4 {b256}, byte[] payload-1", kelondroNaturalOrder.naturalOrder, 0);
kelondroRowSet rs = new kelondroRowSet(rowdef, 0);
Random random = new Random(0);
kelondroRow.Entry entry;
for (int i = 0; i < 10000000; i++) {
entry = rowdef.newEntry();
entry.setCol(0, Math.abs(random.nextLong() % 1000000));
entry.setCol(1, "a".getBytes());
System.out.println("before sort, size = " + rs.size());
System.out.println("after sort, before uniq, size = " + rs.size());
System.out.println("after uniq, size = " + rs.size());
// small functional test: add ten words twice, remove one, then iterate and print
final String[] test = { "eins", "zwei", "drei", "vier", "fuenf", "sechs", "sieben", "acht", "neun", "zehn" };
final RowSet d = new RowSet(new Row("byte[] key-10, Cardinal x-4 {b256}", NaturalOrder.naturalOrder, 0), 0);
d.setOrdering(NaturalOrder.naturalOrder, 0);
for (int ii = 0; ii < test.length; ii++) d.add(test[ii].getBytes());
for (int ii = 0; ii < test.length; ii++) d.add(test[ii].getBytes());
d.remove("fuenf".getBytes(), 0, 5);
final Iterator<Row.Entry> ii = d.iterator();
String s;
System.out.print("INPUT-ITERATOR: ");
Row.Entry entry;
while (ii.hasNext()) {
entry = ii.next();
s = new String(entry.getColBytes(0)).trim();
System.out.print(s + ", ");
if (s.equals("drei")) ii.remove();
System.out.println("INPUT-TOSTRING: " + d.toString());
System.out.println("SORTED : " + d.toString());
System.out.println("UNIQ : " + d.toString());
System.out.println("TRIM : " + d.toString());
// second test
c = new kelondroRowSet(new kelondroRow(new int[]{10, 3}));
c.setOrdering(kelondroNaturalOrder.naturalOrder, 0);
Random rand = new Random(0);
long start = System.currentTimeMillis();
long t, d = 0;
String w;
for (long k = 0; k < 60000; k++) {
t = System.currentTimeMillis();
w = "a" + Long.toString(rand.nextLong());
if (k % 10000 == 0)
System.out.println("added " + k + " entries in " +
((t - start) / 1000) + " seconds, " +
(((t - start) > 1000) ? (k / ((t - start) / 1000)) : k) +
" entries/second, size = " + c.size());
System.out.println("bevore sort: " + ((System.currentTimeMillis() - start) / 1000) + " seconds");
System.out.println("after sort: " + ((System.currentTimeMillis() - start) / 1000) + " seconds");
System.out.println("after uniq: " + ((System.currentTimeMillis() - start) / 1000) + " seconds");
System.out.println("RESULT SIZE: " + c.size());
// third test
c = new kelondroRowSet(new kelondroRow(new int[]{10, 3}), 60000);
c.setOrdering(kelondroNaturalOrder.naturalOrder, 0);
rand = new Random(0);
start = System.currentTimeMillis();
d = 0;
for (long k = 0; k < 60000; k++) {
t = System.currentTimeMillis();
w = "a" + Long.toString(rand.nextLong());
if (c.get(w.getBytes(), 0, 10) == null) c.add(w.getBytes()); else d++;
if (k % 10000 == 0)
System.out.println("added " + k + " entries in " +
((t - start) / 1000) + " seconds, " +
(((t - start) > 1000) ? (k / ((t - start) / 1000)) : k) +
" entries/second, " + d + " double, size = " + c.size() +
", sum = " + (c.size() + d));
System.out.println("RESULT SIZE: " + c.size());
// performance test for put
long start = System.currentTimeMillis();
kelondroRowSet c = new kelondroRowSet(new kelondroRow("byte[] a-12, byte[] b-12"), 0);
Random random = new Random(0);
byte[] key;
for (int i = 0; i < 100000; i++) {
key = randomHash(random);
c.put(c.rowdef.newEntry(new byte[][]{key, key}));
if (i % 1000 == 0) System.out.println(i + " entries. ");
System.out.println("RESULT SIZE: " + c.size());
System.out.println("Time: " + ((System.currentTimeMillis() - start) / 1000) + " seconds");
// remove test
final long start = System.currentTimeMillis();
final RowSet c = new RowSet(new Row("byte[] a-12, byte[] b-12", Base64Order.enhancedCoder, 0), 0);
byte[] key;
final int testsize = 5000;
final byte[][] delkeys = new byte[testsize / 5][];
// first pass: remember every fifth generated key as a key to delete
Random random = new Random(0);
for (int i = 0; i < testsize; i++) {
key = randomHash(random);
if (i % 5 != 0) continue;
delkeys[i / 5] = key;
// second pass: regenerate the same keys (same seed) and put them, removing the delete keys along the way
random = new Random(0);
for (int i = 0; i < testsize; i++) {
key = randomHash(random);
c.put(c.rowdef.newEntry(new byte[][]{key, key}));
if (i % 1000 == 0) {
for (int j = 0; j < delkeys.length; j++) c.remove(delkeys[j]);
for (int j = 0; j < delkeys.length; j++) c.remove(delkeys[j]);
// third pass: every key that was not marked for deletion must still be retrievable
random = new Random(0);
for (int i = 0; i < testsize; i++) {
key = randomHash(random);
if (i % 5 == 0) continue;
if (c.get(key) == null) System.out.println("missing entry " + new String(key));
System.out.println("RESULT SIZE: " + c.size());
System.out.println("Time: " + ((System.currentTimeMillis() - start) / 1000) + " seconds");
public static byte[] randomHash(final long r0, final long r1) {
// a long can have 64 bit, but a 12-byte hash can have 6 * 12 = 72 bits
// so we construct a generic Hash using two long values
return (Base64Order.enhancedCoder.encodeLong(Math.abs(r0), 11).substring(5) +
Base64Order.enhancedCoder.encodeLong(Math.abs(r1), 11).substring(5)).getBytes();
public static byte[] randomHash(final Random r) {
return randomHash(r.nextLong(), r.nextLong());
// Always returns null.
// NOTE(review): presumably because this set is not backed by a file — confirm against the ObjectIndex contract.
public String filename() {
return null;
// Intentionally a no-op.
public void deleteOnExit() {
// do nothing, there is no file