added kelondroRowCollection and kelondroRowSet which

shall replace kelondroCollection

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2220 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
orbiter 2006-06-20 14:17:21 +00:00
parent ed2cb040d1
commit 8f0c1a240a
3 changed files with 650 additions and 26 deletions

View File

@ -1,11 +1,13 @@
// kelondroCollection.java
// -----------------------
// part of The Kelondro Database
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2006
// created: 12.01.2006
// kelondroCollection.java
// (C) 2006 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany
// first published 12.01.2006 on http://www.anomic.de
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
@ -19,25 +21,6 @@
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this softare or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follows this copyright notice here, but changes must not be
// done inside the copyright notive above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
package de.anomic.kelondro;

View File

@ -0,0 +1,413 @@
// kelondroRowCollection.java
// (C) 2006 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany
// first published 12.01.2006 on http://www.anomic.de
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.kelondro;
import java.util.Iterator;
import java.util.Random;
public class kelondroRowCollection {
protected byte[] chunkcache;
protected int chunkcount;
protected long lastTimeRead, lastTimeWrote;
protected kelondroRow rowdef;
protected int sortBound;
protected kelondroOrder sortOrder;
protected int sortColumn;
public kelondroRowCollection(kelondroRow rowdef) {
this(rowdef, 0);
}
public kelondroRowCollection(kelondroRow rowdef, int objectCount) {
this.rowdef = rowdef;
this.chunkcache = new byte[objectCount * rowdef.objectsize()];
this.chunkcount = 0;
this.sortColumn = 0;
this.sortOrder = null;
this.sortBound = 0;
}
public kelondroRowCollection(kelondroRow rowdef, int objectCount, byte[] cache) {
this.rowdef = rowdef;
this.chunkcache = cache;
this.chunkcount = objectCount;
this.sortColumn = 0;
this.sortOrder = null;
this.sortBound = 0;
}
private void ensureSize(int elements) {
int needed = elements * rowdef.objectsize();
if (chunkcache.length >= needed) return;
byte[] newChunkcache = new byte[needed * 2];
System.arraycopy(chunkcache, 0, newChunkcache, 0, chunkcache.length);
chunkcache = newChunkcache;
newChunkcache = null;
}
public void trim() {
synchronized (chunkcache) {
int needed = chunkcount * rowdef.objectsize();
if (chunkcache.length == needed) return;
byte[] newChunkcache = new byte[needed];
System.arraycopy(chunkcache, 0, newChunkcache, 0, newChunkcache.length);
chunkcache = newChunkcache;
newChunkcache = null;
}
}
public long lastRead() {
return lastTimeRead;
}
public long lastWrote() {
return lastTimeWrote;
}
public kelondroRow.Entry get(int index) {
assert (index < chunkcount);
byte[] a = new byte[rowdef.objectsize()];
synchronized (chunkcache) {
System.arraycopy(chunkcache, index * rowdef.objectsize(), a, 0, rowdef.objectsize());
}
return rowdef.newEntry(a);
}
public void set(int index, kelondroRow.Entry a) {
set(index, a.bytes(), 0, a.bytes().length);
}
public void set(int index, byte[] a, int astart, int alength) {
assert (index < this.chunkcount);
int l = Math.min(rowdef.objectsize(), Math.min(alength, a.length - astart));
synchronized (chunkcache) {
System.arraycopy(a, astart, chunkcache, index * rowdef.objectsize(), l);
}
this.lastTimeWrote = System.currentTimeMillis();
}
public void add(kelondroRow.Entry a) {
add(a.bytes(), 0, a.bytes().length);
}
public void add(byte[] a, int astart, int alength) {
int l = Math.min(rowdef.objectsize(), Math.min(alength, a.length - astart));
synchronized (chunkcache) {
ensureSize(chunkcount + 1);
System.arraycopy(a, 0, chunkcache, rowdef.objectsize() * chunkcount, l);
chunkcount++;
}
this.lastTimeWrote = System.currentTimeMillis();
}
public void addAll(kelondroRowCollection c) {
assert(rowdef.objectsize() >= c.rowdef.objectsize());
synchronized(chunkcache) {
ensureSize(chunkcount + c.size());
}
Iterator i = c.elements();
byte[] b;
while (i.hasNext()) {
b = (byte[]) i.next();
add(b, 0, b.length);
}
}
public void remove(int p) {
assert ((p >= 0) && (p < chunkcount) && (chunkcount > 0));
synchronized (chunkcache) {
System.arraycopy(chunkcache, (p + 1) * rowdef.objectsize(), chunkcache, p * rowdef.objectsize(), (chunkcount - p - 1) * rowdef.objectsize());
chunkcount--;
if (p < sortBound) sortBound--;
}
this.lastTimeWrote = System.currentTimeMillis();
}
public void removeOne() {
if (chunkcount == 0) return;
if (chunkcount == sortBound) sortBound--;
chunkcount--;
this.lastTimeWrote = System.currentTimeMillis();
}
public void clear() {
this.chunkcache = new byte[0];
this.chunkcount = 0;
this.sortBound = 0;
}
public int size() {
return chunkcount;
}
public Iterator elements() { // iterates byte[] - objects
return new chunkIterator();
}
public class chunkIterator implements Iterator {
int c = 0;
public chunkIterator() {
c = 0;
}
public boolean hasNext() {
return c < chunkcount;
}
public Object next() {
byte[] chunk = new byte[rowdef.objectsize()];
System.arraycopy(chunkcache, c * rowdef.objectsize(), chunk, 0, rowdef.objectsize());
c++;
return chunk;
}
public void remove() {
c--;
System.arraycopy(chunkcache, (c + 1) * rowdef.objectsize(), chunkcache, c * rowdef.objectsize(), (chunkcount - c - 1) * rowdef.objectsize());
chunkcount--;
}
}
public void sort(kelondroOrder newOrder, int newColumn) {
if ((this.sortOrder == null) ||
(!(this.sortOrder.signature().equals(newOrder.signature()))) ||
(newColumn != this.sortColumn)) {
this.sortOrder = newOrder;
this.sortBound = 0;
this.sortColumn = newColumn;
}
sort();
}
public void sort() {
assert (this.sortOrder != null);
if (this.sortBound == this.chunkcount) return; // this is already sorted
//System.out.println("SORT(chunkcount=" + this.chunkcount + ", sortbound=" + this.sortbound + ")");
if (this.sortBound > 1) {
qsort(0, this.sortBound, this.chunkcount);
} else {
qsort(0, this.chunkcount);
}
this.sortBound = this.chunkcount;
}
private void qsort(int L, int S, int R) {
//System.out.println("QSORT: chunkcache.length=" + chunkcache.length + ", chunksize=" + chunksize + ", L=" + L + ", S=" + S + ", R=" + R);
assert (S <= R);
if (L >= R - 1) return;
if (S >= R) return;
if (R - L < 20) {
isort(L, R);
return;
}
int p = L + ((S - L) / 2);
int ps = p;
int q = S;
int qs = q;
int pivot = p;
while (q < R) {
if (compare(pivot, q) < 1) {
q++;
} else {
pivot = swap(p, q, pivot);
p++;
q++;
}
}
if ((ps - L) <= ((p - L) / 2)) qsort(L, p); else qsort(L, ps, p);
if ((qs - p) <= ((R - p) / 2)) qsort(p, R); else qsort(p, qs, R);
}
private void qsort(int L, int R) {
//System.out.println("QSORT: chunkcache.length=" + chunkcache.length + ", L=" + L + "/" + new String(this.chunkcache, L * this.rowdef.objectsize(), this.rowdef.width(0)) + ", R=" + R + "/" + new String(this.chunkcache, (R - 1) * this.rowdef.objectsize(), this.rowdef.width(0)));
/*
if ((L == 190) && (R == 258)) {
for (int i = L; i < R; i++) {
System.out.print(new String(this.chunkcache, L * this.chunksize, this.chunksize) + ", ");
}
System.out.println();
}
*/
if (L >= R - 1) return;
if (R - L < 20) {
isort(L, R);
return;
}
int i = L;
int j = R - 1;
int pivot = (i + j) / 2;
//System.out.println("Pivot=" + pivot + "/" + new String(this.chunkcache, pivot * this.rowdef.objectsize(), this.rowdef.width(0)));
while (i <= j) {
while (compare(pivot, i) == 1) i++; // chunkAt[i] < keybuffer
while (compare(pivot, j) == -1) j--; // chunkAt[j] > keybuffer
//if (L == 6693) System.out.println(i + ", " + j);
if (i <= j) {
pivot = swap(i, j, pivot);
i++;
j--;
}
}
//if (L == 6693) System.out.println(i);
qsort(L, i);
qsort(i, R);
}
private void isort(int L, int R) {
for (int i = L + 1; i < R; i++)
for (int j = i; j > L && compare(j - 1, j) > 0; j--)
swap(j, j - 1, 0);
}
private int swap(int i, int j, int p) {
if (i == j) return p;
if (this.chunkcount * this.rowdef.objectsize() < this.chunkcache.length) {
// there is space in the chunkcache that we can use as buffer
System.arraycopy(chunkcache, this.rowdef.objectsize() * i, chunkcache, chunkcache.length - this.rowdef.objectsize(), this.rowdef.objectsize());
System.arraycopy(chunkcache, this.rowdef.objectsize() *j , chunkcache, this.rowdef.objectsize() * i, this.rowdef.objectsize());
System.arraycopy(chunkcache, chunkcache.length - this.rowdef.objectsize(), chunkcache, this.rowdef.objectsize() * j, this.rowdef.objectsize());
} else {
// allocate a chunk to use as buffer
byte[] a = new byte[this.rowdef.objectsize()];
System.arraycopy(chunkcache, this.rowdef.objectsize() * i, a, 0, this.rowdef.objectsize());
System.arraycopy(chunkcache, this.rowdef.objectsize() * j , chunkcache, this.rowdef.objectsize() * i, this.rowdef.objectsize());
System.arraycopy(a, 0, chunkcache, this.rowdef.objectsize() * j, this.rowdef.objectsize());
}
if (i == p) return j; else if (j == p) return i; else return p;
}
public void uniq() {
assert (this.sortOrder != null);
// removes double-occurrences of chunks
// this works only if the collection was ordered with sort before
synchronized (chunkcache) {
if (chunkcount <= 1) return;
int i = 0;
while (i < chunkcount - 1) {
if (compare(i, i + 1) == 0) {
//System.out.println("DOUBLE: " + new String(this.chunkcache, this.chunksize * i, this.chunksize));
remove(i);
} else {
i++;
}
}
}
}
public String toString() {
StringBuffer s = new StringBuffer();
Iterator i = elements();
if (i.hasNext()) s.append(new String((byte[]) i.next()).trim());
while (i.hasNext()) s.append(", " + new String((byte[]) i.next()).trim());
return new String(s);
}
public byte[] toByteArray() {
return this.chunkcache;
}
private int compare(int i, int j) {
assert (i < chunkcount);
assert (j < chunkcount);
if (i == j) return 0;
int keylength = this.rowdef.width(this.sortColumn);
int colstart = this.rowdef.colstart[this.sortColumn];
int c = this.sortOrder.compare(
chunkcache,
i * this.rowdef.objectsize() + colstart,
keylength,
chunkcache,
j * this.rowdef.objectsize() + colstart,
keylength);
/*
System.out.println("COMPARE(" +
new String(this.chunkcache, i * this.rowdef.objectsize(), this.rowdef.width(0)) +
", " +
new String(this.chunkcache, j * this.rowdef.objectsize(), this.rowdef.width(0)) +
")=" + c);
*/
return c;
}
public static void main(String[] args) {
String[] test = { "eins", "zwei", "drei", "vier", "fuenf", "sechs", "sieben", "acht", "neun", "zehn" };
kelondroRowCollection c = new kelondroRowCollection(new kelondroRow(new int[]{10, 3}));
for (int i = 0; i < test.length; i++) c.add(test[i].getBytes(), 0, 10);
for (int i = 0; i < test.length; i++) c.add(test[i].getBytes(), 0, 10);
c.sort(kelondroNaturalOrder.naturalOrder, 0);
c.remove(1);
Iterator i = c.elements();
String s;
System.out.print("INPUT-ITERATOR: ");
while (i.hasNext()) {
s = new String((byte[]) i.next()).trim();
System.out.print(s + ", ");
if (s.equals("drei")) i.remove();
}
System.out.println("");
System.out.println("INPUT-TOSTRING: " + c.toString());
c.sort();
System.out.println("SORTED : " + c.toString());
c.uniq();
System.out.println("UNIQ : " + c.toString());
c.trim();
System.out.println("TRIM : " + c.toString());
// second test
c = new kelondroRowCollection(new kelondroRow(new int[]{10, 3}));
Random rand = new Random(0);
long start = System.currentTimeMillis();
long t;
String w;
for (long k = 0; k < 60000; k++) {
t = System.currentTimeMillis();
w = "a" + Long.toString(rand.nextLong());
c.add(w.getBytes(), 0, 10);
if (k % 10000 == 0)
System.out.println("added " + k + " entries in " +
((t - start) / 1000) + " seconds, " +
(((t - start) > 1000) ? (k / ((t - start) / 1000)) : k) +
" entries/second, size = " + c.size());
}
System.out.println("bevore sort: " + ((System.currentTimeMillis() - start) / 1000) + " seconds");
c.sort(kelondroNaturalOrder.naturalOrder, 0);
System.out.println("after sort: " + ((System.currentTimeMillis() - start) / 1000) + " seconds");
c.uniq();
System.out.println("after uniq: " + ((System.currentTimeMillis() - start) / 1000) + " seconds");
System.out.println("RESULT SIZE: " + c.size());
System.out.println();
System.out.println("RESULT SIZE: " + c.size());
}
}

View File

@ -0,0 +1,228 @@
// kelondroRowSet.java
// (C) 2006 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany
// first published 20.06.2006 on http://www.anomic.de
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.kelondro;
import java.util.Iterator;
import java.util.Random;
public class kelondroRowSet extends kelondroRowCollection {
public kelondroRowSet(kelondroRow rowdef) {
super(rowdef);
}
public kelondroRowSet(kelondroRow rowdef, int objectCount) {
super(rowdef, objectCount);
}
public kelondroRow.Entry get(byte[] key) {
return get(key, 0, key.length);
}
public kelondroRow.Entry get(byte[] key, int astart, int alength) {
synchronized (chunkcache) {
int i = find(key, astart, alength);
if (i >= 0) return get(i);
}
return null;
}
public kelondroRow.Entry remove(byte[] a) {
return remove(a, 0, a.length);
}
public kelondroRow.Entry remove(byte[] a, int astart, int alength) {
// the byte[] a may be shorter than the chunksize
if (chunkcount == 0) return null;
kelondroRow.Entry b = null;
synchronized(chunkcache) {
int p = find(a, astart, alength);
if (p < 0) return null;
b = get(p);
remove(p);
}
return b;
}
public void removeAll(kelondroRowCollection c) {
Iterator i = c.elements();
byte[] b;
while (i.hasNext()) {
b = (byte[]) i.next();
remove(b, 0, b.length);
}
}
public kelondroOrder getOrdering() {
return this.sortOrder;
}
public void setOrdering(kelondroOrder newOrder, int newColumn) {
if ((this.sortOrder == null) ||
(!(this.sortOrder.signature().equals(newOrder.signature()))) ||
(newColumn != this.sortColumn)) {
this.sortOrder = newOrder;
this.sortBound = 0;
this.sortColumn = newColumn;
}
}
protected int find(byte[] a, int astart, int alength) {
// returns the chunknumber; -1 if not found
if (this.sortOrder == null) return iterativeSearch(a, astart, alength);
// check if a re-sorting make sense
if ((this.chunkcount - this.sortBound) > 90) sort();
// first try to find in sorted area
int p = binarySearch(a, astart, alength);
if (p >= 0) return p;
// then find in unsorted area
return iterativeSearch(a, astart, alength);
}
private int iterativeSearch(byte[] key, int astart, int alength) {
// returns the chunknumber
if (this.sortOrder == null) {
for (int i = this.sortBound; i < this.chunkcount; i++) {
if (match(key, astart, alength, i)) return i;
}
return -1;
} else {
for (int i = this.sortBound; i < this.chunkcount; i++) {
if (compare(key, astart, alength, i) == 0) return i;
}
return -1;
}
}
private int binarySearch(byte[] key, int astart, int alength) {
assert (this.sortOrder != null);
int l = 0;
int rbound = this.sortBound;
int p = 0;
int d;
while (l < rbound) {
p = l + ((rbound - l) >> 1);
d = compare(key, astart, alength, p);
if (d == 0) return p;
else if (d < 0) rbound = p;
else l = p + 1;
}
return -1;
}
private int compare(byte[] a, int astart, int alength, int chunknumber) {
assert (chunknumber < chunkcount);
int l = Math.min(this.rowdef.width(0), Math.min(a.length - astart, alength));
return this.sortOrder.compare(a, astart, l, chunkcache, chunknumber * this.rowdef.objectsize(), l);
}
private boolean match(byte[] a, int astart, int alength, int chunknumber) {
if (chunknumber >= chunkcount) return false;
int i = 0;
int p = chunknumber * this.rowdef.objectsize();
final int len = Math.min(this.rowdef.width(0), Math.min(alength, a.length - astart));
while (i < len) if (a[astart + i++] != chunkcache[p++]) return false;
return true;
}
public static void main(String[] args) {
String[] test = { "eins", "zwei", "drei", "vier", "fuenf", "sechs", "sieben", "acht", "neun", "zehn" };
kelondroRowSet c = new kelondroRowSet(new kelondroRow(new int[]{10, 3}));
c.setOrdering(kelondroNaturalOrder.naturalOrder, 0);
for (int i = 0; i < test.length; i++) c.add(test[i].getBytes(), 0, 10);
for (int i = 0; i < test.length; i++) c.add(test[i].getBytes(), 0, 10);
c.sort();
c.remove("fuenf".getBytes(), 0, 5);
Iterator i = c.elements();
String s;
System.out.print("INPUT-ITERATOR: ");
while (i.hasNext()) {
s = new String((byte[]) i.next()).trim();
System.out.print(s + ", ");
if (s.equals("drei")) i.remove();
}
System.out.println("");
System.out.println("INPUT-TOSTRING: " + c.toString());
c.sort();
System.out.println("SORTED : " + c.toString());
c.uniq();
System.out.println("UNIQ : " + c.toString());
c.trim();
System.out.println("TRIM : " + c.toString());
// second test
c = new kelondroRowSet(new kelondroRow(new int[]{10, 3}));
c.setOrdering(kelondroNaturalOrder.naturalOrder, 0);
Random rand = new Random(0);
long start = System.currentTimeMillis();
long t, d = 0;
String w;
for (long k = 0; k < 60000; k++) {
t = System.currentTimeMillis();
w = "a" + Long.toString(rand.nextLong());
c.add(w.getBytes(), 0, 10);
if (k % 10000 == 0)
System.out.println("added " + k + " entries in " +
((t - start) / 1000) + " seconds, " +
(((t - start) > 1000) ? (k / ((t - start) / 1000)) : k) +
" entries/second, size = " + c.size());
}
System.out.println("bevore sort: " + ((System.currentTimeMillis() - start) / 1000) + " seconds");
c.sort();
System.out.println("after sort: " + ((System.currentTimeMillis() - start) / 1000) + " seconds");
c.uniq();
System.out.println("after uniq: " + ((System.currentTimeMillis() - start) / 1000) + " seconds");
System.out.println("RESULT SIZE: " + c.size());
System.out.println();
// third test
c = new kelondroRowSet(new kelondroRow(new int[]{10, 3}), 60000);
c.setOrdering(kelondroNaturalOrder.naturalOrder, 0);
rand = new Random(0);
start = System.currentTimeMillis();
d = 0;
for (long k = 0; k < 60000; k++) {
t = System.currentTimeMillis();
w = "a" + Long.toString(rand.nextLong());
if (c.get(w.getBytes(), 0, 10) == null) c.add(w.getBytes(), 0, 10); else d++;
if (k % 10000 == 0)
System.out.println("added " + k + " entries in " +
((t - start) / 1000) + " seconds, " +
(((t - start) > 1000) ? (k / ((t - start) / 1000)) : k) +
" entries/second, " + d + " double, size = " + c.size() +
", sum = " + (c.size() + d));
}
System.out.println("RESULT SIZE: " + c.size());
}
}