speed enhancement for reading of eco-table indexes

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4647 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 2008-04-06 11:50:15 +00:00
parent e96ecd269f
commit 117ae78001
8 changed files with 126 additions and 56 deletions


@@ -3,7 +3,7 @@ javacSource=1.5
 javacTarget=1.5
 # Release Configuration
-releaseVersion=0.577
+releaseVersion=0.578
 stdReleaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
 embReleaseFile=yacy_emb_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
 proReleaseFile=yacy_pro_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz


@@ -259,7 +259,7 @@ public class result {
             prop.put("excluded", "0");
         }
-        if (prop == null || prop.size() == 0) {
+        if (prop == null || prop.isEmpty()) {
             if (post.get("search", "").length() < 3) {
                 prop.put("num-results", "2"); // no results - at least 3 chars
             } else {


@@ -107,26 +107,7 @@ public class kelondroBufferedEcoFS {
     public synchronized void add(byte[] b, int start) throws IOException {
         put(size(), b, start);
     }
 
-    /*
-    public synchronized void clean(long index, byte[] b, int start) throws IOException {
-        assert b.length - start >= efs.recordsize;
-        if (index >= size()) throw new IndexOutOfBoundsException("kelondroBufferedEcoFS.clean(" + index + ") outside bounds (" + this.size() + ")");
-        byte[] bb = buffer.get(new Long(index));
-        if (bb == null) {
-            efs.clean(index, b, start);
-        } else {
-            System.arraycopy(bb, 0, b, start, efs.recordsize);
-            buffer.remove(new Long(index));
-            efs.clean(index);
-        }
-    }
-    public synchronized void clean(long index) throws IOException {
-        if (index >= size()) throw new IndexOutOfBoundsException("kelondroBufferedEcoFS.clean(" + index + ") outside bounds (" + this.size() + ")");
-        buffer.remove(new Long(index));
-        efs.clean(index);
-    }
-    */
     public synchronized void cleanLast(byte[] b, int start) throws IOException {
         assert b.length - start >= efs.recordsize;
         Long i = new Long(size() - 1);


@@ -24,11 +24,15 @@
 package de.anomic.kelondro;
 
+import java.io.BufferedInputStream;
 import java.io.File;
+import java.io.FileInputStream;
 import java.io.FileNotFoundException;
 import java.io.FileOutputStream;
 import java.io.IOException;
+import java.io.InputStream;
 import java.io.RandomAccessFile;
+import java.util.Iterator;
 
 /**
  * The EcoFS is a flat file with records of fixed length. The file does not contain
@@ -522,6 +526,62 @@ public class kelondroEcoFS {
         this.raf.setLength((long) (this.size() - 1) * (long) this.recordsize);
     }
 
+    public static class ChunkIterator implements Iterator<byte[]> {
+
+        private int recordsize, chunksize;
+        private InputStream stream;
+
+        /**
+         * create a ChunkIterator
+         * a ChunkIterator uses a BufferedInputStream to iterate through the file
+         * and is therefore a fast option to get all elements in the file as a sequence
+         * @param file: the eco-file
+         * @param recordsize: the size of the elements in the file
+         * @param chunksize: the size of the chunks that are returned by next(). remaining bytes up to the length of recordsize are skipped
+         * @throws FileNotFoundException
+         */
+        public ChunkIterator(File file, int recordsize, int chunksize) throws FileNotFoundException {
+            assert (file.exists());
+            assert file.length() % recordsize == 0;
+            this.recordsize = recordsize;
+            this.chunksize = chunksize;
+            this.stream = new BufferedInputStream(new FileInputStream(file), 64 * 1024);
+        }
+
+        public boolean hasNext() {
+            try {
+                return stream != null && stream.available() > 0;
+            } catch (IOException e) {
+                return false;
+            }
+        }
+
+        public byte[] next() {
+            byte[] chunk = new byte[chunksize];
+            int r;
+            try {
+                // read the chunk
+                r = this.stream.read(chunk);
+                while (r < chunksize) {
+                    r += this.stream.read(chunk, r, chunksize - r);
+                }
+                // skip remaining bytes
+                while (r < recordsize) {
+                    r += this.stream.skip(recordsize - r);
+                }
+                return chunk;
+            } catch (IOException e) {
+                this.stream = null;
+                return null;
+            }
+        }
+
+        public void remove() {
+            throw new UnsupportedOperationException();
+        }
+
+    }
+
     /**
      * main - writes some data and checks the tables size (with time measuring)
      * @param args
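A minimal usage sketch of the new ChunkIterator, reading whole records sequentially. This is not part of the commit; the file name, record size, and key length are made-up example values, and the import of the YaCy class is an assumption about the surrounding project layout.

import java.io.File;
import java.io.FileNotFoundException;
import java.util.Iterator;
import de.anomic.kelondro.kelondroEcoFS;

public class ChunkIteratorSketch {
    public static void main(String[] args) throws FileNotFoundException {
        File ecoFile = new File("example.table"); // hypothetical eco-file
        int recordsize = 12;                      // assumed fixed record length

        // chunksize == recordsize: every full record is returned, nothing is skipped
        Iterator<byte[]> records = new kelondroEcoFS.ChunkIterator(ecoFile, recordsize, recordsize);
        while (records.hasNext()) {
            byte[] record = records.next();
            if (record == null) break;            // next() returns null after an I/O error
            // process the record, e.g. print its key prefix (assumed 4-byte key)
            System.out.println(new String(record, 0, 4));
        }
    }
}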


@@ -95,10 +95,10 @@ public class kelondroEcoTable implements kelondroIndex {
         try {
             // open an existing table file
-            this.file = new kelondroBufferedEcoFS(new kelondroEcoFS(tablefile, rowdef.objectsize), this.buffersize);
+            int fileSize = (int) tableSize(tablefile, rowdef.objectsize);
 
             // initialize index and copy table
-            int records = (int) Math.max(file.size(), initialSpace);
+            int records = (int) Math.max(fileSize, initialSpace);
             long neededRAM4table = ((long) records) * (((long) rowdef.objectsize) + 4L) * 3L;
             table = ((neededRAM4table < maxarraylength) &&
                     ((useTailCache == tailCacheForceUsage) ||
@@ -118,52 +118,69 @@ public class kelondroEcoTable implements kelondroIndex {
             System.out.println("*** DEBUG " + tablefile + ": EcoTable " + tablefile.toString() + " has table copy " + ((table == null) ? "DISABLED" : "ENABLED"));
 
             // read all elements from the file into the copy table
-            byte[] record = new byte[rowdef.objectsize];
-            byte[] key = new byte[rowdef.primaryKeyLength];
-            int fs = (int) file.size();
             System.out.print("*** initializing RAM index for EcoTable " + tablefile.getName() + ":");
-            for (int i = 0; i < fs; i++) {
-                // read entry
-                file.get(i, record, 0);
-                // write the key into the index table
-                System.arraycopy(record, 0, key, 0, rowdef.primaryKeyLength);
-                index.addi(key, i);
-                // write the tail into the table
-                if (table != null) table.addUnique(taildef.newEntry(record, rowdef.primaryKeyLength, true));
-                if ((i % 10000) == 0) {
-                    System.out.print('.');
-                    System.out.flush();
+            int i = 0;
+            byte[] key;
+            if (table == null) {
+                Iterator<byte[]> ki = keyIterator(tablefile, rowdef);
+                while (ki.hasNext()) {
+                    key = ki.next();
+                    // write the key into the index table
+                    assert key != null;
+                    if (key == null) {i++; continue;}
+                    index.addi(key, i++);
+                    if ((i % 10000) == 0) {
+                        System.out.print('.');
+                        System.out.flush();
+                    }
+                }
+            } else {
+                byte[] record;
+                key = new byte[rowdef.primaryKeyLength];
+                Iterator<byte[]> ri = new kelondroEcoFS.ChunkIterator(tablefile, rowdef.objectsize, rowdef.objectsize);
+                while (ri.hasNext()) {
+                    record = ri.next();
+                    assert record != null;
+                    if (record == null) {i++; continue;}
+                    System.arraycopy(record, 0, key, 0, rowdef.primaryKeyLength);
+                    // write the key into the index table
+                    index.addi(key, i++);
+                    // write the tail into the table
+                    table.addUnique(taildef.newEntry(record, rowdef.primaryKeyLength, true));
+                    if ((i % 10000) == 0) {
+                        System.out.print('.');
+                        System.out.flush();
+                    }
                 }
             }
 
+            // check consistency
             System.out.print(" -ordering- ..");
             System.out.flush();
-            // check consistency
+            this.file = new kelondroBufferedEcoFS(new kelondroEcoFS(tablefile, rowdef.objectsize), this.buffersize);
             ArrayList<Integer[]> doubles = index.removeDoubles();
             System.out.println(" -removed " + doubles.size() + " doubles- done.");
             if (doubles.size() > 0) {
                 System.out.println("DEBUG " + tablefile + ": WARNING - EcoTable " + tablefile + " has " + doubles.size() + " doubles");
                 // from all the doubles take one, put it back to the index and remove the others from the file
-                Iterator<Integer[]> i = doubles.iterator();
-                Integer[] ds;
                 // first put back one element each
-                while (i.hasNext()) {
-                    ds = i.next();
+                byte[] record = new byte[rowdef.objectsize];
+                key = new byte[rowdef.primaryKeyLength];
+                for (Integer[] ds: doubles) {
                     file.get(ds[0].longValue(), record, 0);
                     System.arraycopy(record, 0, key, 0, rowdef.primaryKeyLength);
                     index.addi(key, ds[0].intValue());
                 }
                 // then remove the other doubles by removing them from the table, but do a re-indexing while doing that
                 // first aggregate all the delete positions because the elements from the top positions must be removed first
-                i = doubles.iterator();
                 TreeSet<Integer> delpos = new TreeSet<Integer>();
-                while (i.hasNext()) {
-                    ds = i.next();
-                    for (int j = 1; j < ds.length; j++) {
-                        delpos.add(ds[j]);
-                    }
+                for (Integer[] ds: doubles) {
+                    for (int j = 1; j < ds.length; j++) delpos.add(ds[j]);
                 }
                 // now remove the entries in a sorted way (top-down)
                 Integer top;
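The gain of this rewrite is that the RAM index is now filled from one buffered sequential pass over the table file instead of one random-access read per record. The following self-contained sketch illustrates that difference with plain java.io only; it uses no YaCy classes, and the record count and record size are made-up example values.

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.RandomAccessFile;

public class SequentialVsRandomRead {
    public static void main(String[] args) throws IOException {
        final int recordsize = 12, records = 200000;
        File f = File.createTempFile("ecodemo", ".table");
        f.deleteOnExit();
        FileOutputStream out = new FileOutputStream(f);
        out.write(new byte[recordsize * records]); // write empty fixed-size records
        out.close();

        // old style: one seek + read per record
        long t0 = System.currentTimeMillis();
        RandomAccessFile raf = new RandomAccessFile(f, "r");
        byte[] record = new byte[recordsize];
        for (int i = 0; i < records; i++) {
            raf.seek((long) i * recordsize);
            raf.readFully(record);
        }
        raf.close();
        long t1 = System.currentTimeMillis();

        // new style: stream all records sequentially through a 64 KB buffer
        BufferedInputStream in = new BufferedInputStream(new FileInputStream(f), 64 * 1024);
        for (int i = 0; i < records; i++) {
            int r = 0;
            while (r < recordsize) {
                int n = in.read(record, r, recordsize - r);
                if (n < 0) break; // EOF safeguard
                r += n;
            }
        }
        in.close();
        long t2 = System.currentTimeMillis();

        System.out.println("random access: " + (t1 - t0) + " ms, sequential: " + (t2 - t1) + " ms");
    }
}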
@@ -191,6 +208,18 @@ public class kelondroEcoTable implements kelondroIndex {
         tableTracker.put(tablefile.toString(), this);
     }
 
+    /**
+     * a KeyIterator
+     * @param file: the eco-file
+     * @param rowdef: the row definition
+     * @throws FileNotFoundException
+     * @return an iterator for all keys in the file
+     */
+    public Iterator<byte[]> keyIterator(File file, kelondroRow rowdef) throws FileNotFoundException {
+        assert rowdef.primaryKeyIndex == 0;
+        return new kelondroEcoFS.ChunkIterator(file, rowdef.objectsize, rowdef.primaryKeyLength);
+    }
+
     public static long tableSize(File tablefile, int recordsize) {
         // returns number of records in table
         return kelondroEcoFS.tableSize(tablefile, recordsize);
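What keyIterator delivers can be shown without constructing a kelondroRow: it is simply a ChunkIterator whose chunk size equals the primary key length, so only the key prefix of each record is read and returned. A hedged sketch follows; the file name, record size, and key length are assumptions, not values from the commit.

import java.io.File;
import java.io.FileNotFoundException;
import java.util.Iterator;
import de.anomic.kelondro.kelondroEcoFS;

public class KeyIteratorSketch {
    public static void main(String[] args) throws FileNotFoundException {
        File table = new File("example.table");   // hypothetical eco-file
        int objectsize = 12;                      // assumed rowdef.objectsize
        int primaryKeyLength = 4;                 // assumed rowdef.primaryKeyLength

        // equivalent to keyIterator(table, rowdef): only the key prefix of each record is returned
        Iterator<byte[]> keys = new kelondroEcoFS.ChunkIterator(table, objectsize, primaryKeyLength);
        int pos = 0;
        while (keys.hasNext()) {
            byte[] key = keys.next();
            if (key == null) break;               // I/O error inside the iterator
            System.out.println(pos++ + " -> " + new String(key));
        }
    }
}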


@@ -477,7 +477,7 @@ public class kelondroRowCollection {
     public synchronized void select(Set<String> keys) {
         // removes all entries but the ones given by urlselection
-        if ((keys == null) || (keys.size() == 0)) return;
+        if ((keys == null) || (keys.isEmpty())) return;
         Iterator<kelondroRow.Entry> i = rows();
         kelondroRow.Entry row;
         while (i.hasNext()) {


@@ -78,7 +78,7 @@ public class kelondroSortStack<E> {
     public synchronized stackElement top() {
         // returns the element that is currently on top of the stack
-        if (this.onstack.size() == 0) return null;
+        if (this.onstack.isEmpty()) return null;
         Long w = this.onstack.firstKey();
         E element = this.onstack.get(w);
         return new stackElement(element, w);
@@ -88,7 +88,7 @@ public class kelondroSortStack<E> {
         // returns the element that is currently on top of the stack
         // it is removed and added to the offstack list
         // this is exactly the same as element(offstack.size())
-        if (this.onstack.size() == 0) return null;
+        if (this.onstack.isEmpty()) return null;
         Long w = this.onstack.firstKey();
         E element = this.onstack.remove(w);
         stackElement se = new stackElement(element, w);


@@ -1643,7 +1643,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
          * a) the user has configured to use the htcache or
          * b) the content should be indexed
          * ========================================================================= */
-        if ((entry.profile().storeHTCache()) || (doIndexing && isSupportedContent)) {
+        if (((entry.profile() != null) && (entry.profile().storeHTCache())) || (doIndexing && isSupportedContent)) {
             // store response header
             if (entry.writeResourceInfo()) {
                 this.log.logInfo("WROTE HEADER for " + entry.cacheFile());