Mirror of https://github.com/yacy/yacy_search_server.git, synced 2024-09-19 00:01:41 +02:00

speed enhancement for reading of eco-table indexes

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4647 6c8d7289-2bf4-0310-a012-ef5d649a1542

parent e96ecd269f
commit 117ae78001

@@ -3,7 +3,7 @@ javacSource=1.5
 javacTarget=1.5

 # Release Configuration
-releaseVersion=0.577
+releaseVersion=0.578
 stdReleaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
 embReleaseFile=yacy_emb_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
 proReleaseFile=yacy_pro_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz

@@ -259,7 +259,7 @@ public class result {
             prop.put("excluded", "0");
         }

-        if (prop == null || prop.size() == 0) {
+        if (prop == null || prop.isEmpty()) {
             if (post.get("search", "").length() < 3) {
                 prop.put("num-results", "2"); // no results - at least 3 chars
             } else {
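
The size() == 0 → isEmpty() substitution above recurs in the kelondroRowCollection and kelondroSortStack hunks below. Both checks are equivalent for java.util collections; isEmpty() simply states the intent directly. A minimal sketch of the equivalence, assuming a plain TreeMap (not code from this commit):

import java.util.Map;
import java.util.TreeMap;

public class IsEmptyDemo {
    public static void main(String[] args) {
        Map<Long, byte[]> m = new TreeMap<Long, byte[]>();
        // equivalent checks; the second reads as a question, not an arithmetic test
        System.out.println(m.size() == 0); // true
        System.out.println(m.isEmpty());   // true
    }
}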

@@ -107,26 +107,7 @@ public class kelondroBufferedEcoFS {
     public synchronized void add(byte[] b, int start) throws IOException {
         put(size(), b, start);
     }
-    /*
-    public synchronized void clean(long index, byte[] b, int start) throws IOException {
-        assert b.length - start >= efs.recordsize;
-        if (index >= size()) throw new IndexOutOfBoundsException("kelondroBufferedEcoFS.clean(" + index + ") outside bounds (" + this.size() + ")");
-        byte[] bb = buffer.get(new Long(index));
-        if (bb == null) {
-            efs.clean(index, b, start);
-        } else {
-            System.arraycopy(bb, 0, b, start, efs.recordsize);
-            buffer.remove(new Long(index));
-            efs.clean(index);
-        }
-    }

-    public synchronized void clean(long index) throws IOException {
-        if (index >= size()) throw new IndexOutOfBoundsException("kelondroBufferedEcoFS.clean(" + index + ") outside bounds (" + this.size() + ")");
-        buffer.remove(new Long(index));
-        efs.clean(index);
-    }
-    */
     public synchronized void cleanLast(byte[] b, int start) throws IOException {
         assert b.length - start >= efs.recordsize;
         Long i = new Long(size() - 1);

@@ -24,11 +24,15 @@

 package de.anomic.kelondro;

+import java.io.BufferedInputStream;
 import java.io.File;
+import java.io.FileInputStream;
 import java.io.FileNotFoundException;
 import java.io.FileOutputStream;
 import java.io.IOException;
+import java.io.InputStream;
 import java.io.RandomAccessFile;
+import java.util.Iterator;

 /**
  * The EcoFS is a flat file with records of fixed length. The file does not contain

@@ -117,7 +121,7 @@ public class kelondroEcoFS {
         assert size % recordsize == 0;
         return size / (long) recordsize;
     }

     /**
      * @return the number of records in file plus number of records in buffer
      * @throws IOException

@@ -522,6 +526,62 @@ public class kelondroEcoFS {
         this.raf.setLength((long) (this.size() - 1) * (long) this.recordsize);
     }

+    public static class ChunkIterator implements Iterator<byte[]> {
+
+        private int recordsize, chunksize;
+        private InputStream stream;
+
+        /**
+         * create a ChunkIterator
+         * a ChunkIterator uses a BufferedInputStream to iterate through the file
+         * and is therefore a fast option to get all elements in the file as a sequence
+         * @param file: the eco-file
+         * @param recordsize: the size of the elements in the file
+         * @param chunksize: the size of the chunks that are returned by next(). remaining bytes until the length of recordsize are skipped
+         * @throws FileNotFoundException
+         */
+        public ChunkIterator(File file, int recordsize, int chunksize) throws FileNotFoundException {
+            assert (file.exists());
+            assert file.length() % recordsize == 0;
+            this.recordsize = recordsize;
+            this.chunksize = chunksize;
+            this.stream = new BufferedInputStream(new FileInputStream(file), 64 * 1024);
+        }
+
+        public boolean hasNext() {
+            try {
+                return stream != null && stream.available() > 0;
+            } catch (IOException e) {
+                return false;
+            }
+        }
+
+        public byte[] next() {
+            byte[] chunk = new byte[chunksize];
+            int r;
+            try {
+                // read the chunk
+                r = this.stream.read(chunk);
+                while (r < chunksize) {
+                    r += this.stream.read(chunk, r, chunksize - r);
+                }
+                // skip remaining bytes
+                while (r < recordsize) {
+                    r += this.stream.skip(recordsize - r);
+                }
+                return chunk;
+            } catch (IOException e) {
+                this.stream = null;
+                return null;
+            }
+        }
+
+        public void remove() {
+            throw new UnsupportedOperationException();
+        }
+
+    }
+
     /**
      * main - writes some data and checks the tables size (with time measuring)
      * @param args
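
The javadoc above states the rationale: ChunkIterator makes a single buffered sequential pass over the file instead of issuing one random access per record. A minimal usage sketch, assuming a hypothetical eco-file of 12-byte records (the path and sizes are illustrative, not from this commit):

import java.io.File;
import java.io.FileNotFoundException;
import java.util.Iterator;

public class ChunkIteratorDemo {
    public static void main(String[] args) throws FileNotFoundException {
        File ecoFile = new File("test.eco");  // hypothetical eco-file
        int recordsize = 12;                  // assumed fixed record length
        // chunksize == recordsize: next() returns whole records and skips nothing
        Iterator<byte[]> it = new kelondroEcoFS.ChunkIterator(ecoFile, recordsize, recordsize);
        while (it.hasNext()) {
            byte[] record = it.next();
            if (record == null) break;        // next() returns null after an I/O error
            // process the record ...
        }
    }
}

A chunksize smaller than recordsize returns only the leading bytes of each record; the keyIterator added to kelondroEcoTable below uses exactly that to stream primary keys without their tails.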

@@ -95,10 +95,10 @@ public class kelondroEcoTable implements kelondroIndex {

         try {
             // open an existing table file
-            this.file = new kelondroBufferedEcoFS(new kelondroEcoFS(tablefile, rowdef.objectsize), this.buffersize);
+            int fileSize = (int) tableSize(tablefile, rowdef.objectsize);

             // initialize index and copy table
-            int records = (int) Math.max(file.size(), initialSpace);
+            int records = (int) Math.max(fileSize, initialSpace);
             long neededRAM4table = ((long) records) * (((long) rowdef.objectsize) + 4L) * 3L;
             table = ((neededRAM4table < maxarraylength) &&
                     ((useTailCache == tailCacheForceUsage) ||

@@ -118,52 +118,69 @@ public class kelondroEcoTable implements kelondroIndex {
             System.out.println("*** DEBUG " + tablefile + ": EcoTable " + tablefile.toString() + " has table copy " + ((table == null) ? "DISABLED" : "ENABLED"));

             // read all elements from the file into the copy table
-            byte[] record = new byte[rowdef.objectsize];
-            byte[] key = new byte[rowdef.primaryKeyLength];
-            int fs = (int) file.size();
             System.out.print("*** initializing RAM index for EcoTable " + tablefile.getName() + ":");
-            for (int i = 0; i < fs; i++) {
-                // read entry
-                file.get(i, record, 0);
-
-                // write the key into the index table
-                System.arraycopy(record, 0, key, 0, rowdef.primaryKeyLength);
-                index.addi(key, i);
-
-                // write the tail into the table
-                if (table != null) table.addUnique(taildef.newEntry(record, rowdef.primaryKeyLength, true));
-
-                if ((i % 10000) == 0) {
-                    System.out.print('.');
-                    System.out.flush();
+            int i = 0;
+            byte[] key;
+            if (table == null) {
+                Iterator<byte[]> ki = keyIterator(tablefile, rowdef);
+                while (ki.hasNext()) {
+                    key = ki.next();
+
+                    // write the key into the index table
+                    assert key != null;
+                    if (key == null) {i++; continue;}
+                    index.addi(key, i++);
+
+                    if ((i % 10000) == 0) {
+                        System.out.print('.');
+                        System.out.flush();
+                    }
+                }
+            } else {
+                byte[] record;
+                key = new byte[rowdef.primaryKeyLength];
+                Iterator<byte[]> ri = new kelondroEcoFS.ChunkIterator(tablefile, rowdef.objectsize, rowdef.objectsize);
+                while (ri.hasNext()) {
+                    record = ri.next();
+                    assert record != null;
+                    if (record == null) {i++; continue;}
+                    System.arraycopy(record, 0, key, 0, rowdef.primaryKeyLength);
+
+                    // write the key into the index table
+                    index.addi(key, i++);
+
+                    // write the tail into the table
+                    table.addUnique(taildef.newEntry(record, rowdef.primaryKeyLength, true));
+
+                    if ((i % 10000) == 0) {
+                        System.out.print('.');
+                        System.out.flush();
+                    }
                 }
             }

+            // check consistency
             System.out.print(" -ordering- ..");
             System.out.flush();
-            // check consistency
+            this.file = new kelondroBufferedEcoFS(new kelondroEcoFS(tablefile, rowdef.objectsize), this.buffersize);
             ArrayList<Integer[]> doubles = index.removeDoubles();
             System.out.println(" -removed " + doubles.size() + " doubles- done.");
             if (doubles.size() > 0) {
                 System.out.println("DEBUG " + tablefile + ": WARNING - EcoTable " + tablefile + " has " + doubles.size() + " doubles");
                 // from all the doubles take one, put it back to the index and remove the others from the file
-                Iterator<Integer[]> i = doubles.iterator();
-                Integer[] ds;
                 // first put back one element each
-                while (i.hasNext()) {
-                    ds = i.next();
+                byte[] record = new byte[rowdef.objectsize];
+                key = new byte[rowdef.primaryKeyLength];
+                for (Integer[] ds: doubles) {
                     file.get(ds[0].longValue(), record, 0);
                     System.arraycopy(record, 0, key, 0, rowdef.primaryKeyLength);
                     index.addi(key, ds[0].intValue());
                 }
                 // then remove the other doubles by removing them from the table, but do a re-indexing while doing that
                 // first aggregate all the delete positions because the elements from the top positions must be removed first
-                i = doubles.iterator();
                 TreeSet<Integer> delpos = new TreeSet<Integer>();
-                while (i.hasNext()) {
-                    ds = i.next();
-                    for (int j = 1; j < ds.length; j++) {
-                        delpos.add(ds[j]);
-                    }
+                for (Integer[] ds: doubles) {
+                    for (int j = 1; j < ds.length; j++) delpos.add(ds[j]);
                 }
                 // now remove the entries in a sorted way (top-down)
                 Integer top;
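
This hunk is the heart of the speed enhancement: the old loop called file.get(i, record, 0) once per record, a positioned read through the random-access layer, while the new code streams the whole table once with a ChunkIterator and only opens the kelondroBufferedEcoFS afterwards. A self-contained sketch contrasting the two access patterns, assuming a hypothetical eco-file and record size (not code from this commit):

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.RandomAccessFile;

public class ScanComparison {
    public static void main(String[] args) throws IOException {
        File f = new File("test.eco");  // hypothetical eco-file
        int recordsize = 12;            // assumed record length
        long records = f.length() / recordsize;
        byte[] record = new byte[recordsize];

        // old pattern: one seek + read per record
        long t0 = System.currentTimeMillis();
        RandomAccessFile raf = new RandomAccessFile(f, "r");
        for (long i = 0; i < records; i++) {
            raf.seek(i * recordsize);
            raf.readFully(record);
        }
        raf.close();
        long t1 = System.currentTimeMillis();

        // new pattern: one buffered sequential scan, as ChunkIterator does
        BufferedInputStream in = new BufferedInputStream(new FileInputStream(f), 64 * 1024);
        for (long i = 0; i < records; i++) {
            int r = 0;
            while (r < recordsize) {
                int n = in.read(record, r, recordsize - r);
                if (n < 0) throw new IOException("unexpected end of file");
                r += n;
            }
        }
        in.close();
        long t2 = System.currentTimeMillis();

        System.out.println("per-record random access: " + (t1 - t0) + " ms");
        System.out.println("buffered sequential scan: " + (t2 - t1) + " ms");
    }
}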

@@ -191,6 +208,18 @@ public class kelondroEcoTable implements kelondroIndex {
         tableTracker.put(tablefile.toString(), this);
     }

+    /**
+     * a KeyIterator
+     * @param file: the eco-file
+     * @param rowdef: the row definition
+     * @throws FileNotFoundException
+     * @return an iterator for all keys in the file
+     */
+    public Iterator<byte[]> keyIterator(File file, kelondroRow rowdef) throws FileNotFoundException {
+        assert rowdef.primaryKeyIndex == 0;
+        return new kelondroEcoFS.ChunkIterator(file, rowdef.objectsize, rowdef.primaryKeyLength);
+    }
+
     public static long tableSize(File tablefile, int recordsize) {
         // returns number of records in table
         return kelondroEcoFS.tableSize(tablefile, recordsize);
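
keyIterator ties the pieces together: it hands ChunkIterator the full record length as recordsize but requests only primaryKeyLength bytes per chunk, so building the RAM index never copies the record tails it does not need. A usage sketch with assumed sizes (in YaCy they come from the kelondroRow definition; this is not code from the commit):

import java.io.File;
import java.io.FileNotFoundException;
import java.util.Iterator;

public class KeyScanDemo {
    public static void main(String[] args) throws FileNotFoundException {
        File table = new File("test.eco");  // hypothetical table file
        int objectsize = 12;                // assumed full record length
        int primaryKeyLength = 4;           // assumed leading key bytes
        Iterator<byte[]> keys = new kelondroEcoFS.ChunkIterator(table, objectsize, primaryKeyLength);
        int pos = 0;
        while (keys.hasNext()) {
            byte[] key = keys.next();
            if (key == null) break;         // null signals an I/O error
            // kelondroEcoTable would add the key to its RAM index at position pos here
            pos++;
        }
    }
}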

@@ -477,7 +477,7 @@ public class kelondroRowCollection {

     public synchronized void select(Set<String> keys) {
         // removes all entries but the ones given by urlselection
-        if ((keys == null) || (keys.size() == 0)) return;
+        if ((keys == null) || (keys.isEmpty())) return;
         Iterator<kelondroRow.Entry> i = rows();
         kelondroRow.Entry row;
         while (i.hasNext()) {

@@ -78,7 +78,7 @@ public class kelondroSortStack<E> {

     public synchronized stackElement top() {
         // returns the element that is currently on top of the stack
-        if (this.onstack.size() == 0) return null;
+        if (this.onstack.isEmpty()) return null;
         Long w = this.onstack.firstKey();
         E element = this.onstack.get(w);
         return new stackElement(element, w);

@@ -88,7 +88,7 @@ public class kelondroSortStack<E> {
         // returns the element that is currently on top of the stack
         // it is removed and added to the offstack list
         // this is exactly the same as element(offstack.size())
-        if (this.onstack.size() == 0) return null;
+        if (this.onstack.isEmpty()) return null;
         Long w = this.onstack.firstKey();
         E element = this.onstack.remove(w);
         stackElement se = new stackElement(element, w);

@@ -1643,7 +1643,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchboard>
          * a) the user has configured to use the htcache or
          * b) the content should be indexed
          * ========================================================================= */
-        if ((entry.profile().storeHTCache()) || (doIndexing && isSupportedContent)) {
+        if (((entry.profile() != null) && (entry.profile().storeHTCache())) || (doIndexing && isSupportedContent)) {
             // store response header
             if (entry.writeResourceInfo()) {
                 this.log.logInfo("WROTE HEADER for " + entry.cacheFile());