speed enhancement for reading of eco-table indexes

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4647 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 2008-04-06 11:50:15 +00:00
parent e96ecd269f
commit 117ae78001
8 changed files with 126 additions and 56 deletions


@@ -3,7 +3,7 @@ javacSource=1.5
 javacTarget=1.5
 # Release Configuration
-releaseVersion=0.577
+releaseVersion=0.578
 stdReleaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
 embReleaseFile=yacy_emb_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
 proReleaseFile=yacy_pro_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz


@@ -259,7 +259,7 @@ public class result {
             prop.put("excluded", "0");
         }
-        if (prop == null || prop.size() == 0) {
+        if (prop == null || prop.isEmpty()) {
             if (post.get("search", "").length() < 3) {
                 prop.put("num-results", "2"); // no results - at least 3 chars
             } else {


@@ -107,26 +107,7 @@ public class kelondroBufferedEcoFS {
     public synchronized void add(byte[] b, int start) throws IOException {
         put(size(), b, start);
     }
 
-    /*
-    public synchronized void clean(long index, byte[] b, int start) throws IOException {
-        assert b.length - start >= efs.recordsize;
-        if (index >= size()) throw new IndexOutOfBoundsException("kelondroBufferedEcoFS.clean(" + index + ") outside bounds (" + this.size() + ")");
-        byte[] bb = buffer.get(new Long(index));
-        if (bb == null) {
-            efs.clean(index, b, start);
-        } else {
-            System.arraycopy(bb, 0, b, start, efs.recordsize);
-            buffer.remove(new Long(index));
-            efs.clean(index);
-        }
-    }
-    public synchronized void clean(long index) throws IOException {
-        if (index >= size()) throw new IndexOutOfBoundsException("kelondroBufferedEcoFS.clean(" + index + ") outside bounds (" + this.size() + ")");
-        buffer.remove(new Long(index));
-        efs.clean(index);
-    }
-    */
     public synchronized void cleanLast(byte[] b, int start) throws IOException {
         assert b.length - start >= efs.recordsize;
         Long i = new Long(size() - 1);


@@ -24,11 +24,15 @@
 package de.anomic.kelondro;
 
+import java.io.BufferedInputStream;
 import java.io.File;
+import java.io.FileInputStream;
 import java.io.FileNotFoundException;
 import java.io.FileOutputStream;
 import java.io.IOException;
+import java.io.InputStream;
 import java.io.RandomAccessFile;
+import java.util.Iterator;
 
 /**
  * The EcoFS is a flat file with records of fixed length. The file does not contain
@@ -522,6 +526,62 @@ public class kelondroEcoFS {
         this.raf.setLength((long) (this.size() - 1) * (long) this.recordsize);
     }
 
+    public static class ChunkIterator implements Iterator<byte[]> {
+
+        private int recordsize, chunksize;
+        private InputStream stream;
+
+        /**
+         * create a ChunkIterator
+         * a ChunkIterator uses a BufferedInputStream to iterate through the file
+         * and is therefore a fast option to get all elements in the file as a sequence
+         * @param file: the eco-file
+         * @param recordsize: the size of the elements in the file
+         * @param chunksize: the size of the chunks that are returned by next(). remaining bytes up to the length of recordsize are skipped
+         * @throws FileNotFoundException
+         */
+        public ChunkIterator(File file, int recordsize, int chunksize) throws FileNotFoundException {
+            assert (file.exists());
+            assert file.length() % recordsize == 0;
+            this.recordsize = recordsize;
+            this.chunksize = chunksize;
+            this.stream = new BufferedInputStream(new FileInputStream(file), 64 * 1024);
+        }
+
+        public boolean hasNext() {
+            try {
+                return stream != null && stream.available() > 0;
+            } catch (IOException e) {
+                return false;
+            }
+        }
+
+        public byte[] next() {
+            byte[] chunk = new byte[chunksize];
+            int r;
+            try {
+                // read the chunk
+                r = this.stream.read(chunk);
+                while (r < chunksize) {
+                    r += this.stream.read(chunk, r, chunksize - r);
+                }
+                // skip remaining bytes
+                while (r < recordsize) {
+                    r += this.stream.skip(recordsize - r);
+                }
+                return chunk;
+            } catch (IOException e) {
+                this.stream = null;
+                return null;
+            }
+        }
+
+        public void remove() {
+            throw new UnsupportedOperationException();
+        }
+
+    }
+
     /**
      * main - writes some data and checks the tables size (with time measuring)
      * @param args
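A minimal usage sketch of the new ChunkIterator, reading whole records sequentially. This is not part of the commit; the file name, record size, and key length are made-up example values, and the import of the YaCy class is an assumption about the surrounding project layout.

import java.io.File;
import java.io.FileNotFoundException;
import java.util.Iterator;
import de.anomic.kelondro.kelondroEcoFS;

public class ChunkIteratorSketch {
    public static void main(String[] args) throws FileNotFoundException {
        File ecoFile = new File("example.table"); // hypothetical eco-file
        int recordsize = 12;                      // assumed fixed record length

        // chunksize == recordsize: every full record is returned, nothing is skipped
        Iterator<byte[]> records = new kelondroEcoFS.ChunkIterator(ecoFile, recordsize, recordsize);
        while (records.hasNext()) {
            byte[] record = records.next();
            if (record == null) break;            // next() returns null after an I/O error
            // process the record, e.g. print its key prefix (assumed 4-byte key)
            System.out.println(new String(record, 0, 4));
        }
    }
}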


@@ -95,10 +95,10 @@ public class kelondroEcoTable implements kelondroIndex {
         try {
             // open an existing table file
-            this.file = new kelondroBufferedEcoFS(new kelondroEcoFS(tablefile, rowdef.objectsize), this.buffersize);
+            int fileSize = (int) tableSize(tablefile, rowdef.objectsize);
 
             // initialize index and copy table
-            int records = (int) Math.max(file.size(), initialSpace);
+            int records = (int) Math.max(fileSize, initialSpace);
             long neededRAM4table = ((long) records) * (((long) rowdef.objectsize) + 4L) * 3L;
             table = ((neededRAM4table < maxarraylength) &&
                     ((useTailCache == tailCacheForceUsage) ||
@@ -118,52 +118,69 @@ public class kelondroEcoTable implements kelondroIndex {
             System.out.println("*** DEBUG " + tablefile + ": EcoTable " + tablefile.toString() + " has table copy " + ((table == null) ? "DISABLED" : "ENABLED"));
 
             // read all elements from the file into the copy table
-            byte[] record = new byte[rowdef.objectsize];
-            byte[] key = new byte[rowdef.primaryKeyLength];
-            int fs = (int) file.size();
             System.out.print("*** initializing RAM index for EcoTable " + tablefile.getName() + ":");
-            for (int i = 0; i < fs; i++) {
-                // read entry
-                file.get(i, record, 0);
-                // write the key into the index table
-                System.arraycopy(record, 0, key, 0, rowdef.primaryKeyLength);
-                index.addi(key, i);
-                // write the tail into the table
-                if (table != null) table.addUnique(taildef.newEntry(record, rowdef.primaryKeyLength, true));
-                if ((i % 10000) == 0) {
-                    System.out.print('.');
-                    System.out.flush();
+            int i = 0;
+            byte[] key;
+            if (table == null) {
+                Iterator<byte[]> ki = keyIterator(tablefile, rowdef);
+                while (ki.hasNext()) {
+                    key = ki.next();
+                    // write the key into the index table
+                    assert key != null;
+                    if (key == null) {i++; continue;}
+                    index.addi(key, i++);
+                    if ((i % 10000) == 0) {
+                        System.out.print('.');
+                        System.out.flush();
+                    }
+                }
+            } else {
+                byte[] record;
+                key = new byte[rowdef.primaryKeyLength];
+                Iterator<byte[]> ri = new kelondroEcoFS.ChunkIterator(tablefile, rowdef.objectsize, rowdef.objectsize);
+                while (ri.hasNext()) {
+                    record = ri.next();
+                    assert record != null;
+                    if (record == null) {i++; continue;}
+                    System.arraycopy(record, 0, key, 0, rowdef.primaryKeyLength);
+                    // write the key into the index table
+                    index.addi(key, i++);
+                    // write the tail into the table
+                    table.addUnique(taildef.newEntry(record, rowdef.primaryKeyLength, true));
+                    if ((i % 10000) == 0) {
+                        System.out.print('.');
+                        System.out.flush();
+                    }
                 }
             }
 
+            // check consistency
             System.out.print(" -ordering- ..");
             System.out.flush();
-            // check consistency
+            this.file = new kelondroBufferedEcoFS(new kelondroEcoFS(tablefile, rowdef.objectsize), this.buffersize);
             ArrayList<Integer[]> doubles = index.removeDoubles();
             System.out.println(" -removed " + doubles.size() + " doubles- done.");
             if (doubles.size() > 0) {
                 System.out.println("DEBUG " + tablefile + ": WARNING - EcoTable " + tablefile + " has " + doubles.size() + " doubles");
                 // from all the doubles take one, put it back to the index and remove the others from the file
-                Iterator<Integer[]> i = doubles.iterator();
-                Integer[] ds;
                 // first put back one element each
-                while (i.hasNext()) {
-                    ds = i.next();
+                byte[] record = new byte[rowdef.objectsize];
+                key = new byte[rowdef.primaryKeyLength];
+                for (Integer[] ds: doubles) {
                     file.get(ds[0].longValue(), record, 0);
                     System.arraycopy(record, 0, key, 0, rowdef.primaryKeyLength);
                     index.addi(key, ds[0].intValue());
                 }
                 // then remove the other doubles by removing them from the table, but do a re-indexing while doing that
                 // first aggregate all the delete positions because the elements from the top positions must be removed first
-                i = doubles.iterator();
                 TreeSet<Integer> delpos = new TreeSet<Integer>();
-                while (i.hasNext()) {
-                    ds = i.next();
-                    for (int j = 1; j < ds.length; j++) {
-                        delpos.add(ds[j]);
-                    }
+                for (Integer[] ds: doubles) {
+                    for (int j = 1; j < ds.length; j++) delpos.add(ds[j]);
                 }
                 // now remove the entries in a sorted way (top-down)
                 Integer top;
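The gain of this rewrite is that the RAM index is now filled from one buffered sequential pass over the table file instead of one random-access read per record. The following self-contained sketch illustrates that difference with plain java.io only; it uses no YaCy classes, and the record count and record size are made-up example values.

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.RandomAccessFile;

public class SequentialVsRandomRead {
    public static void main(String[] args) throws IOException {
        final int recordsize = 12, records = 200000;
        File f = File.createTempFile("ecodemo", ".table");
        f.deleteOnExit();
        FileOutputStream out = new FileOutputStream(f);
        out.write(new byte[recordsize * records]); // write empty fixed-size records
        out.close();

        // old style: one seek + read per record
        long t0 = System.currentTimeMillis();
        RandomAccessFile raf = new RandomAccessFile(f, "r");
        byte[] record = new byte[recordsize];
        for (int i = 0; i < records; i++) {
            raf.seek((long) i * recordsize);
            raf.readFully(record);
        }
        raf.close();
        long t1 = System.currentTimeMillis();

        // new style: stream all records sequentially through a 64 KB buffer
        BufferedInputStream in = new BufferedInputStream(new FileInputStream(f), 64 * 1024);
        for (int i = 0; i < records; i++) {
            int r = 0;
            while (r < recordsize) {
                int n = in.read(record, r, recordsize - r);
                if (n < 0) break; // EOF safeguard
                r += n;
            }
        }
        in.close();
        long t2 = System.currentTimeMillis();

        System.out.println("random access: " + (t1 - t0) + " ms, sequential: " + (t2 - t1) + " ms");
    }
}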
@@ -191,6 +208,18 @@ public class kelondroEcoTable implements kelondroIndex {
         tableTracker.put(tablefile.toString(), this);
     }
 
+    /**
+     * a KeyIterator
+     * @param file: the eco-file
+     * @param rowdef: the row definition
+     * @throws FileNotFoundException
+     * @return an iterator for all keys in the file
+     */
+    public Iterator<byte[]> keyIterator(File file, kelondroRow rowdef) throws FileNotFoundException {
+        assert rowdef.primaryKeyIndex == 0;
+        return new kelondroEcoFS.ChunkIterator(file, rowdef.objectsize, rowdef.primaryKeyLength);
+    }
+
     public static long tableSize(File tablefile, int recordsize) {
         // returns number of records in table
         return kelondroEcoFS.tableSize(tablefile, recordsize);
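What keyIterator delivers can be shown without constructing a kelondroRow: it is simply a ChunkIterator whose chunk size equals the primary key length, so only the key prefix of each record is read and returned. A hedged sketch follows; the file name, record size, and key length are assumptions, not values from the commit.

import java.io.File;
import java.io.FileNotFoundException;
import java.util.Iterator;
import de.anomic.kelondro.kelondroEcoFS;

public class KeyIteratorSketch {
    public static void main(String[] args) throws FileNotFoundException {
        File table = new File("example.table");   // hypothetical eco-file
        int objectsize = 12;                      // assumed rowdef.objectsize
        int primaryKeyLength = 4;                 // assumed rowdef.primaryKeyLength

        // equivalent to keyIterator(table, rowdef): only the key prefix of each record is returned
        Iterator<byte[]> keys = new kelondroEcoFS.ChunkIterator(table, objectsize, primaryKeyLength);
        int pos = 0;
        while (keys.hasNext()) {
            byte[] key = keys.next();
            if (key == null) break;               // I/O error inside the iterator
            System.out.println(pos++ + " -> " + new String(key));
        }
    }
}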


@@ -477,7 +477,7 @@ public class kelondroRowCollection {
     public synchronized void select(Set<String> keys) {
         // removes all entries but the ones given by urlselection
-        if ((keys == null) || (keys.size() == 0)) return;
+        if ((keys == null) || (keys.isEmpty())) return;
         Iterator<kelondroRow.Entry> i = rows();
         kelondroRow.Entry row;
         while (i.hasNext()) {


@@ -78,7 +78,7 @@ public class kelondroSortStack<E> {
     public synchronized stackElement top() {
         // returns the element that is currently on top of the stack
-        if (this.onstack.size() == 0) return null;
+        if (this.onstack.isEmpty()) return null;
         Long w = this.onstack.firstKey();
         E element = this.onstack.get(w);
         return new stackElement(element, w);
@@ -88,7 +88,7 @@ public class kelondroSortStack<E> {
         // returns the element that is currently on top of the stack
         // it is removed and added to the offstack list
         // this is exactly the same as element(offstack.size())
-        if (this.onstack.size() == 0) return null;
+        if (this.onstack.isEmpty()) return null;
         Long w = this.onstack.firstKey();
         E element = this.onstack.remove(w);
         stackElement se = new stackElement(element, w);


@@ -1643,7 +1643,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
          * a) the user has configured to use the htcache or
          * b) the content should be indexed
          * ========================================================================= */
-        if ((entry.profile().storeHTCache()) || (doIndexing && isSupportedContent)) {
+        if (((entry.profile() != null) && (entry.profile().storeHTCache())) || (doIndexing && isSupportedContent)) {
             // store response header
             if (entry.writeResourceInfo()) {
                 this.log.logInfo("WROTE HEADER for " + entry.cacheFile());