To translate a mediawiki dump into the YaCy surrogate format do the following:

- Download a Wikipedia dump, e.g. dewiki-20090311-pages-articles.xml.bz2 from http://download.wikimedia.org/dewiki/20090311/ - Move dewiki-20090311-pages-articles.xml.bz2 to DATA/HTCACHE/ - Start the conversion: open a command shell, change to the YaCy home directory and execute: java -Xmx2000m -cp classes:lib/bzip2.jar de.anomic.tools.mediawikiIndex -convert DATA/HTCACHE/dewiki-20090311-pages-articles.xml.bz2 DATA/SURROGATES/in/ http://de.wikipedia.org/wiki/ This generates a series of files in DATA/SURROGATES/in. If YaCy is running (it may run concurrently with the conversion), it fetches all new dumps from the surrogate-in directory. The export process is transaction-safe, which means YaCy will not start reading a dump until that dump is completely finished. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5851 6c8d7289-2bf4-0310-a012-ef5d649a1542
2024-09-19 00:01:41 +02:00 · 2009-04-21 22:12:19 +00:00 · 2009-04-21 22:12:19 +00:00 · 16baa7ad24
commit 16baa7ad24
parent 0b2c98edc9
5 changed files with 28 additions and 70 deletions
--- a/source/de/anomic/http/httpdFileHandler.java
+++ b/source/de/anomic/http/httpdFileHandler.java
@ -646,7 +646,7 @@ public final class httpdFileHandler {
                    StringBuilder stringBuffer = new StringBuilder(1024);

                    while (is.available() > 0) {
-                        stringBuffer.append((char) is.read());
+                    	stringBuffer.append((char) is.read());
                    }

                    String cgiReturn = stringBuffer.toString();
--- a/source/de/anomic/kelondro/index/RowCollection.java
+++ b/source/de/anomic/kelondro/index/RowCollection.java
@ -285,7 +285,7 @@ public class RowCollection implements Iterable<Row.Entry> {
    }
    
    public synchronized void add(final byte[] a) {
-        assert a.length == this.rowdef.objectsize;
+        assert a.length == this.rowdef.objectsize : "a.length = " + a.length + ", objectsize = " + this.rowdef.objectsize;
        addUnique(a, 0, a.length);
    }
    
@ -623,27 +623,22 @@ public class RowCollection implements Iterable<Row.Entry> {
        int p = L;
        int q = R - 1;
        int pivot = pivot(L, R, S, swapspace);
-        int oldpivot = -1;
-        byte[] compiledPivot = null;
        if (this.rowdef.objectOrder instanceof Base64Order) {
        	while (p <= q) {
        		// wenn pivot < S: pivot befindet sich in sortierter Sequenz von L bis S - 1
        		// d.h. alle Werte von L bis pivot sind kleiner als das pivot
-        		// zu finden ist ein minimales p <= q so dass chunk[p] >= pivot        		
-        		if (compiledPivot == null) compiledPivot = compilePivot(pivot);
+        		// zu finden ist ein minimales p <= q so dass chunk[p] >= pivot
        		if ((pivot < S) && (p < pivot)) {
        			//System.out.println("+++ saved " + (pivot - p) + " comparisments");
        			p = pivot;
        			S = 0;
        		} else {
-        			while ((p < R - 1) && (comparePivot(compiledPivot, p) >= 0)) p++; // chunkAt[p] < pivot
+        			while ((p < R - 1) && (compare(pivot, p) >= 0)) p++; // chunkAt[p] < pivot
        		}
        		// nun gilt chunkAt[p] >= pivot
-        		while ((q > L) && (comparePivot(compiledPivot, q) <= 0)) q--; // chunkAt[q] > pivot
+        		while ((q > L) && (compare(pivot, q) <= 0)) q--; // chunkAt[q] > pivot
        		if (p <= q) {
-        			oldpivot = pivot;
        			pivot = swap(p, q, pivot, swapspace);
-        			if (pivot != oldpivot && compiledPivot != null) compiledPivot = null; // must be computed again
        			p++;
        			q--;
        		}
@ -867,34 +862,6 @@ public class RowCollection implements Iterable<Row.Entry> {
                this.rowdef.primaryKeyLength);
        return c;
    }
-    
-    protected final byte[] compilePivot(final int i) {
-        assert (i >= 0) && (i < chunkcount) : "i = " + i + ", chunkcount = " + chunkcount;
-        assert (this.rowdef.objectOrder != null);
-        assert (this.rowdef.objectOrder instanceof Base64Order);
-        //assert (!bugappearance(chunkcache, i * this.rowdef.objectsize + colstart, this.rowdef.primaryKeyLength));
-        return ((Base64Order) this.rowdef.objectOrder).compilePivot(chunkcache, i * this.rowdef.objectsize, this.rowdef.primaryKeyLength);
-    }
-    
-    protected final byte[] compilePivot(final byte[] a, final int astart, final int alength) {
-        assert (this.rowdef.objectOrder != null);
-        assert (this.rowdef.objectOrder instanceof Base64Order);
-        return ((Base64Order) this.rowdef.objectOrder).compilePivot(a, astart, alength);
-    }
-    
-    protected final int comparePivot(final byte[] compiledPivot, final int j) {
-        assert (chunkcount * this.rowdef.objectsize <= chunkcache.length) : "chunkcount = " + chunkcount + ", objsize = " + this.rowdef.objectsize + ", chunkcache.length = " + chunkcache.length;
-        assert (j >= 0) && (j < chunkcount) : "j = " + j + ", chunkcount = " + chunkcount;
-        assert (this.rowdef.objectOrder != null);
-        assert (this.rowdef.objectOrder instanceof Base64Order);
-        //assert (!bugappearance(chunkcache, j * this.rowdef.objectsize + colstart, this.rowdef.primaryKeyLength));
-        final int c = ((Base64Order) this.rowdef.objectOrder).comparePivot(
-        		compiledPivot,
-                chunkcache,
-                j * this.rowdef.objectsize,
-                this.rowdef.primaryKeyLength);
-        return c;
-    }

    protected synchronized int compare(final byte[] a, final int astart, final int alength, final int chunknumber) {
        assert (chunknumber < chunkcount);
--- a/source/de/anomic/kelondro/index/RowSet.java
+++ b/source/de/anomic/kelondro/index/RowSet.java
@ -198,8 +198,7 @@ public class RowSet extends RowCollection implements ObjectIndex, Iterable<Row.E
        if ((this.rowdef.objectOrder != null) && (this.rowdef.objectOrder instanceof Base64Order) && (this.sortBound > 4000)) {
            // first try to find in sorted area
            assert this.rowdef.objectOrder.wellformed(a, astart, alength) : "not wellformed: " + new String(a, astart, alength);
-            final byte[] compiledPivot = compilePivot(a, astart, alength);
-            final int p = binarySearchCompiledPivot(compiledPivot);
+            final int p = binarySearch(a, astart, alength);
            if (p >= 0) return p;
            
            // then find in unsorted area
@ -238,24 +237,6 @@ public class RowSet extends RowCollection implements ObjectIndex, Iterable<Row.E
        }
        return -1;
    }
-    
-    private int binarySearchCompiledPivot(final byte[] compiledPivot) {
-        // returns the exact position of the key if the key exists,
-        // or -1 if the key does not exist
-        assert (rowdef.objectOrder != null);
-        assert (rowdef.objectOrder instanceof Base64Order);
-        int l = 0;
-        int rbound = this.sortBound;
-        int p = 0;
-        int d;
-        while (l < rbound) {
-            p = l + ((rbound - l) >> 1);
-            d = comparePivot(compiledPivot, p);
-            if (d == 0) return p;
-            if (d < 0) rbound = p; else l = p + 1;
-        }
-        return -1;
-    }

    private int binaryPosition(final byte[] key, final int astart, final int alength) {
        // returns the exact position of the key if the key exists,
@ -489,7 +470,17 @@ public class RowSet extends RowCollection implements ObjectIndex, Iterable<Row.E
        System.out.println("after uniq, size = " + rs.size());
        */
        
-        final String[] test = { "eins", "zwei", "drei", "vier", "fuenf", "sechs", "sieben", "acht", "neun", "zehn" };
+        final String[] test = {
+        		"eins......xxxx", 
+        		"zwei......xxxx", 
+        		"drei......xxxx", 
+        		"vier......xxxx", 
+        		"fuenf.....xxxx", 
+        		"sechs.....xxxx", 
+        		"sieben....xxxx", 
+        		"acht......xxxx", 
+        		"neun......xxxx", 
+        		"zehn......xxxx" };
        final RowSet d = new RowSet(new Row("byte[] key-10, Cardinal x-4 {b256}", NaturalOrder.naturalOrder), 0);
        for (int ii = 0; ii < test.length; ii++) d.add(test[ii].getBytes());
        for (int ii = 0; ii < test.length; ii++) d.add(test[ii].getBytes());
--- a/source/de/anomic/kelondro/order/Base64Order.java
+++ b/source/de/anomic/kelondro/order/Base64Order.java
@ -206,7 +206,7 @@ public class Base64Order extends AbstractOrder<byte[]> implements ByteOrder, Cod
    // b64-Strings
    // we will do that by grouping each three input bytes to four output bytes.
    public final String encode(final byte[] in) {
-        if (in.length == 0) return "";
+        if (in == null || in.length == 0) return "";
        int lene = in.length / 3 * 4 + 3;
        StringBuilder out = new StringBuilder(lene);
        int pos = 0;
@ -509,7 +509,7 @@ public class Base64Order extends AbstractOrder<byte[]> implements ByteOrder, Cod
        // they are equal
        return 0;
    }
-    
+    /*
    public final int comparePivot(final byte[] compiledPivot, final byte[] b, final int boffset, final int blength) {
        assert zero == null;
        assert asc;
@ -556,7 +556,7 @@ public class Base64Order extends AbstractOrder<byte[]> implements ByteOrder, Cod
        }
        return cp;
    }
-
+*/
    public static void main(final String[] s) {
        // java -classpath classes de.anomic.kelondro.kelondroBase64Order
        final Base64Order b64 = new Base64Order(true, true);
--- a/source/de/anomic/tools/mediawikiIndex.java
+++ b/source/de/anomic/tools/mediawikiIndex.java
@ -292,17 +292,17 @@ public class mediawikiIndex {
            this.end = end;
        }
    }
-    public wikiparserrecord newRecord(String title, StringBuffer sb) {
+    public wikiparserrecord newRecord(String title, StringBuilder sb) {
        return new wikiparserrecord(title, sb);
    }
    
    public class wikiparserrecord {
        public String title;
-        StringBuffer source;
+        StringBuilder source;
        String html;
        yacyURL url;
        plasmaParserDocument document;
-        public wikiparserrecord(String title, StringBuffer sb) {
+        public wikiparserrecord(String title, StringBuilder sb) {
            this.title = title;
            this.source = sb;
        }
@ -426,7 +426,7 @@ public class mediawikiIndex {
        }

        // example:
-        // java -Xmx1000m -cp classes:lib/bzip2.jar de.anomic.tools.mediawikiIndex -convert DATA\HTCACHE\dewiki-20090311-pages-articles.xml.bz2 DATA\SURROGATES\in\ http://de.wikipedia.org/wiki/
+        // java -Xmx2000m -cp classes:lib/bzip2.jar de.anomic.tools.mediawikiIndex -convert DATA/HTCACHE/dewiki-20090311-pages-articles.xml.bz2 DATA/SURROGATES/in/ http://de.wikipedia.org/wiki/
        
        if (s[0].equals("-convert") && s.length > 2 && s[1].endsWith(".xml.bz2") && s[3].startsWith("http://")) {
            File sourcefile = new File(s[1]);
@ -444,9 +444,9 @@ public class mediawikiIndex {
                    if (b != 'Z') throw new IOException("Invalid bz2 content.");
                    is = new CBZip2InputStream(is);
                }
-                BufferedReader r = new BufferedReader(new java.io.InputStreamReader(is));
+                BufferedReader r = new BufferedReader(new java.io.InputStreamReader(is, "UTF-8"));
                String t;
-                StringBuffer sb = new StringBuffer();
+                StringBuilder sb = new StringBuilder();
                boolean page = false, text = false;
                String title = null;
                plasmaParser.initHTMLParsableMimeTypes("text/html");
@ -456,7 +456,7 @@ public class mediawikiIndex {
                int fc = 0;
                int rc = 0;
                String outputfilename = targetstub + "." + fc + ".xml.tmp";
-                OutputStreamWriter osw = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(new File(targetdir, outputfilename))));
+                OutputStreamWriter osw = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(new File(targetdir, outputfilename))), "UTF-8");
                osw.write("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<surrogates xmlns:dc=\"http://purl.org/dc/elements/1.1/\">\n");
                while ((t = r.readLine()) != null) {
                    if (t.indexOf(pagestart) >= 0) {
@ -484,7 +484,7 @@ public class mediawikiIndex {
                                rc = 0;
                                fc++;
                                outputfilename = targetstub + "." + fc + ".xml.tmp";
-                                osw = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(new File(targetdir, outputfilename))));
+                                osw = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(new File(targetdir, outputfilename))), "UTF-8");
                                osw.write("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<surrogates xmlns:dc=\"http://purl.org/dc/elements/1.1/\">\n");
                            }
                        } catch (InterruptedException e) {