To translate a mediawiki dump into the YaCy surrogate format do the following:

- download a Wikipedia dump, e.g. dewiki-20090311-pages-articles.xml.bz2
from http://download.wikimedia.org/dewiki/20090311/
- move dewiki-20090311-pages-articles.xml.bz2 to DATA/HTCACHE/
- start the conversion; open a command shell, move to the yacy home directory and execute
java -Xmx2000m -cp classes:lib/bzip2.jar de.anomic.tools.mediawikiIndex -convert DATA/HTCACHE/dewiki-20090311-pages-articles.xml.bz2 DATA/SURROGATES/in/ http://de.wikipedia.org/wiki/

This generates a series of files in DATA/SURROGATES/in

If YaCy is running (it may run concurrently), it fetches all new dumps in the surrogate-in directory. The export process is transaction-safe, which means YaCy will not start reading a dump until that dump has been written completely.


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5851 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
orbiter 2009-04-21 22:12:19 +00:00
parent 0b2c98edc9
commit 16baa7ad24
5 changed files with 28 additions and 70 deletions

View File

@ -646,7 +646,7 @@ public final class httpdFileHandler {
StringBuilder stringBuffer = new StringBuilder(1024);
while (is.available() > 0) {
stringBuffer.append((char) is.read());
stringBuffer.append((char) is.read());
}
String cgiReturn = stringBuffer.toString();

View File

@ -285,7 +285,7 @@ public class RowCollection implements Iterable<Row.Entry> {
}
/**
 * Accepts one row as a raw byte array and hands the whole array to
 * {@code addUnique(a, 0, a.length)}.
 * With assertions enabled, the array length must equal
 * {@code this.rowdef.objectsize}.
 *
 * @param a the bytes of the row; expected to be exactly one object in size
 */
public synchronized void add(final byte[] a) {
// Both assertions test the same condition; the second one additionally
// reports the two lengths in its failure message.
assert a.length == this.rowdef.objectsize;
assert a.length == this.rowdef.objectsize : "a.length = " + a.length + ", objectsize = " + this.rowdef.objectsize;
addUnique(a, 0, a.length);
}
@ -623,27 +623,22 @@ public class RowCollection implements Iterable<Row.Entry> {
int p = L;
int q = R - 1;
int pivot = pivot(L, R, S, swapspace);
int oldpivot = -1;
byte[] compiledPivot = null;
if (this.rowdef.objectOrder instanceof Base64Order) {
while (p <= q) {
// wenn pivot < S: pivot befindet sich in sortierter Sequenz von L bis S - 1
// d.h. alle Werte von L bis pivot sind kleiner als das pivot
// zu finden ist ein minimales p <= q so dass chunk[p] >= pivot
if (compiledPivot == null) compiledPivot = compilePivot(pivot);
// zu finden ist ein minimales p <= q so dass chunk[p] >= pivot
if ((pivot < S) && (p < pivot)) {
//System.out.println("+++ saved " + (pivot - p) + " comparisments");
p = pivot;
S = 0;
} else {
while ((p < R - 1) && (comparePivot(compiledPivot, p) >= 0)) p++; // chunkAt[p] < pivot
while ((p < R - 1) && (compare(pivot, p) >= 0)) p++; // chunkAt[p] < pivot
}
// nun gilt chunkAt[p] >= pivot
while ((q > L) && (comparePivot(compiledPivot, q) <= 0)) q--; // chunkAt[q] > pivot
while ((q > L) && (compare(pivot, q) <= 0)) q--; // chunkAt[q] > pivot
if (p <= q) {
oldpivot = pivot;
pivot = swap(p, q, pivot, swapspace);
if (pivot != oldpivot && compiledPivot != null) compiledPivot = null; // must be computed again
p++;
q--;
}
@ -867,34 +862,6 @@ public class RowCollection implements Iterable<Row.Entry> {
this.rowdef.primaryKeyLength);
return c;
}
/**
 * Builds the compiled pivot for the row stored at index {@code i}.
 * The work is delegated to the {@link Base64Order} configured as
 * {@code rowdef.objectOrder}; it receives the chunk cache, the byte offset
 * of row {@code i} and the primary key length.
 *
 * @param i index of the row; asserted to lie in {@code [0, chunkcount)}
 * @return whatever {@code Base64Order.compilePivot} produces for that key
 */
protected final byte[] compilePivot(final int i) {
    assert (i >= 0) && (i < chunkcount) : "i = " + i + ", chunkcount = " + chunkcount;
    assert (this.rowdef.objectOrder != null);
    assert (this.rowdef.objectOrder instanceof Base64Order);
    //assert (!bugappearance(chunkcache, i * this.rowdef.objectsize + colstart, this.rowdef.primaryKeyLength));
    final Base64Order pivotOrder = (Base64Order) this.rowdef.objectOrder;
    final int rowOffset = i * this.rowdef.objectsize;
    return pivotOrder.compilePivot(chunkcache, rowOffset, this.rowdef.primaryKeyLength);
}
/**
 * Builds the compiled pivot for a key held in an arbitrary byte array.
 * The call is forwarded unchanged to the {@link Base64Order} configured as
 * {@code rowdef.objectOrder}.
 *
 * @param a       array containing the key
 * @param astart  offset of the key inside {@code a}
 * @param alength length of the key in bytes
 * @return whatever {@code Base64Order.compilePivot} produces for that key
 */
protected final byte[] compilePivot(final byte[] a, final int astart, final int alength) {
    assert (this.rowdef.objectOrder != null);
    assert (this.rowdef.objectOrder instanceof Base64Order);
    final Base64Order pivotOrder = (Base64Order) this.rowdef.objectOrder;
    return pivotOrder.compilePivot(a, astart, alength);
}
/**
 * Compares a previously compiled pivot with the primary key of row {@code j}.
 * The comparison itself is done by the {@link Base64Order} configured as
 * {@code rowdef.objectOrder}, which is given the chunk cache, the byte
 * offset of row {@code j} and the primary key length.
 *
 * @param compiledPivot pivot as returned by one of the {@code compilePivot} methods
 * @param j             index of the row; asserted to lie in {@code [0, chunkcount)}
 * @return the result of {@code Base64Order.comparePivot}; presumably negative,
 *         zero or positive in the usual comparator sense (pivot vs. row) -
 *         confirm against Base64Order
 */
protected final int comparePivot(final byte[] compiledPivot, final int j) {
    assert (chunkcount * this.rowdef.objectsize <= chunkcache.length) : "chunkcount = " + chunkcount + ", objsize = " + this.rowdef.objectsize + ", chunkcache.length = " + chunkcache.length;
    assert (j >= 0) && (j < chunkcount) : "j = " + j + ", chunkcount = " + chunkcount;
    assert (this.rowdef.objectOrder != null);
    assert (this.rowdef.objectOrder instanceof Base64Order);
    //assert (!bugappearance(chunkcache, j * this.rowdef.objectsize + colstart, this.rowdef.primaryKeyLength));
    final Base64Order pivotOrder = (Base64Order) this.rowdef.objectOrder;
    final int rowOffset = j * this.rowdef.objectsize;
    return pivotOrder.comparePivot(compiledPivot, chunkcache, rowOffset, this.rowdef.primaryKeyLength);
}
protected synchronized int compare(final byte[] a, final int astart, final int alength, final int chunknumber) {
assert (chunknumber < chunkcount);

View File

@ -198,8 +198,7 @@ public class RowSet extends RowCollection implements ObjectIndex, Iterable<Row.E
if ((this.rowdef.objectOrder != null) && (this.rowdef.objectOrder instanceof Base64Order) && (this.sortBound > 4000)) {
// first try to find in sorted area
assert this.rowdef.objectOrder.wellformed(a, astart, alength) : "not wellformed: " + new String(a, astart, alength);
final byte[] compiledPivot = compilePivot(a, astart, alength);
final int p = binarySearchCompiledPivot(compiledPivot);
final int p = binarySearch(a, astart, alength);
if (p >= 0) return p;
// then find in unsorted area
@ -238,24 +237,6 @@ public class RowSet extends RowCollection implements ObjectIndex, Iterable<Row.E
}
return -1;
}
/**
 * Binary search for a compiled pivot among the rows {@code [0, sortBound)}.
 *
 * @param compiledPivot the key to look for, in compiled-pivot form
 * @return the index of a row for which {@code comparePivot} reports equality,
 *         or -1 if no such row is found
 */
private int binarySearchCompiledPivot(final byte[] compiledPivot) {
    assert (rowdef.objectOrder != null);
    assert (rowdef.objectOrder instanceof Base64Order);
    int low = 0;
    int high = this.sortBound; // exclusive upper end of the search window
    while (low < high) {
        final int mid = low + ((high - low) >> 1);
        final int cmp = comparePivot(compiledPivot, mid);
        if (cmp < 0) {
            // pivot sorts before row mid: continue in the lower half
            high = mid;
        } else if (cmp > 0) {
            // pivot sorts after row mid: continue in the upper half
            low = mid + 1;
        } else {
            return mid;
        }
    }
    return -1;
}
private int binaryPosition(final byte[] key, final int astart, final int alength) {
// returns the exact position of the key if the key exists,
@ -489,7 +470,17 @@ public class RowSet extends RowCollection implements ObjectIndex, Iterable<Row.E
System.out.println("after uniq, size = " + rs.size());
*/
final String[] test = { "eins", "zwei", "drei", "vier", "fuenf", "sechs", "sieben", "acht", "neun", "zehn" };
final String[] test = {
"eins......xxxx",
"zwei......xxxx",
"drei......xxxx",
"vier......xxxx",
"fuenf.....xxxx",
"sechs.....xxxx",
"sieben....xxxx",
"acht......xxxx",
"neun......xxxx",
"zehn......xxxx" };
final RowSet d = new RowSet(new Row("byte[] key-10, Cardinal x-4 {b256}", NaturalOrder.naturalOrder), 0);
for (int ii = 0; ii < test.length; ii++) d.add(test[ii].getBytes());
for (int ii = 0; ii < test.length; ii++) d.add(test[ii].getBytes());

View File

@ -206,7 +206,7 @@ public class Base64Order extends AbstractOrder<byte[]> implements ByteOrder, Cod
// b64-Strings
// we will do that by grouping each three input bytes to four output bytes.
public final String encode(final byte[] in) {
if (in.length == 0) return "";
if (in == null || in.length == 0) return "";
int lene = in.length / 3 * 4 + 3;
StringBuilder out = new StringBuilder(lene);
int pos = 0;
@ -509,7 +509,7 @@ public class Base64Order extends AbstractOrder<byte[]> implements ByteOrder, Cod
// they are equal
return 0;
}
/*
public final int comparePivot(final byte[] compiledPivot, final byte[] b, final int boffset, final int blength) {
assert zero == null;
assert asc;
@ -556,7 +556,7 @@ public class Base64Order extends AbstractOrder<byte[]> implements ByteOrder, Cod
}
return cp;
}
*/
public static void main(final String[] s) {
// java -classpath classes de.anomic.kelondro.kelondroBase64Order
final Base64Order b64 = new Base64Order(true, true);

View File

@ -292,17 +292,17 @@ public class mediawikiIndex {
this.end = end;
}
}
public wikiparserrecord newRecord(String title, StringBuffer sb) {
public wikiparserrecord newRecord(String title, StringBuilder sb) {
return new wikiparserrecord(title, sb);
}
public class wikiparserrecord {
public String title;
StringBuffer source;
StringBuilder source;
String html;
yacyURL url;
plasmaParserDocument document;
public wikiparserrecord(String title, StringBuffer sb) {
public wikiparserrecord(String title, StringBuilder sb) {
this.title = title;
this.source = sb;
}
@ -426,7 +426,7 @@ public class mediawikiIndex {
}
// example:
// java -Xmx1000m -cp classes:lib/bzip2.jar de.anomic.tools.mediawikiIndex -convert DATA\HTCACHE\dewiki-20090311-pages-articles.xml.bz2 DATA\SURROGATES\in\ http://de.wikipedia.org/wiki/
// java -Xmx2000m -cp classes:lib/bzip2.jar de.anomic.tools.mediawikiIndex -convert DATA/HTCACHE/dewiki-20090311-pages-articles.xml.bz2 DATA/SURROGATES/in/ http://de.wikipedia.org/wiki/
if (s[0].equals("-convert") && s.length > 2 && s[1].endsWith(".xml.bz2") && s[3].startsWith("http://")) {
File sourcefile = new File(s[1]);
@ -444,9 +444,9 @@ public class mediawikiIndex {
if (b != 'Z') throw new IOException("Invalid bz2 content.");
is = new CBZip2InputStream(is);
}
BufferedReader r = new BufferedReader(new java.io.InputStreamReader(is));
BufferedReader r = new BufferedReader(new java.io.InputStreamReader(is, "UTF-8"));
String t;
StringBuffer sb = new StringBuffer();
StringBuilder sb = new StringBuilder();
boolean page = false, text = false;
String title = null;
plasmaParser.initHTMLParsableMimeTypes("text/html");
@ -456,7 +456,7 @@ public class mediawikiIndex {
int fc = 0;
int rc = 0;
String outputfilename = targetstub + "." + fc + ".xml.tmp";
OutputStreamWriter osw = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(new File(targetdir, outputfilename))));
OutputStreamWriter osw = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(new File(targetdir, outputfilename))), "UTF-8");
osw.write("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<surrogates xmlns:dc=\"http://purl.org/dc/elements/1.1/\">\n");
while ((t = r.readLine()) != null) {
if (t.indexOf(pagestart) >= 0) {
@ -484,7 +484,7 @@ public class mediawikiIndex {
rc = 0;
fc++;
outputfilename = targetstub + "." + fc + ".xml.tmp";
osw = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(new File(targetdir, outputfilename))));
osw = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(new File(targetdir, outputfilename))), "UTF-8");
osw.write("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<surrogates xmlns:dc=\"http://purl.org/dc/elements/1.1/\">\n");
}
} catch (InterruptedException e) {