mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
To translate a mediawiki dump into the YaCy surrogate format do the following:
- Download a Wikipedia dump, e.g. dewiki-20090311-pages-articles.xml.bz2, from http://download.wikimedia.org/dewiki/20090311/
- Move dewiki-20090311-pages-articles.xml.bz2 to DATA/HTCACHE/
- Start the conversion: open a command shell, change to the YaCy home directory and execute
  java -Xmx2000m -cp classes:lib/bzip2.jar de.anomic.tools.mediawikiIndex -convert DATA/HTCACHE/dewiki-20090311-pages-articles.xml.bz2 DATA/SURROGATES/in/ http://de.wikipedia.org/wiki/
This generates a series of files in DATA/SURROGATES/in. If YaCy is running (it may run concurrently with the conversion), it fetches all new dumps from the surrogate-in directory. The export process is transaction-safe, which means YaCy will not start reading a dump until that dump has been completely written.
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5851 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
parent
0b2c98edc9
commit
16baa7ad24
|
@ -646,7 +646,7 @@ public final class httpdFileHandler {
|
|||
StringBuilder stringBuffer = new StringBuilder(1024);
|
||||
|
||||
while (is.available() > 0) {
|
||||
stringBuffer.append((char) is.read());
|
||||
stringBuffer.append((char) is.read());
|
||||
}
|
||||
|
||||
String cgiReturn = stringBuffer.toString();
|
||||
|
|
|
@ -285,7 +285,7 @@ public class RowCollection implements Iterable<Row.Entry> {
|
|||
}
|
||||
|
||||
public synchronized void add(final byte[] a) {
|
||||
assert a.length == this.rowdef.objectsize;
|
||||
assert a.length == this.rowdef.objectsize : "a.length = " + a.length + ", objectsize = " + this.rowdef.objectsize;
|
||||
addUnique(a, 0, a.length);
|
||||
}
|
||||
|
||||
|
@ -623,27 +623,22 @@ public class RowCollection implements Iterable<Row.Entry> {
|
|||
int p = L;
|
||||
int q = R - 1;
|
||||
int pivot = pivot(L, R, S, swapspace);
|
||||
int oldpivot = -1;
|
||||
byte[] compiledPivot = null;
|
||||
if (this.rowdef.objectOrder instanceof Base64Order) {
|
||||
while (p <= q) {
|
||||
// wenn pivot < S: pivot befindet sich in sortierter Sequenz von L bis S - 1
|
||||
// d.h. alle Werte von L bis pivot sind kleiner als das pivot
|
||||
// zu finden ist ein minimales p <= q so dass chunk[p] >= pivot
|
||||
if (compiledPivot == null) compiledPivot = compilePivot(pivot);
|
||||
// zu finden ist ein minimales p <= q so dass chunk[p] >= pivot
|
||||
if ((pivot < S) && (p < pivot)) {
|
||||
//System.out.println("+++ saved " + (pivot - p) + " comparisments");
|
||||
p = pivot;
|
||||
S = 0;
|
||||
} else {
|
||||
while ((p < R - 1) && (comparePivot(compiledPivot, p) >= 0)) p++; // chunkAt[p] < pivot
|
||||
while ((p < R - 1) && (compare(pivot, p) >= 0)) p++; // chunkAt[p] < pivot
|
||||
}
|
||||
// nun gilt chunkAt[p] >= pivot
|
||||
while ((q > L) && (comparePivot(compiledPivot, q) <= 0)) q--; // chunkAt[q] > pivot
|
||||
while ((q > L) && (compare(pivot, q) <= 0)) q--; // chunkAt[q] > pivot
|
||||
if (p <= q) {
|
||||
oldpivot = pivot;
|
||||
pivot = swap(p, q, pivot, swapspace);
|
||||
if (pivot != oldpivot && compiledPivot != null) compiledPivot = null; // must be computed again
|
||||
p++;
|
||||
q--;
|
||||
}
|
||||
|
@ -867,34 +862,6 @@ public class RowCollection implements Iterable<Row.Entry> {
|
|||
this.rowdef.primaryKeyLength);
|
||||
return c;
|
||||
}
|
||||
|
||||
protected final byte[] compilePivot(final int i) {
|
||||
assert (i >= 0) && (i < chunkcount) : "i = " + i + ", chunkcount = " + chunkcount;
|
||||
assert (this.rowdef.objectOrder != null);
|
||||
assert (this.rowdef.objectOrder instanceof Base64Order);
|
||||
//assert (!bugappearance(chunkcache, i * this.rowdef.objectsize + colstart, this.rowdef.primaryKeyLength));
|
||||
return ((Base64Order) this.rowdef.objectOrder).compilePivot(chunkcache, i * this.rowdef.objectsize, this.rowdef.primaryKeyLength);
|
||||
}
|
||||
|
||||
protected final byte[] compilePivot(final byte[] a, final int astart, final int alength) {
|
||||
assert (this.rowdef.objectOrder != null);
|
||||
assert (this.rowdef.objectOrder instanceof Base64Order);
|
||||
return ((Base64Order) this.rowdef.objectOrder).compilePivot(a, astart, alength);
|
||||
}
|
||||
|
||||
protected final int comparePivot(final byte[] compiledPivot, final int j) {
|
||||
assert (chunkcount * this.rowdef.objectsize <= chunkcache.length) : "chunkcount = " + chunkcount + ", objsize = " + this.rowdef.objectsize + ", chunkcache.length = " + chunkcache.length;
|
||||
assert (j >= 0) && (j < chunkcount) : "j = " + j + ", chunkcount = " + chunkcount;
|
||||
assert (this.rowdef.objectOrder != null);
|
||||
assert (this.rowdef.objectOrder instanceof Base64Order);
|
||||
//assert (!bugappearance(chunkcache, j * this.rowdef.objectsize + colstart, this.rowdef.primaryKeyLength));
|
||||
final int c = ((Base64Order) this.rowdef.objectOrder).comparePivot(
|
||||
compiledPivot,
|
||||
chunkcache,
|
||||
j * this.rowdef.objectsize,
|
||||
this.rowdef.primaryKeyLength);
|
||||
return c;
|
||||
}
|
||||
|
||||
protected synchronized int compare(final byte[] a, final int astart, final int alength, final int chunknumber) {
|
||||
assert (chunknumber < chunkcount);
|
||||
|
|
|
@ -198,8 +198,7 @@ public class RowSet extends RowCollection implements ObjectIndex, Iterable<Row.E
|
|||
if ((this.rowdef.objectOrder != null) && (this.rowdef.objectOrder instanceof Base64Order) && (this.sortBound > 4000)) {
|
||||
// first try to find in sorted area
|
||||
assert this.rowdef.objectOrder.wellformed(a, astart, alength) : "not wellformed: " + new String(a, astart, alength);
|
||||
final byte[] compiledPivot = compilePivot(a, astart, alength);
|
||||
final int p = binarySearchCompiledPivot(compiledPivot);
|
||||
final int p = binarySearch(a, astart, alength);
|
||||
if (p >= 0) return p;
|
||||
|
||||
// then find in unsorted area
|
||||
|
@ -238,24 +237,6 @@ public class RowSet extends RowCollection implements ObjectIndex, Iterable<Row.E
|
|||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
private int binarySearchCompiledPivot(final byte[] compiledPivot) {
|
||||
// returns the exact position of the key if the key exists,
|
||||
// or -1 if the key does not exist
|
||||
assert (rowdef.objectOrder != null);
|
||||
assert (rowdef.objectOrder instanceof Base64Order);
|
||||
int l = 0;
|
||||
int rbound = this.sortBound;
|
||||
int p = 0;
|
||||
int d;
|
||||
while (l < rbound) {
|
||||
p = l + ((rbound - l) >> 1);
|
||||
d = comparePivot(compiledPivot, p);
|
||||
if (d == 0) return p;
|
||||
if (d < 0) rbound = p; else l = p + 1;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
private int binaryPosition(final byte[] key, final int astart, final int alength) {
|
||||
// returns the exact position of the key if the key exists,
|
||||
|
@ -489,7 +470,17 @@ public class RowSet extends RowCollection implements ObjectIndex, Iterable<Row.E
|
|||
System.out.println("after uniq, size = " + rs.size());
|
||||
*/
|
||||
|
||||
final String[] test = { "eins", "zwei", "drei", "vier", "fuenf", "sechs", "sieben", "acht", "neun", "zehn" };
|
||||
final String[] test = {
|
||||
"eins......xxxx",
|
||||
"zwei......xxxx",
|
||||
"drei......xxxx",
|
||||
"vier......xxxx",
|
||||
"fuenf.....xxxx",
|
||||
"sechs.....xxxx",
|
||||
"sieben....xxxx",
|
||||
"acht......xxxx",
|
||||
"neun......xxxx",
|
||||
"zehn......xxxx" };
|
||||
final RowSet d = new RowSet(new Row("byte[] key-10, Cardinal x-4 {b256}", NaturalOrder.naturalOrder), 0);
|
||||
for (int ii = 0; ii < test.length; ii++) d.add(test[ii].getBytes());
|
||||
for (int ii = 0; ii < test.length; ii++) d.add(test[ii].getBytes());
|
||||
|
|
|
@ -206,7 +206,7 @@ public class Base64Order extends AbstractOrder<byte[]> implements ByteOrder, Cod
|
|||
// b64-Strings
|
||||
// we will do that by grouping each three input bytes to four output bytes.
|
||||
public final String encode(final byte[] in) {
|
||||
if (in.length == 0) return "";
|
||||
if (in == null || in.length == 0) return "";
|
||||
int lene = in.length / 3 * 4 + 3;
|
||||
StringBuilder out = new StringBuilder(lene);
|
||||
int pos = 0;
|
||||
|
@ -509,7 +509,7 @@ public class Base64Order extends AbstractOrder<byte[]> implements ByteOrder, Cod
|
|||
// they are equal
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
public final int comparePivot(final byte[] compiledPivot, final byte[] b, final int boffset, final int blength) {
|
||||
assert zero == null;
|
||||
assert asc;
|
||||
|
@ -556,7 +556,7 @@ public class Base64Order extends AbstractOrder<byte[]> implements ByteOrder, Cod
|
|||
}
|
||||
return cp;
|
||||
}
|
||||
|
||||
*/
|
||||
public static void main(final String[] s) {
|
||||
// java -classpath classes de.anomic.kelondro.kelondroBase64Order
|
||||
final Base64Order b64 = new Base64Order(true, true);
|
||||
|
|
|
@ -292,17 +292,17 @@ public class mediawikiIndex {
|
|||
this.end = end;
|
||||
}
|
||||
}
|
||||
public wikiparserrecord newRecord(String title, StringBuffer sb) {
|
||||
public wikiparserrecord newRecord(String title, StringBuilder sb) {
|
||||
return new wikiparserrecord(title, sb);
|
||||
}
|
||||
|
||||
public class wikiparserrecord {
|
||||
public String title;
|
||||
StringBuffer source;
|
||||
StringBuilder source;
|
||||
String html;
|
||||
yacyURL url;
|
||||
plasmaParserDocument document;
|
||||
public wikiparserrecord(String title, StringBuffer sb) {
|
||||
public wikiparserrecord(String title, StringBuilder sb) {
|
||||
this.title = title;
|
||||
this.source = sb;
|
||||
}
|
||||
|
@ -426,7 +426,7 @@ public class mediawikiIndex {
|
|||
}
|
||||
|
||||
// example:
|
||||
// java -Xmx1000m -cp classes:lib/bzip2.jar de.anomic.tools.mediawikiIndex -convert DATA\HTCACHE\dewiki-20090311-pages-articles.xml.bz2 DATA\SURROGATES\in\ http://de.wikipedia.org/wiki/
|
||||
// java -Xmx2000m -cp classes:lib/bzip2.jar de.anomic.tools.mediawikiIndex -convert DATA/HTCACHE/dewiki-20090311-pages-articles.xml.bz2 DATA/SURROGATES/in/ http://de.wikipedia.org/wiki/
|
||||
|
||||
if (s[0].equals("-convert") && s.length > 2 && s[1].endsWith(".xml.bz2") && s[3].startsWith("http://")) {
|
||||
File sourcefile = new File(s[1]);
|
||||
|
@ -444,9 +444,9 @@ public class mediawikiIndex {
|
|||
if (b != 'Z') throw new IOException("Invalid bz2 content.");
|
||||
is = new CBZip2InputStream(is);
|
||||
}
|
||||
BufferedReader r = new BufferedReader(new java.io.InputStreamReader(is));
|
||||
BufferedReader r = new BufferedReader(new java.io.InputStreamReader(is, "UTF-8"));
|
||||
String t;
|
||||
StringBuffer sb = new StringBuffer();
|
||||
StringBuilder sb = new StringBuilder();
|
||||
boolean page = false, text = false;
|
||||
String title = null;
|
||||
plasmaParser.initHTMLParsableMimeTypes("text/html");
|
||||
|
@ -456,7 +456,7 @@ public class mediawikiIndex {
|
|||
int fc = 0;
|
||||
int rc = 0;
|
||||
String outputfilename = targetstub + "." + fc + ".xml.tmp";
|
||||
OutputStreamWriter osw = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(new File(targetdir, outputfilename))));
|
||||
OutputStreamWriter osw = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(new File(targetdir, outputfilename))), "UTF-8");
|
||||
osw.write("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<surrogates xmlns:dc=\"http://purl.org/dc/elements/1.1/\">\n");
|
||||
while ((t = r.readLine()) != null) {
|
||||
if (t.indexOf(pagestart) >= 0) {
|
||||
|
@ -484,7 +484,7 @@ public class mediawikiIndex {
|
|||
rc = 0;
|
||||
fc++;
|
||||
outputfilename = targetstub + "." + fc + ".xml.tmp";
|
||||
osw = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(new File(targetdir, outputfilename))));
|
||||
osw = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(new File(targetdir, outputfilename))), "UTF-8");
|
||||
osw.write("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<surrogates xmlns:dc=\"http://purl.org/dc/elements/1.1/\">\n");
|
||||
}
|
||||
} catch (InterruptedException e) {
|
||||
|
|
Loading…
Reference in New Issue
Block a user