mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
added a submenu to index administration to import a wikimedia dump (i.e. a dump from wikipedia) into the YaCy index: see
http://localhost:8080/IndexImportWikimedia_p.html git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5930 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
parent
df733af4fa
commit
5fb77116c6
67
htroot/IndexImportWikimedia_p.html
Normal file
67
htroot/IndexImportWikimedia_p.html
Normal file
|
@ -0,0 +1,67 @@
|
|||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
||||
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||
<head>
|
||||
<title>YaCy '#[clientname]#': Wikimedia Dump Import</title>
|
||||
#%env/templates/metas.template%#
|
||||
#(import)#::<meta http-equiv="REFRESH" content="10" />#(/import)#
|
||||
</head>
|
||||
<body id="IndexImportWikimedia">
|
||||
#%env/templates/header.template%#
|
||||
#%env/templates/submenuIndexControl.template%#
|
||||
<h2>Wikimedia Dump Import</h2>
|
||||
|
||||
#(import)#
|
||||
<p>#(status)#No import thread is running, you can start a new thread here::Bad input data: #[message]# #(/status)#</p>
|
||||
<form action="IndexImportWikimedia_p.html" method="get" id="importwiki" accept-charset="UTF-8">
|
||||
<!-- no post method here, we don't want to transmit the whole file, only the path-->
|
||||
<fieldset>
|
||||
<legend>Wikimedia Dump File Selection: select a 'bz2' file</legend>
|
||||
You can import Wikipedia dumps here. An example is the file
|
||||
<a href="http://download.wikimedia.org/dewiki/20090311/dewiki-20090311-pages-articles.xml.bz2">
|
||||
http://download.wikimedia.org/dewiki/20090311/dewiki-20090311-pages-articles.xml.bz2</a>.
|
||||
<br>
|
||||
Dumps must be in XML format and must be encoded in bz2. Do not decompress the file after downloading!
|
||||
<br>
|
||||
<input name="file" type="text" value="DATA/HTCACHE/dewiki-20090311-pages-articles.xml.bz2" size="80" accept="application/x-bzip2">
|
||||
<input name="submit" type="submit" value="Import Wikimedia Dump" />
|
||||
</fieldset>
|
||||
</form>
|
||||
<p>
|
||||
When the import is started, the following happens:
|
||||
<ul>
|
||||
<li>The dump is extracted on the fly and wiki entries are translated into Dublin Core data format. The output looks like this:
|
||||
<pre>
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<surrogates xmlns:dc="http://purl.org/dc/elements/1.1/">
|
||||
<record>
|
||||
<dc:Title><![CDATA[Alan Smithee]]></dc:Title>
|
||||
<dc:Identifier>http://de.wikipedia.org/wiki/Alan%20Smithee</dc:Identifier>
|
||||
<dc:Description><![CDATA[Der als Filmregisseur oft genannte Alan Smithee ist ein Anagramm]]></dc:Description>
|
||||
<dc:Language>de</dc:Language>
|
||||
<dc:Date>2009-05-07T06:03:48Z</dc:Date>
|
||||
</record>
|
||||
<record>
|
||||
...
|
||||
</record>
|
||||
</surrogates>
|
||||
</pre>
|
||||
</li>
|
||||
<li>Every 10000 wiki records are combined into one output file, which is first written to /DATA/SURROGATES/in as a temporary file.</li>
|
||||
<li>When a generated output file is finished, it is renamed to a .xml file.</li>
|
||||
<li>Each time an XML surrogate file appears in /DATA/SURROGATES/in, the YaCy indexer fetches the file and indexes the record entries.</li>
|
||||
<li>When a surrogate file is finished with indexing, it is moved to /DATA/SURROGATES/out</li>
|
||||
<li>You can recycle processed surrogate files by moving them from /DATA/SURROGATES/out to /DATA/SURROGATES/in</li>
|
||||
</ul>
|
||||
</p>
|
||||
::
|
||||
<fieldset><legend>Import Process</legend>
|
||||
<dl>
|
||||
<dt>Thread: #[thread]#</dt>
|
||||
<dt>Processed Wiki Entries: #[count]#</dt>
|
||||
</dl>
|
||||
</fieldset>
|
||||
#(/import)#
|
||||
|
||||
#%env/templates/footer.template%#
|
||||
</body>
|
||||
</html>
|
78
htroot/IndexImportWikimedia_p.java
Normal file
78
htroot/IndexImportWikimedia_p.java
Normal file
|
@ -0,0 +1,78 @@
|
|||
// IndexImportWikimedia.java
|
||||
// -------------------------
|
||||
// (C) 2009 by Michael Peter Christen; mc@yacy.net
|
||||
// first published 04.05.2009 on http://yacy.net
|
||||
// Frankfurt, Germany
|
||||
//
|
||||
// $LastChangedDate: 2009-04-16 17:29:00 +0200 (Do, 16 Apr 2009) $
|
||||
// $LastChangedRevision: 5812 $
|
||||
// $LastChangedBy: orbiter $
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
import java.io.File;
|
||||
import java.net.MalformedURLException;
|
||||
|
||||
import de.anomic.http.httpRequestHeader;
|
||||
import de.anomic.plasma.plasmaSwitchboard;
|
||||
import de.anomic.server.serverObjects;
|
||||
import de.anomic.server.serverSwitch;
|
||||
import de.anomic.tools.mediawikiIndex;
|
||||
|
||||
public class IndexImportWikimedia_p {
|
||||
|
||||
public static serverObjects respond(final httpRequestHeader header, final serverObjects post, final serverSwitch<?> env) {
|
||||
final serverObjects prop = new serverObjects();
|
||||
final plasmaSwitchboard sb = (plasmaSwitchboard) env;
|
||||
|
||||
if (mediawikiIndex.job != null && mediawikiIndex.job.isAlive()) {
|
||||
// one import is running, no option to insert anything
|
||||
prop.put("import", 1);
|
||||
prop.put("import_thread", "running");
|
||||
prop.put("import_count", mediawikiIndex.job.count);
|
||||
} else {
|
||||
prop.put("import", 0);
|
||||
if (post == null) {
|
||||
prop.put("import_status", 0);
|
||||
} else {
|
||||
if (post.containsKey("file")) {
|
||||
File sourcefile = new File(post.get("file"));
|
||||
String name = sourcefile.getName(); // i.e. dewiki-20090311-pages-articles.xml.bz2
|
||||
if (!name.endsWith("pages-articles.xml.bz2")) {
|
||||
prop.put("import", 0);
|
||||
prop.put("import_status", 1);
|
||||
prop.put("import_status_message", "file name must end with 'pages-articles.xml.bz2'");
|
||||
return prop;
|
||||
}
|
||||
String lang = name.substring(0, 2);
|
||||
try {
|
||||
mediawikiIndex.job = new mediawikiIndex(sourcefile, sb.surrogatesInPath, "http://" + lang + ".wikipedia.org/wiki/");
|
||||
mediawikiIndex.job.start();
|
||||
prop.put("import", 1);
|
||||
prop.put("import_thread", "started");
|
||||
prop.put("import_count", 0);
|
||||
} catch (MalformedURLException e) {
|
||||
e.printStackTrace();
|
||||
prop.put("import", 0);
|
||||
prop.put("import_status", 1);
|
||||
prop.put("import_status_message", e.getMessage());
|
||||
}
|
||||
}
|
||||
return prop;
|
||||
}
|
||||
}
|
||||
return prop;
|
||||
}
|
||||
}
|
|
@ -6,5 +6,6 @@
|
|||
<li><a href="/IndexImport_p.html" class="MenuItemLink lock">Queue Import</a></li>
|
||||
<li><a href="/IndexTransfer_p.html" class="MenuItemLink lock">Index Transfer</a></li>
|
||||
<li><a href="/IndexCleaner_p.html" class="MenuItemLink lock">Index Cleaner</a></li>
|
||||
<li><a href="/IndexImportWikimedia_p.html" class="MenuItemLink lock">Wikimedia Dump Import</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
|
|
|
@ -168,7 +168,7 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
|
|||
if (c0 == null) return c1;
|
||||
return c1.merge(c0);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* remove url references from a selected word hash. this deletes also in the BLOB
|
||||
* files, which means that there exists new gap entries after the deletion
|
||||
|
|
|
@ -55,7 +55,7 @@ public class plasmaParserDocument {
|
|||
private final String mimeType; // mimeType as taken from http header
|
||||
private final String charset; // the charset of the document
|
||||
private final List<String> keywords; // most resources provide a keyword field
|
||||
private final StringBuilder title; // a document title, taken from title or h1 tag; shall appear as headline of search result
|
||||
private StringBuilder title; // a document title, taken from title or h1 tag; shall appear as headline of search result
|
||||
private final StringBuilder creator; // author or copyright
|
||||
private final List<String> sections; // if present: more titles/headlines appearing in the document
|
||||
private final StringBuilder description; // an abstract, if present: short content description
|
||||
|
@ -173,6 +173,10 @@ dc_rights
|
|||
return title.toString();
|
||||
}
|
||||
|
||||
public void setTitle(String title) {
|
||||
this.title = new StringBuilder(title);
|
||||
}
|
||||
|
||||
public String dc_creator() {
|
||||
if (creator == null)
|
||||
return "";
|
||||
|
|
|
@ -59,6 +59,7 @@ import java.util.concurrent.TimeoutException;
|
|||
import de.anomic.data.wiki.wikiCode;
|
||||
import de.anomic.data.wiki.wikiParser;
|
||||
import de.anomic.kelondro.util.ByteBuffer;
|
||||
import de.anomic.kelondro.util.Log;
|
||||
import de.anomic.plasma.plasmaParser;
|
||||
import de.anomic.plasma.plasmaParserDocument;
|
||||
import de.anomic.plasma.parser.ParserException;
|
||||
|
@ -69,7 +70,7 @@ import de.anomic.yacy.yacyURL;
|
|||
* as referenced with xmlns="http://www.mediawiki.org/xml/export-0.3/"
|
||||
*/
|
||||
|
||||
public class mediawikiIndex {
|
||||
public class mediawikiIndex extends Thread {
|
||||
|
||||
private static final String textstart = "<text";
|
||||
private static final String textend = "</text>";
|
||||
|
@ -81,16 +82,151 @@ public class mediawikiIndex {
|
|||
private wikiParser wparser;
|
||||
private plasmaParser hparser;
|
||||
private String urlStub;
|
||||
private File sourcefile;
|
||||
private File targetdir;
|
||||
public int count;
|
||||
|
||||
public mediawikiIndex(String baseURL) throws MalformedURLException {
|
||||
urlStub = baseURL;
|
||||
wparser = new wikiCode(new URL(baseURL).getHost());
|
||||
hparser = new plasmaParser();
|
||||
public static mediawikiIndex job; // if started from a servlet, this object is used to store the thread
|
||||
|
||||
public mediawikiIndex(File sourcefile, File targetdir, String baseURL) throws MalformedURLException {
|
||||
this.sourcefile = sourcefile;
|
||||
this.targetdir = targetdir;
|
||||
this.urlStub = baseURL;
|
||||
this.wparser = new wikiCode(new URL(baseURL).getHost());
|
||||
this.hparser = new plasmaParser();
|
||||
this.count = 0;
|
||||
// must be called before usage:
|
||||
plasmaParser.initHTMLParsableMimeTypes("text/html");
|
||||
plasmaParser.initParseableMimeTypes(plasmaParser.PARSER_MODE_CRAWLER, "text/html");
|
||||
}
|
||||
|
||||
|
||||
public void run() {
|
||||
try {
|
||||
String targetstub = sourcefile.getName();
|
||||
targetstub = targetstub.substring(0, targetstub.length() - 8);
|
||||
InputStream is = new FileInputStream(sourcefile);
|
||||
if (sourcefile.getName().endsWith(".bz2")) {
|
||||
int b = is.read();
|
||||
if (b != 'B') throw new IOException("Invalid bz2 content.");
|
||||
b = is.read();
|
||||
if (b != 'Z') throw new IOException("Invalid bz2 content.");
|
||||
is = new CBZip2InputStream(is);
|
||||
}
|
||||
BufferedReader r = new BufferedReader(new java.io.InputStreamReader(is, "UTF-8"), 10 * 1024 * 1024);
|
||||
String t;
|
||||
StringBuilder sb = new StringBuilder();
|
||||
boolean page = false, text = false;
|
||||
String title = null;
|
||||
plasmaParser.initHTMLParsableMimeTypes("text/html");
|
||||
plasmaParser.initParseableMimeTypes(plasmaParser.PARSER_MODE_CRAWLER, "text/html");
|
||||
wikiparserrecord poison = newRecord();
|
||||
int threads = Math.max(2, Runtime.getRuntime().availableProcessors() - 1);
|
||||
BlockingQueue<wikiparserrecord> in = new ArrayBlockingQueue<wikiparserrecord>(threads * 10);
|
||||
BlockingQueue<wikiparserrecord> out = new ArrayBlockingQueue<wikiparserrecord>(threads * 10);
|
||||
ExecutorService service = Executors.newFixedThreadPool(threads + 1);
|
||||
convertConsumer[] consumers = new convertConsumer[threads];
|
||||
Future<?>[] consumerResults = new Future[threads];
|
||||
for (int i = 0; i < threads; i++) {
|
||||
consumers[i] = new convertConsumer(in, out, poison);
|
||||
consumerResults[i] = service.submit(consumers[i]);
|
||||
}
|
||||
convertWriter writer = new convertWriter(out, poison, targetdir, targetstub);
|
||||
Future<Integer> writerResult = service.submit(writer);
|
||||
|
||||
wikiparserrecord record;
|
||||
int p;
|
||||
while ((t = r.readLine()) != null) {
|
||||
if (t.indexOf(pagestart) >= 0) {
|
||||
page = true;
|
||||
continue;
|
||||
}
|
||||
if ((p = t.indexOf(textstart)) >= 0) {
|
||||
text = page;
|
||||
int q = t.indexOf('>', p + textstart.length());
|
||||
if (q > 0) {
|
||||
int u = t.indexOf(textend, q + 1);
|
||||
if (u > q) {
|
||||
sb.append(t.substring(q + 1, u));
|
||||
Log.logInfo("WIKITRANSLATION", "[INJECT] Title: " + title);
|
||||
if (sb.length() == 0) {
|
||||
Log.logInfo("WIKITRANSLATION", "ERROR: " + title + " has empty content");
|
||||
continue;
|
||||
}
|
||||
record = newRecord(title, sb);
|
||||
try {
|
||||
in.put(record);
|
||||
this.count++;
|
||||
} catch (InterruptedException e1) {
|
||||
e1.printStackTrace();
|
||||
}
|
||||
sb = new StringBuilder(200);
|
||||
continue;
|
||||
} else {
|
||||
sb.append(t.substring(q + 1));
|
||||
}
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if (t.indexOf(textend) >= 0) {
|
||||
text = false;
|
||||
Log.logInfo("WIKITRANSLATION", "[INJECT] Title: " + title);
|
||||
if (sb.length() == 0) {
|
||||
Log.logInfo("WIKITRANSLATION", "ERROR: " + title + " has empty content");
|
||||
continue;
|
||||
}
|
||||
record = newRecord(title, sb);
|
||||
try {
|
||||
in.put(record);
|
||||
this.count++;
|
||||
} catch (InterruptedException e1) {
|
||||
e1.printStackTrace();
|
||||
}
|
||||
sb = new StringBuilder(200);
|
||||
continue;
|
||||
}
|
||||
if (t.indexOf(pageend) >= 0) {
|
||||
page = false;
|
||||
continue;
|
||||
}
|
||||
if ((p = t.indexOf("<title>")) >= 0) {
|
||||
title = t.substring(p + 7);
|
||||
int q = title.indexOf("</title>");
|
||||
if (q >= 0) title = title.substring(0, q);
|
||||
continue;
|
||||
}
|
||||
if (text) {
|
||||
sb.append(t);
|
||||
sb.append('\n');
|
||||
}
|
||||
}
|
||||
r.close();
|
||||
|
||||
try {
|
||||
for (int i = 0; i < threads; i++) {
|
||||
in.put(poison);
|
||||
}
|
||||
for (int i = 0; i < threads; i++) {
|
||||
consumerResults[i].get(10000, TimeUnit.MILLISECONDS);
|
||||
}
|
||||
out.put(poison);
|
||||
writerResult.get(10000, TimeUnit.MILLISECONDS);
|
||||
} catch (InterruptedException e1) {
|
||||
e1.printStackTrace();
|
||||
} catch (ExecutionException e) {
|
||||
e.printStackTrace();
|
||||
} catch (TimeoutException e) {
|
||||
e.printStackTrace();
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
public static void checkIndex(File wikimediaxml) {
|
||||
File idx = idxFromWikimediaXML(wikimediaxml);
|
||||
if (idx.exists()) return;
|
||||
|
@ -188,13 +324,13 @@ public class mediawikiIndex {
|
|||
while(true) {
|
||||
r = entries.take();
|
||||
if (r == poison) {
|
||||
System.out.println("producer / got poison");
|
||||
Log.logInfo("WIKITRANSLATION", "producer / got poison");
|
||||
break;
|
||||
}
|
||||
out.println(" <page start=\"" + r.start + "\" length=\"" + (r.end - r.start) + "\">");
|
||||
out.println(" <title>" + r.title + "</title>");
|
||||
out.println(" </page>");
|
||||
System.out.println("producer / record start: " + r.start + ", title : " + r.title);
|
||||
Log.logInfo("WIKITRANSLATION", "producer / record start: " + r.start + ", title : " + r.title);
|
||||
count++;
|
||||
}
|
||||
} catch (InterruptedException e) {
|
||||
|
@ -236,13 +372,13 @@ public class mediawikiIndex {
|
|||
while(true) {
|
||||
c = entries.take();
|
||||
if (c == poison) {
|
||||
System.out.println("consumer / got poison");
|
||||
Log.logInfo("WIKITRANSLATION", "consumer / got poison");
|
||||
break;
|
||||
}
|
||||
try {
|
||||
r = new wikisourcerecord(c.b, c.start, c.end);
|
||||
producer.consume(r);
|
||||
System.out.println("consumer / record start: " + r.start + ", title : " + r.title);
|
||||
Log.logInfo("WIKITRANSLATION", "consumer / record start: " + r.start + ", title : " + r.title);
|
||||
count++;
|
||||
} catch (RuntimeException e) {}
|
||||
}
|
||||
|
@ -325,6 +461,8 @@ public class mediawikiIndex {
|
|||
try {
|
||||
url = new yacyURL(urlStub + title, null);
|
||||
document = hparser.parseSource(url, "text/html", "utf-8", html.getBytes("UTF-8"));
|
||||
// the wiki parser is not able to find the proper title in the source text, so it must be set here
|
||||
document.setTitle(title);
|
||||
} catch (UnsupportedEncodingException e) {
|
||||
e.printStackTrace();
|
||||
} catch (MalformedURLException e1) {
|
||||
|
@ -414,7 +552,7 @@ public class mediawikiIndex {
|
|||
in.resetBuffer();
|
||||
if (s.indexOf(m) >= 0) {
|
||||
// we found the record
|
||||
//System.out.println("s = " + s);
|
||||
//Log.logInfo("WIKITRANSLATION", "s = " + s);
|
||||
int p = s.indexOf("start=\"");
|
||||
if (p < 0) return null;
|
||||
p += 7;
|
||||
|
@ -427,7 +565,7 @@ public class mediawikiIndex {
|
|||
q = s.indexOf('"', p + 1);
|
||||
if (q < 0) return null;
|
||||
int length = Integer.parseInt(s.substring(p, q));
|
||||
//System.out.println("start = " + start + ", length = " + length);
|
||||
//Log.logInfo("WIKITRANSLATION", "start = " + start + ", length = " + length);
|
||||
return new wikisourcerecord(title, start, start + length);
|
||||
}
|
||||
}
|
||||
|
@ -451,7 +589,7 @@ public class mediawikiIndex {
|
|||
while(true) {
|
||||
record = in.take();
|
||||
if (record == poison) {
|
||||
System.out.println("convertConsumer / got poison");
|
||||
Log.logInfo("WIKITRANSLATION", "convertConsumer / got poison");
|
||||
break;
|
||||
}
|
||||
try {
|
||||
|
@ -470,7 +608,7 @@ public class mediawikiIndex {
|
|||
} catch (InterruptedException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
System.out.println("*** convertConsumer has terminated");
|
||||
Log.logInfo("WIKITRANSLATION", "*** convertConsumer has terminated");
|
||||
return Integer.valueOf(0);
|
||||
}
|
||||
|
||||
|
@ -507,7 +645,7 @@ public class mediawikiIndex {
|
|||
while(true) {
|
||||
record = in.take();
|
||||
if (record == poison) {
|
||||
System.out.println("convertConsumer / got poison");
|
||||
Log.logInfo("WIKITRANSLATION", "convertConsumer / got poison");
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -517,7 +655,7 @@ public class mediawikiIndex {
|
|||
this.osw = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(new File(targetdir, outputfilename))), "UTF-8");
|
||||
osw.write("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<surrogates xmlns:dc=\"http://purl.org/dc/elements/1.1/\">\n");
|
||||
}
|
||||
System.out.println("[CONSUME] Title: " + record.title);
|
||||
Log.logInfo("WIKITRANSLATION", "[CONSUME] Title: " + record.title);
|
||||
record.document.writeXML(osw, new Date());
|
||||
rc++;
|
||||
if (rc >= 10000) {
|
||||
|
@ -552,114 +690,19 @@ public class mediawikiIndex {
|
|||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
System.out.println("*** convertWriter has terminated");
|
||||
Log.logInfo("WIKITRANSLATION", "*** convertWriter has terminated");
|
||||
return Integer.valueOf(0);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public static void convert(File sourcefile, File targetdir, String urlStub) throws IOException {
|
||||
String targetstub = sourcefile.getName();
|
||||
targetstub = targetstub.substring(0, targetstub.length() - 8);
|
||||
InputStream is = new FileInputStream(sourcefile);
|
||||
if (sourcefile.getName().endsWith(".bz2")) {
|
||||
int b = is.read();
|
||||
if (b != 'B') throw new IOException("Invalid bz2 content.");
|
||||
b = is.read();
|
||||
if (b != 'Z') throw new IOException("Invalid bz2 content.");
|
||||
is = new CBZip2InputStream(is);
|
||||
}
|
||||
BufferedReader r = new BufferedReader(new java.io.InputStreamReader(is, "UTF-8"), 10 * 1024 * 1024);
|
||||
String t;
|
||||
StringBuilder sb = new StringBuilder();
|
||||
boolean page = false, text = false;
|
||||
String title = null;
|
||||
plasmaParser.initHTMLParsableMimeTypes("text/html");
|
||||
plasmaParser.initParseableMimeTypes(plasmaParser.PARSER_MODE_CRAWLER, "text/html");
|
||||
mediawikiIndex mi = new mediawikiIndex(urlStub);
|
||||
wikiparserrecord poison = mi.newRecord();
|
||||
int threads = Math.max(2, Runtime.getRuntime().availableProcessors() - 1);
|
||||
BlockingQueue<wikiparserrecord> in = new ArrayBlockingQueue<wikiparserrecord>(threads * 10);
|
||||
BlockingQueue<wikiparserrecord> out = new ArrayBlockingQueue<wikiparserrecord>(threads * 10);
|
||||
ExecutorService service = Executors.newFixedThreadPool(threads + 1);
|
||||
convertConsumer[] consumers = new convertConsumer[threads];
|
||||
Future<?>[] consumerResults = new Future[threads];
|
||||
for (int i = 0; i < threads; i++) {
|
||||
consumers[i] = new convertConsumer(in, out, poison);
|
||||
consumerResults[i] = service.submit(consumers[i]);
|
||||
}
|
||||
convertWriter writer = new convertWriter(out, poison, targetdir, targetstub);
|
||||
Future<Integer> writerResult = service.submit(writer);
|
||||
|
||||
wikiparserrecord record;
|
||||
while ((t = r.readLine()) != null) {
|
||||
if (t.indexOf(pagestart) >= 0) {
|
||||
page = true;
|
||||
continue;
|
||||
}
|
||||
if (t.indexOf(textstart) >= 0) {
|
||||
text = page;
|
||||
continue;
|
||||
}
|
||||
if (t.indexOf(textend) >= 0) {
|
||||
text = false;
|
||||
System.out.println("[INJECT] Title: " + title);
|
||||
if (sb.length() == 0) {
|
||||
System.out.println("ERROR: " + title + " has empty content");
|
||||
continue;
|
||||
}
|
||||
record = mi.newRecord(title, sb);
|
||||
try {
|
||||
in.put(record);
|
||||
} catch (InterruptedException e1) {
|
||||
e1.printStackTrace();
|
||||
}
|
||||
sb.setLength(0);
|
||||
continue;
|
||||
}
|
||||
if (t.indexOf(pageend) >= 0) {
|
||||
page = false;
|
||||
continue;
|
||||
}
|
||||
if (t.indexOf("<title>") >= 0) {
|
||||
title = t.substring(t.indexOf("<title>") + 7);
|
||||
int p = title.indexOf("</title>");
|
||||
if (p >= 0) title = title.substring(0, p);
|
||||
continue;
|
||||
}
|
||||
if (text) {
|
||||
sb.append(t);
|
||||
sb.append('\n');
|
||||
}
|
||||
}
|
||||
r.close();
|
||||
|
||||
try {
|
||||
for (int i = 0; i < threads; i++) {
|
||||
in.put(poison);
|
||||
}
|
||||
for (int i = 0; i < threads; i++) {
|
||||
consumerResults[i].get(10000, TimeUnit.MILLISECONDS);
|
||||
}
|
||||
out.put(poison);
|
||||
writerResult.get(10000, TimeUnit.MILLISECONDS);
|
||||
} catch (InterruptedException e1) {
|
||||
e1.printStackTrace();
|
||||
} catch (ExecutionException e) {
|
||||
e.printStackTrace();
|
||||
} catch (TimeoutException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public static void main(String[] s) {
|
||||
if (s.length == 0) {
|
||||
System.out.println("usage:");
|
||||
System.out.println(" -index <wikipedia-dump>");
|
||||
System.out.println(" -read <start> <len> <idx-file>");
|
||||
System.out.println(" -find <title> <wikipedia-dump>");
|
||||
System.out.println(" -convert <wikipedia-dump-xml.bz2> <convert-target-dir> <url-stub>");
|
||||
Log.logInfo("WIKITRANSLATION", "usage:");
|
||||
Log.logInfo("WIKITRANSLATION", " -index <wikipedia-dump>");
|
||||
Log.logInfo("WIKITRANSLATION", " -read <start> <len> <idx-file>");
|
||||
Log.logInfo("WIKITRANSLATION", " -find <title> <wikipedia-dump>");
|
||||
Log.logInfo("WIKITRANSLATION", " -convert <wikipedia-dump-xml.bz2> <convert-target-dir> <url-stub>");
|
||||
System.exit(0);
|
||||
}
|
||||
|
||||
|
@ -672,7 +715,11 @@ public class mediawikiIndex {
|
|||
String urlStub = s[3]; // i.e. http://de.wikipedia.org/wiki/
|
||||
//String language = urlStub.substring(7,9);
|
||||
try {
|
||||
convert(sourcefile, targetdir, urlStub);
|
||||
mediawikiIndex mi = new mediawikiIndex(sourcefile, targetdir, urlStub);
|
||||
mi.start();
|
||||
mi.join();
|
||||
} catch (InterruptedException e) {
|
||||
e.printStackTrace();
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
|
@ -700,7 +747,7 @@ public class mediawikiIndex {
|
|||
try {
|
||||
wikisourcerecord w = find(s[1], new File(s[2] + ".idx.xml"));
|
||||
if (w == null) {
|
||||
System.out.println("not found");
|
||||
Log.logInfo("WIKITRANSLATION", "not found");
|
||||
} else {
|
||||
System.out.println(new String(read(new File(s[2]), w.start, (int) (w.end - w.start)), "UTF-8"));
|
||||
}
|
||||
|
@ -709,6 +756,7 @@ public class mediawikiIndex {
|
|||
}
|
||||
|
||||
}
|
||||
System.exit(0);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue
Block a user