- extended experimental wikipedia dump parser

- removed historic, possibly unused code from wiki parser that was in conflict with actual wikipedia wiki code

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5790 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
orbiter 2009-04-09 14:55:20 +00:00
parent c3aff2521e
commit d4d87d90c4
2 changed files with 24 additions and 92 deletions

View File

@ -64,7 +64,6 @@ public class wikiCode extends abstractWikiParser implements wikiParser {
private boolean defList = false; //needed for definition lists
private boolean escape = false; //needed for escape; escapeTag sets it while it transforms the text around an escape tag (prevents surplus line breaks)
private boolean escaped = false; //needed for <pre> not getting in the way; set when a line-spanning [= is opened, cleared at its closing =]
private boolean escapeSpan = false; //needed for escape symbols [= and =] spanning over several lines; true while such a span is open
private boolean newrowstart=false; //needed for the first row not to be empty
private boolean nolist = false; //needed for handling of [= and <pre> in lists
private boolean preformatted = false; //needed for preformatted text
@ -72,7 +71,6 @@ public class wikiCode extends abstractWikiParser implements wikiParser {
private boolean replacedHTML = false; //indicates if method replaceHTML has been used with line already
private boolean table = false; //needed for tables, because they reach over several lines
private int preindented = 0; //needed for indented <pre>s
private int escindented = 0; //needed for indented [=s; counts the leading ':' of the line that opened an escape span, i.e. the <blockquote>s escapeTag still has to close
private int headlines = 0; //number of headlines in page
private final ArrayList<String> dirElements = new ArrayList<String>(); //list of headlines used to create directory of page
@ -473,78 +471,6 @@ public class wikiCode extends abstractWikiParser implements wikiParser {
return result;
}
/**
 * Handles the escape tags [= and =].
 * Text between the tags is not passed through transformLine: it is cut out of the
 * line, a placeholder is transformed in its place, and the original text is put
 * back afterwards. Text outside the tags is transformed as usual.
 * Four cases are distinguished (none of them applies while preformatted is set):
 * both tags in the same line, an opening [= that starts a multi-line span,
 * a closing =] that ends such a span, and surplus =] without an open span.
 * Reads and updates the parser state escape, escaped, escapeSpan, escindented and nolist.
 *
 * @param result the line to process
 * @return the processed line, or the unchanged line if no case applies
 */
//contributed by [MN]
private String escapeTag(String result){
    int p0 = 0;
    int p1 = 0;
    //both [= and =] in the same line
    if(((p0 = result.indexOf("[="))>=0)&&((p1 = result.indexOf("=]"))>0)&&(!(preformatted))){
        if(p0<p1){
            //"[=]" shares its '=' between both tags (p1 == p0+1); clamping the start treats it
            //as an empty escape instead of throwing a StringIndexOutOfBoundsException
            String escapeText = result.substring(Math.min(p0+2,p1),p1);
            //literal replace() is used throughout: replaceAll() would interpret '$' and '\'
            //in the escaped wiki text as group references when it is used as replacement
            escapeText = escapeText.replace("!esc!", "!esc!!");
            result = transformLine(result.substring(0,p0).replace("!esc!", "!esc!!")+"!esc!txt!"+result.substring(p1+2).replace("!esc!", "!esc!!"));
            result = result.replace("!esc!txt!", escapeText);
            result = result.replace("!esc!!", "!esc!");
        }
        //handles cases like [=[= =]=] [= =] that would cause an exception otherwise
        else{
            escape = true;
            //NOTE(review): substring(0,p0-1) also drops the character directly before "[=" - confirm this is intended
            final String temp1 = transformLine(result.substring(0,p0-1).replace("!tmp!","!tmp!!")+"!tmp!txt!");
            nolist = true;
            final String temp2 = transformLine(result.substring(p0));
            nolist = false;
            result = temp1.replace("!tmp!txt!",temp2);
            result = result.replace("!tmp!!", "!tmp!");
            escape = false;
        }
    }
    //start [=
    else if(((p0 = result.indexOf("[="))>=0)&&(!escapeSpan)&&(!preformatted)){
        escape = true;    //prevent surplus line breaks
        escaped = true;   //prevents <pre> being parsed
        final StringBuilder bq = new StringBuilder(); //gets filled with <blockquote>s as needed
        String escapeText = result.substring(p0+2);
        escapeText = escapeText.replace("!esc!", "!esc!!");
        //taking care of indented lines
        while(result.substring(escindented,p0).startsWith(":")){
            escindented++;
            bq.append("<blockquote>");
        }
        result = transformLine(result.substring(escindented,p0).replace("!esc!", "!esc!!")+"!esc!txt!");
        result = bq + result.replace("!esc!txt!", escapeText);
        result = result.replace("!esc!!", "!esc!");
        escape = false;
        escapeSpan = true;
    }
    //end =]
    else if(((p0 = result.indexOf("=]"))>=0)&&(escapeSpan)&&(!preformatted)){
        escapeSpan = false;
        final StringBuilder bq = new StringBuilder(); //gets filled with </blockquote>s as needed
        String escapeText = result.substring(0,p0);
        escapeText = escapeText.replace("!esc!", "!esc!!");
        //taking care of indented lines
        while(escindented > 0){
            bq.append("</blockquote>");
            escindented--;
        }
        result = transformLine("!esc!txt!"+result.substring(p0+2).replace("!esc!", "!esc!!"));
        result = result.replace("!esc!txt!", escapeText) + bq;
        result = result.replace("!esc!!", "!esc!");
        escaped = false;
    }
    //Getting rid of surplus =]
    else if (((p0 = result.indexOf("=]"))>=0)&&(!escapeSpan)&&(!preformatted)){
        //removed one at a time on purpose: a removal may join characters into a new "=]"
        while((p0 = result.indexOf("=]"))>=0){
            result = result.substring(0,p0)+result.substring(p0+2);
        }
        result = transformLine(result);
    }
    return result;
}
/** This method handles the preformatted tags <pre> </pre> */
//contributed by [MN]
private String preformattedTag(String result){
@ -757,18 +683,10 @@ public class wikiCode extends abstractWikiParser implements wikiParser {
replacedHTML = true;
}
//check if line contains escape symbols([= =]) or if we are in an escape sequence already.
if ((result.indexOf("[=")>=0)||(result.indexOf("=]")>=0)||(escapeSpan)){
result = escapeTag(result);
}
//check if line contains preformatted symbols or if we are in a preformatted sequence already.
else if ((result.indexOf("&lt;pre&gt;")>=0)||(result.indexOf("&lt;/pre&gt;")>=0)||(preformattedSpan)){
if ((result.indexOf("&lt;pre&gt;")>=0)||(result.indexOf("&lt;/pre&gt;")>=0)||(preformattedSpan)){
result = preformattedTag(result);
}
//transform page as usual
else {
} else {
//tables first -> wiki-tags in cells can be treated after that
result = processTable(result);

View File

@ -50,6 +50,8 @@ import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import de.anomic.data.wiki.wikiCode;
import de.anomic.data.wiki.wikiParser;
import de.anomic.kelondro.util.ByteBuffer;
/*
@ -59,6 +61,8 @@ import de.anomic.kelondro.util.ByteBuffer;
public class mediawikiIndex {
private static final String textstart = "<text";
private static final String textend = "</text>";
private static final String pagestart = "<page>";
private static final String pageend = "</page>";
private static final byte[] pagestartb = pagestart.getBytes();
@ -388,19 +392,29 @@ public class mediawikiIndex {
BufferedReader r = new BufferedReader(new java.io.InputStreamReader(is));
String t;
StringBuffer sb = new StringBuffer();
boolean read = false;
boolean page = false, text = false;
String title = null;
wikiParser wparser = new wikiCode("de.wikipedia.org");
//plasmaParser hparser = new plasmaParser();
while ((t = r.readLine()) != null) {
if (t.indexOf(pagestart) >= 0) {
read = true;
page = true;
continue;
}
if (t.indexOf(textstart) >= 0) {
text = page;
continue;
}
if (t.indexOf(textend) >= 0) {
text = false;
System.out.println("Title: " + title);
System.out.println(wparser.transform(sb.toString()));
System.out.println();
sb.setLength(0);
continue;
}
if (t.indexOf(pageend) >= 0) {
read = false;
System.out.println("Title: " + title);
System.out.println(sb);
System.out.println();
sb.setLength(0);
page = false;
continue;
}
if (t.indexOf("<title>") >= 0) {
@ -409,7 +423,7 @@ public class mediawikiIndex {
if (p >= 0) title = title.substring(0, p);
continue;
}
if (read) {
if (text) {
sb.append(t);
sb.append('\n');
}