yacy_search_server/source/de/anomic/data/wiki/knwikiParser.java
orbiter 1d8d51075c refactoring:
- removed the plasma package. The name of that package came from a very early pre-version of YaCy, even before YaCy was named AnomicHTTPProxy. The Proxy project introduced search for cache contents using class files that had been developed during the plasma project. Information from 2002 about plasma can be found here:
http://web.archive.org/web/20020802110827/http://anomic.de/AnomicPlasma/index.html
We still have one class that comes mostly unchanged from the plasma project, the Condenser class. But this is now part of the document package and all other classes in the plasma package can be assigned to other packages.
- cleaned up the http package: better structure of that class and clean isolation of server and client classes. The old HTCache becomes part of the client sub-package of http.
- because the plasmaSwitchboard is now part of the search package all servlets had to be touched to declare a different package source.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6232 6c8d7289-2bf4-0310-a012-ef5d649a1542
2009-07-19 20:37:44 +00:00

254 lines
8.6 KiB
Java

// knwikiParser.java
// ---------
// part of YaCy
// (C) by Michael Peter Christen; mc@yacy.net
// first published on http://www.anomic.de
// Frankfurt, Germany, 2007
// Created 22.02.2007
//
// This file is contributed by Franz Brausze
//
// $LastChangedDate: $
// $LastChangedRevision: $
// $LastChangedBy: $
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.data.wiki;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.regex.Matcher;
import de.anomic.data.wiki.tokens.DefinitionListToken;
import de.anomic.data.wiki.tokens.LinkToken;
import de.anomic.data.wiki.tokens.ListToken;
import de.anomic.data.wiki.tokens.SimpleToken;
import de.anomic.data.wiki.tokens.TableToken;
import de.anomic.data.wiki.tokens.Token;
import de.anomic.search.Switchboard;
public class knwikiParser implements wikiParser {
public Token[] tokens;
private String[] BEs;
private final Switchboard sb;
/**
 * Creates a parser that works on behalf of the given switchboard.
 * The constructor is private, so within this file instances are only
 * created by {@link #main(String[])}.
 *
 * @param sb the switchboard; it is handed to the link token and is asked for
 *           the own peer's public address when parsing is started without an
 *           explicit address. May only be <code>null</code> if an explicit
 *           address is always supplied.
 */
private knwikiParser(final Switchboard sb) {
this.sb = sb;
}
/**
 * Small command line driver: parses either the first command line argument
 * or a built-in sample text and prints the elapsed time and the result.
 * The parser is created without a switchboard, therefore a fixed public
 * address ("localhost:8080") is passed to the parse call.
 *
 * @param args optional; <code>args[0]</code> is the wiki text to parse
 */
public static void main(final String[] args) {
    final String sample =
        "===T<pre>itle===\n" +
        "==blubb== was ==ein '''shice'''==...och.bla\n" +
        "* ein \n" +
        "*==test=</pre>=\n" +
        "** doppelt\n" +
        "* ''tess*sst''\n" +
        "*** xyz\n" +
        "=]*** huch\n" +
        "* ehehe***\n" +
        "* blubb\n" +
        "bliblablo\n\n\n" +
        "* blubb\n" +
        "{|border=-1\n" +
        "|-\n" +
        "||bla|| blubb\n" +
        "|-\n" +
        "||align center|och||huch||\n" +
        "|}\n" +
        "\n" +
        "# bla\n" +
        "# blubb\n" +
        "'''''ehehehe''''', ne?!\n" +
        "[http://www/index.html,ne?!] -\n" +
        "[[Image:blubb|BLA]] ---- och\n" +
        " blubb1\n" +
        " blubb2\n" +
        ":doppel-blubb[= huch =]\n" +
        ";hier:da\n" +
        ";dort:und so\n" +
        ";;und:doppelt\n\n\n\n" +
        "[[Image:blubb|BLA]]";
    // alternative sample: "[=\n=]* bla"

    // Second sample exercising the escape markers; it is only a placeholder
    // value here because the variable is overwritten with the parser output.
    String output = "[=] ein fucking [= test =]-text[=,ne?!=] joa, [=alles=]wunderbar," +
        "[=denk ich=] mal =]";

    final long start = System.currentTimeMillis();
    final knwikiParser parser = new knwikiParser(null);
    final String input;
    if (args.length > 0) {
        input = args[0];
    } else {
        input = sample;
    }
    output = parser.parse(input, "localhost:8080");
    final long elapsed = System.currentTimeMillis() - start;

    System.out.println("parsing time: " + elapsed + " ms");
    System.out.println("--- --- ---");
    System.out.println(output);
}
/**
 * Converts wiki markup to its rendered form.
 * No public address is passed on, so the parse step obtains the address
 * of the own peer from the switchboard.
 *
 * @param content the wiki text to convert
 * @return the converted text
 */
public String transform(final String content) {
return parse(content, null);
}
/**
 * Converts wiki markup given as UTF-8 encoded bytes.
 * No public address is passed on, so the parse step obtains the address
 * of the own peer from the switchboard.
 *
 * @param content the wiki text as UTF-8 bytes
 * @return the converted text
 * @throws UnsupportedEncodingException if the platform does not know "UTF-8"
 */
public String transform(final byte[] content) throws UnsupportedEncodingException {
    final String decoded = new String(content, "UTF-8");
    return parse(decoded, null);
}
/**
 * Converts wiki markup given as bytes in the named character encoding.
 * No public address is passed on, so the parse step obtains the address
 * of the own peer from the switchboard.
 *
 * @param content  the wiki text as raw bytes
 * @param encoding name of the charset used to decode <code>content</code>
 * @return the converted text
 * @throws UnsupportedEncodingException if <code>encoding</code> is not supported
 */
public String transform(final byte[] content, final String encoding)
        throws UnsupportedEncodingException {
    final String decoded = new String(content, encoding);
    return parse(decoded, null);
}
/**
 * Runs the complete conversion of a wiki text.
 * <ol>
 * <li>(Re-)creates the token set and stores it in {@link #tokens}.</li>
 * <li>Collects the block element names of all tokens (cut off after the
 *     first blank if that blank is found behind index 1), adds "hr" and
 *     stores the list in <code>BEs</code>.</li>
 * <li>Splits the text at the "[=" / "=]" markers and sends every segment
 *     with an even index through {@link #parseUnescaped(String)}; segments
 *     with an odd index are left as they are.</li>
 * <li>Splits the merged result at "&lt;pre&gt;" / "&lt;/pre&gt;" and sends every
 *     segment with an even index through {@link #replaceBRs(String)}.</li>
 * </ol>
 *
 * @param text          the wiki text
 * @param publicAddress address handed to the link token; if <code>null</code>
 *                      the public address of the own seed is requested from the
 *                      switchboard (which then must not be <code>null</code>)
 * @return the converted text
 */
private String parse(String text, final String publicAddress) {
    tokens = new Token[] {
        new SimpleToken('=', '=', new String[][] { null, { "h2" }, { "h3" }, { "h4" } }, true),
        new SimpleToken('\'', '\'', new String[][] { null, { "i" }, { "b" }, null, { "b", "i" } }, false),
        new LinkToken((publicAddress == null) ? sb.peers.mySeed().getPublicAddress() : publicAddress, "Wiki.html?page=", sb),
        new ListToken('*', "ul"),
        new ListToken('#', "ol"),
        new ListToken(':', "blockquote", null),
        new ListToken(' ', null, "tt", false),
        new DefinitionListToken(),
        new TableToken()
    };

    // gather the names of all block elements the tokens may emit
    final ArrayList<String> blockElements = new ArrayList<String>();
    for (final Token token : tokens) {
        final String[] names = token.getBlockElementNames();
        if (names == null) {
            continue;
        }
        for (final String name : names) {
            if (name == null) {
                continue;
            }
            final int blank = name.indexOf(' ');
            // keep only the part in front of the blank (e.g. drop attributes)
            blockElements.add((blank > 1) ? name.substring(0, blank) : name);
        }
    }
    blockElements.add("hr");
    BEs = blockElements.toArray(new String[blockElements.size()]);

    // first pass: wiki markup, but only outside of [= ... =]
    Text[] parts = Text.split2Texts(text, "[=", "=]");
    for (int idx = 0; idx < parts.length; idx += 2) {
        final Text part = parts[idx];
        part.setText(parseUnescaped(part.getText()));
    }
    text = Text.mergeTexts(parts);

    // second pass: line breaks, but only outside of <pre> ... </pre>
    parts = Text.split2Texts(text, "<pre>", "</pre>");
    for (int idx = 0; idx < parts.length; idx += 2) {
        final Text part = parts[idx];
        part.setText(replaceBRs(part.getText()));
    }
    return Text.mergeTexts(parts);
}
/**
 * Applies every regular expression of every token to the text and replaces
 * each usable match by the markup the token produces for it. Finally each
 * "----" is turned into a horizontal rule.
 * <p>
 * A match the token rejects (<code>setText</code> returns <code>false</code>)
 * stays in the text unchanged. If the token signals a
 * {@link wikiParserException}, the match is replaced by the token's text.
 * <p>
 * All replacement strings are passed through
 * {@link Matcher#quoteReplacement(String)}: <code>appendReplacement</code>
 * would otherwise interpret <code>$</code> and <code>\</code> in the wiki text
 * or markup as group references / escapes, which corrupts the output or
 * raises an exception (e.g. "No group 5" for a text containing "$5").
 *
 * @param text wiki text that is not inside an escape section
 * @return the text with all token markup applied
 */
private String parseUnescaped(String text) {
    for (int i = 0; i < tokens.length; i++) {
        final Token token = tokens[i];
        for (int j = 0; j < token.getRegex().length; j++) {
            final Matcher matcher = token.getRegex()[j].matcher(text);
            // appendReplacement/appendTail require a StringBuffer here
            final StringBuffer result = new StringBuffer();
            while (matcher.find()) {
                try {
                    if (!token.setText(matcher.group(), j)) {
                        // not usable for this token: leave the match as it is
                        continue;
                    }
                    final String markup = token.getMarkup();
                    final String replacement = (markup == null) ? matcher.group() : markup;
                    matcher.appendReplacement(result, Matcher.quoteReplacement(replacement));
                } catch (final wikiParserException e) {
                    matcher.appendReplacement(result, Matcher.quoteReplacement(token.getText()));
                }
            }
            text = matcher.appendTail(result).toString();
        }
    }
    return text.replaceAll("----", "<hr />");
}
/**
 * Inserts "&lt;br /&gt;" in front of the line feeds of the given text.
 * No break is inserted behind a line that ends with one of the collected
 * block element names followed by "&gt;". The last line never gets a break
 * or a line feed appended.
 * <p>
 * Note: the text is cut into lines with <code>String.split("\n")</code>,
 * which drops trailing empty strings, so line feeds at the very end of the
 * text are not reproduced in the result.
 *
 * @param text the text to process
 * @return the text with line breaks inserted
 */
private String replaceBRs(final String text) {
    final String[] lines = text.split("\n");
    final StringBuilder out = new StringBuilder(text.length());
    for (int i = 0; i < lines.length; i++) {
        final String line = lines[i];

        // does the line end with a block element tag?
        boolean closesBlockElement = false;
        for (final String element : BEs) {
            if (line.endsWith(element + ">")) {
                closesBlockElement = true;
                break;
            }
        }

        out.append(line);
        if (i + 1 < lines.length) {
            if (!closesBlockElement) {
                out.append("<br />");
            }
            out.append("\n");
        }
    }
    return out.toString();
}
/**
 * One segment of a text that has been cut at a pair of escape markers.
 * {@link #split2Texts(String, String, String)} yields the segments in
 * alternating order: even indexes lie outside of the markers, odd indexes
 * inside.
 */
private static class Text {

    /**
     * Placeholder that {@link #getText()} puts in front of the content and
     * that {@link #setText(String)} strips again when the segment carries
     * the "new line before" flag.
     */
    public static final String escapeNewLine = "@";

    private String text;
    private final boolean nl;

    /**
     * @param text          content of the segment
     * @param escaped       not evaluated; kept for compatibility of the signature
     * @param newLineBefore whether the segment is to be treated as following a
     *                      line start (see {@link #getText()})
     */
    public Text(final String text, final boolean escaped, final boolean newLineBefore) {
        this.text = text;
        this.nl = newLineBefore;
    }

    /**
     * Stores new content. If the "new line before" flag is set, the leading
     * characters that correspond to the placeholder of {@link #getText()}
     * are cut off first.
     *
     * @param text the new content, counterpart of what {@link #getText()} delivered
     * @return the content as stored
     */
    public String setText(final String text) {
        if (this.nl) {
            this.text = text.substring(escapeNewLine.length());
        } else {
            this.text = text;
        }
        return this.text;
    }

    /** @return the stored content without any placeholder */
    public String getTextPlain() { return this.text; }

    /**
     * @return the content, prefixed with {@link #escapeNewLine} if the
     *         "new line before" flag is set
     */
    public String getText() {
        if (this.nl) {
            return escapeNewLine + this.text;
        }
        return this.text;
    }

    @Override
    public String toString() { return this.text; }

    /**
     * Cuts a text into alternating unescaped / escaped segments.
     * The markers of a terminated escape section are not part of any
     * segment. An escape section that is never closed is returned as last
     * segment including its begin marker. If the text starts with the begin
     * marker, an empty segment is put in front so that unescaped segments
     * always sit at even indexes.
     * <p>
     * The end marker is searched behind the begin marker (and vice versa),
     * so overlapping markers such as "[=]" neither end an escape section
     * nor lead to an invalid substring range, and a begin marker at the very
     * start of the text is removed like any other.
     *
     * @param text        the text to split; <code>null</code> yields <code>null</code>
     * @param escapeBegin marker that opens an escape section
     * @param escapeEnd   marker that closes an escape section
     * @return the segments; a text shorter than two characters is returned
     *         as one single segment
     */
    static Text[] split2Texts(final String text, final String escapeBegin, final String escapeEnd) {
        if (text == null) return null;
        if (text.length() < 2) return new Text[] { new Text(text, false, true) };

        final int startLen = escapeBegin.length();
        final int endLen = escapeEnd.length();
        final ArrayList<Text> r = new ArrayList<Text>();

        boolean escaped = text.startsWith(escapeBegin);
        if (escaped) r.add(new Text("", false, true)); // keeps unescaped segments on even indexes

        // index of the first character behind the marker that opened the current segment
        int contentStart = (escaped) ? startLen : 0;
        int marker;
        while ((marker = text.indexOf((escaped) ? escapeEnd : escapeBegin, contentStart)) > -1) {
            r.add(resolve2Text(text, escaped, contentStart, marker, escapeEnd));
            contentStart = marker + ((escaped) ? endLen : startLen);
            escaped = !escaped;
        }

        // remainder; an unterminated escape section keeps its begin marker
        final int tailStart = (escaped) ? contentStart - startLen : contentStart;
        r.add(resolve2Text(text, escaped, tailStart, -1, escapeEnd));
        return r.toArray(new Text[r.size()]);
    }

    /**
     * Creates the segment <code>text[from, to)</code>. The segment gets the
     * "new line before" flag if it starts within the first
     * <code>escapeEnd.length() + 1</code> characters of the text, or if it is
     * unescaped and the character directly in front of the preceding end
     * marker is a line feed.
     *
     * @param to end index (exclusive); -1 stands for the end of the text
     */
    private static Text resolve2Text(final String text, final boolean escaped, final int from, int to, final String escapeEnd) {
        if (to == -1) to = text.length();
        return new Text(
                text.substring(from, to),
                escaped,
                from < escapeEnd.length() + 2 || (!escaped && text.charAt(from - escapeEnd.length() - 1) == '\n'));
    }

    /**
     * Concatenates the plain content of all segments.
     *
     * @param texts the segments in their original order
     * @return the joined text
     */
    static String mergeTexts(final Text[] texts) {
        final StringBuilder sb = new StringBuilder(2000);
        for (int n = 0; n < texts.length; n++) {
            sb.append(texts[n].getTextPlain());
        }
        return sb.toString();
    }
}
}