diff --git a/htroot/Bookmarks.java b/htroot/Bookmarks.java index b99874aad..7e5ceb266 100644 --- a/htroot/Bookmarks.java +++ b/htroot/Bookmarks.java @@ -256,7 +256,7 @@ public class Bookmarks { while(count") + 7, line.toLowerCase().indexOf( "")); // de-replace html entities - title = wikiCode.deReplaceHTML(title); + title = htmlTools.deReplaceHTML(title); prop.put("title", title); } catch (IndexOutOfBoundsException e) { } diff --git a/htroot/yacy/list.java b/htroot/yacy/list.java index 6ad7858ff..99e4955cd 100644 --- a/htroot/yacy/list.java +++ b/htroot/yacy/list.java @@ -53,8 +53,8 @@ import java.io.File; import de.anomic.data.URLFetcherStack; +import de.anomic.data.htmlTools; import de.anomic.data.listManager; -import de.anomic.data.wikiCode; import de.anomic.http.httpHeader; import de.anomic.net.URL; import de.anomic.plasma.plasmaSwitchboard; @@ -122,7 +122,7 @@ public final class list { int cnt = 0; for (int i=0; i + Copyright (C) + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +Also add information on how to contact you by electronic and paper mail. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the library, if +necessary. 
Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the + library `Frob' (a library for tweaking knobs) written by James Random Hacker. + + , 1 April 1990 + Ty Coon, President of Vice + +That's all there is to it! + + diff --git a/source/de/anomic/data/Diff.java b/source/de/anomic/data/Diff.java index 67d44a7c4..a55255c15 100644 --- a/source/de/anomic/data/Diff.java +++ b/source/de/anomic/data/Diff.java @@ -272,7 +272,7 @@ public class Diff { case Diff.Part.ADDED: sb.append("added"); break; case Diff.Part.DELETED: sb.append("deleted"); break; } - sb.append("\">").append(wikiCode.replaceXMLEntities(ps[j].getString()).replaceAll("\n", "
")); + sb.append("\">").append(htmlTools.replaceXMLEntities(ps[j].getString()).replaceAll("\n", "
")); sb.append(""); } sb.append("

"); diff --git a/source/de/anomic/data/htmlTools.java b/source/de/anomic/data/htmlTools.java new file mode 100644 index 000000000..4249fa82a --- /dev/null +++ b/source/de/anomic/data/htmlTools.java @@ -0,0 +1,232 @@ +package de.anomic.data; + +public class htmlTools { + + /** Replaces special characters from a string. Avoids XSS attacks and ensures correct display of + * special characters in non UTF-8 capable browsers. + * @param text a string that possibly contains HTML + * @return the string with all special characters encoded + */ + //[MN] + public static String replaceHTML(String text) { + text = replace(text, xmlentities); + text = replace(text, htmlentities); + return text; + } + + /** Replaces special characters from a string. Ensures correct display of + * special characters in non UTF-8 capable browsers. + * @param text a string that possibly contains special characters + * @return the string with all special characters encoded + */ + //[MN] + public static String replaceHTMLEntities(String text) { + text = replace(text, htmlentities); + return text; + } + + /** Replaces special characters from a string. Avoids XSS attacks. + * @param text a string that possibly contains HTML + * @return the string without any HTML-tags that can be used for XSS + */ + //[MN] + public static String replaceXMLEntities(String text) { + text = replace(text, xmlentities); + return text; + } + + /** Replaces characters in a string with other characters defined in an array. 
+     * <p>The array is read as consecutive pairs: the string at an even index is searched for and
+     * the string at the following odd index is inserted in its place. Pairs are applied in array
+     * order. Scanning resumes behind each inserted replacement, so text that was just inserted is
+     * never searched again for the same pattern.
+     * @param text a string that possibly contains special characters, may be <code>null</code>
+     * @param entities array that contains characters to be replaced and characters it will be replaced by;
+     *                 empty patterns and a trailing element without partner are ignored
+     * @return the string with all characters replaced by the corresponding character from array,
+     *         or <code>null</code> if <code>text</code> was <code>null</code>
+     */
+    //[FB], changes by [MN]
+    public static String replace(String text, String[] entities) {
+        if (text == null) { return null; }
+        // x + 1 < length: never read the partner of a dangling last element
+        for (int x = 0; x + 1 < entities.length; x += 2) {
+            text = substitute(text, entities[x], entities[x + 1]);
+        }
+        return text;
+    }
+
+    /** Reverses {@link #replaceHTML(String)}: first the HTML entities are turned back into
+     * characters, then the XML entities.
+     * @param text a string that possibly contains entities, may be <code>null</code>
+     * @return the string with all known entities decoded, or <code>null</code>
+     */
+    public static String deReplaceHTML(String text) {
+        text = deReplaceHTMLEntities(text);
+        text = deReplaceXMLEntities(text);
+        return text;
+    }
+
+    /** Reverses {@link #replaceHTMLEntities(String)} using the <code>htmlentities</code> table. */
+    public static String deReplaceHTMLEntities(String text) {
+        return deReplace(text, htmlentities);
+    }
+
+    /** Reverses {@link #replaceXMLEntities(String)} using the <code>xmlentities</code> table. */
+    public static String deReplaceXMLEntities(String text) {
+        return deReplace(text, xmlentities);
+    }
+
+    /** Reverses {@link #replace(String, String[])}: walks the pairs of the array from the last to
+     * the first and replaces every occurrence of the odd-index string by the even-index string
+     * in front of it. Because the order is reversed, the pair that was applied first when
+     * encoding is undone last.
+     * <p>Scanning resumes behind each inserted string. Restarting at the beginning of the text
+     * instead would decode already decoded text a second time and would never terminate
+     * if an inserted string contained its own pattern.
+     * @param text a string that possibly contains encoded characters, may be <code>null</code>
+     * @param entities array of pairs as used by {@link #replace(String, String[])}
+     * @return the decoded string, or <code>null</code> if <code>text</code> was <code>null</code>
+     */
+    public static String deReplace(String text, String[] entities) {
+        if (text == null) return null;
+        // (length & ~1) - 1 is the odd index of the last complete pair
+        for (int i = (entities.length & ~1) - 1; i > 0; i -= 2) {
+            text = substitute(text, entities[i], entities[i - 1]);
+        }
+        return text;
+    }
+
+    /** Replaces every occurrence of <code>pattern</code> in <code>text</code> by
+     * <code>replacement</code>, scanning from left to right and resuming behind each occurrence. */
+    private static String substitute(String text, String pattern, String replacement) {
+        if (pattern.length() == 0) { return text; } // an empty pattern matches everywhere and would never finish
+        int p = text.indexOf(pattern);
+        if (p < 0) { return text; }                 // nothing to do, avoid copying
+        StringBuffer out = new StringBuffer(text.length() + replacement.length());
+        int last = 0;
+        while (p >= 0) {
+            out.append(text.substring(last, p)).append(replacement);
+            last = p + pattern.length();
+            p = text.indexOf(pattern, last);
+        }
+        out.append(text.substring(last));
+        return out.toString();
+    }
+
+    //This array contains codes (see http://mindprod.com/jgloss/unicode.html for details)
+    //that will be replaced. To add new codes or patterns, just put them at the end
+    //of the list. Codes or patterns in this list can not be escaped with [= or <pre>
+    public static final String[] xmlentities={
+        // Pairs of (raw character, entity it is replaced by).
+        // Ampersands _have_ to be replaced first. If they were replaced later,
+        // other replaced characters containing ampersands would get messed up.
+        "\u0026","&amp;",      //ampersand
+        "\"","&quot;",         //quotation mark
+        "\u003C","&lt;",       //less than
+        "\u003E","&gt;",       //greater than
+    };
+
+    //This array contains codes (see http://mindprod.com/jgloss/unicode.html for details) and
+    //patterns that will be replaced. To add new codes or patterns, just put them at the end
+    //of the list. Codes or patterns in this list can not be escaped with [= or <pre>
+    public static final String[] htmlentities={
+        "\u005E","&#094;",  // Caret
+
+        "\u0060","&#096;",  // Accent Grave `
+        "\u007B","&#123;",  // {
+        "\u007C","&#124;",  // |
+        "\u007D","&#125;",  // }
+        "\u007E","&#126;",  // ~
+
+        "\u0082","&#130;",
+        "\u0083","&#131;",
+        "\u0084","&#132;",
+        "\u0085","&#133;",
+        "\u0086","&#134;",
+        "\u0087","&#135;",
+        "\u0088","&#136;",
+        "\u0089","&#137;",
+        "\u008A","&#138;",
+        "\u008B","&#139;",
+        "\u008C","&#140;",
+        "\u008D","&#141;",
+        "\u008E","&#142;",
+
+        "\u0091","&#145;",
+        "\u0092","&#146;",
+        "\u0093","&#147;",
+        "\u0094","&#148;",
+        "\u0095","&#149;",
+        "\u0096","&#150;",
+        "\u0097","&#151;",
+        "\u0098","&#152;",
+        "\u0099","&#153;",
+        "\u009A","&#154;",
+        "\u009B","&#155;",
+        "\u009C","&#156;",
+        "\u009D","&#157;",
+        "\u009E","&#158;",
+        "\u009F","&#159;",
+
+        "\u00A1","&iexcl;",    //inverted (spanish) exclamation mark
+        "\u00A2","&cent;",     //cent
+        "\u00A3","&pound;",    //pound
+        "\u00A4","&curren;",   //currency
+        "\u00A5","&yen;",      //yen
+        "\u00A6","&brvbar;",   //broken vertical bar
+        "\u00A7","&sect;",     //section sign
+        "\u00A8","&uml;",      //diaeresis (umlaut)
+        "\u00A9","&copy;",     //copyright sign
+        "\u00AA","&ordf;",     //feminine ordinal indicator
+        "\u00AB","&laquo;",    //left-pointing double angle quotation mark
+        "\u00AC","&not;",      //not sign
+        "\u00AD","&shy;",      //soft hyphen
+        "\u00AE","&reg;",      //registered sign
+        "\u00AF","&macr;",     //macron
+        "\u00B0","&deg;",      //degree sign
+        "\u00B1","&plusmn;",   //plus-minus sign
+        "\u00B2","&sup2;",     //superscript two
+        "\u00B3","&sup3;",     //superscript three
+        "\u00B4","&acute;",    //acute accent
+        "\u00B5","&micro;",    //micro sign
+        "\u00B6","&para;",     //paragraph sign
+        "\u00B7","&middot;",   //middle dot
+        "\u00B8","&cedil;",    //cedilla
+        "\u00B9","&sup1;",     //superscript one
+        "\u00BA","&ordm;",     //masculine ordinal indicator
+        "\u00BB","&raquo;",    //right-pointing double angle quotation mark
+        "\u00BC","&frac14;",   //fraction 1/4
+        "\u00BD","&frac12;",   //fraction 1/2
+        "\u00BE","&frac34;",   //fraction 3/4
+        "\u00BF","&iquest;",   //inverted (spanish) question mark
+        "\u00C0","&Agrave;",
+        "\u00C1","&Aacute;",
+        "\u00C2","&Acirc;",
+        "\u00C3","&Atilde;",
+        "\u00C4","&Auml;",
+        "\u00C5","&Aring;",
+        "\u00C6","&AElig;",
+        "\u00C7","&Ccedil;",
+        "\u00C8","&Egrave;",
+        "\u00C9","&Eacute;",
+        "\u00CA","&Ecirc;",
+        "\u00CB","&Euml;",
+        "\u00CC","&Igrave;",
+        "\u00CD","&Iacute;",
+        "\u00CE","&Icirc;",
+        "\u00CF","&Iuml;",
+        "\u00D0","&ETH;",
+        "\u00D1","&Ntilde;",
+        "\u00D2","&Ograve;",
+        "\u00D3","&Oacute;",
+        "\u00D4","&Ocirc;",
+        "\u00D5","&Otilde;",
+        "\u00D6","&Ouml;",
+        "\u00D7","&times;",
+        "\u00D8","&Oslash;",
+        "\u00D9","&Ugrave;",
+        "\u00DA","&Uacute;",
+        "\u00DB","&Ucirc;",
+        "\u00DC","&Uuml;",
+        "\u00DD","&Yacute;",
+        "\u00DE","&THORN;",
+        "\u00DF","&szlig;",
+        "\u00E0","&agrave;",
+        "\u00E1","&aacute;",
+        "\u00E2","&acirc;",
+        "\u00E3","&atilde;",
+        "\u00E4","&auml;",
+        "\u00E5","&aring;",
+        "\u00E6","&aelig;",
+        "\u00E7","&ccedil;",
+        "\u00E8","&egrave;",
+        "\u00E9","&eacute;",
+        "\u00EA","&ecirc;",
+        "\u00EB","&euml;",
+        "\u00EC","&igrave;",
+        "\u00ED","&iacute;",
+        "\u00EE","&icirc;",
+        "\u00EF","&iuml;",
+        "\u00F0","&eth;",
+        "\u00F1","&ntilde;",
+        "\u00F2","&ograve;",
+        "\u00F3","&oacute;",
+        "\u00F4","&ocirc;",
+        "\u00F5","&otilde;",
+        "\u00F6","&ouml;",
+        "\u00F7","&divide;",
+        "\u00F8","&oslash;",
+        "\u00F9","&ugrave;",
+        "\u00FA","&uacute;",
+        "\u00FB","&ucirc;",
+        "\u00FC","&uuml;",
+        "\u00FD","&yacute;",
+        "\u00FE","&thorn;",
+        "\u00FF","&yuml;"
+    };
+}
diff --git a/source/de/anomic/data/wiki/WikiParserException.java b/source/de/anomic/data/wiki/WikiParserException.java
deleted file mode 100644
index ce2769111..000000000
--- a/source/de/anomic/data/wiki/WikiParserException.java
+++ /dev/null
@@ -1,20 +0,0 @@
-package de.anomic.data.wiki;
-
-public class WikiParserException extends RuntimeException {
-    
-    private static final long serialVersionUID = 1L;
-    
-    public WikiParserException() {  }
-    
-    public WikiParserException(String message) {
-        super(message);
-    }
-    
-    public WikiParserException(Throwable cause) {
-        super(cause);
-    }
-    
-    public WikiParserException(String message, Throwable cause) {
-        super(message, cause);
-    }
-}
diff --git a/source/de/anomic/data/wiki/abstractWikiParser.java b/source/de/anomic/data/wiki/abstractWikiParser.java
new file mode 100644
index 000000000..ac038c6a6
--- /dev/null
+++ b/source/de/anomic/data/wiki/abstractWikiParser.java
@@ -0,0 +1,83 @@
+package de.anomic.data.wiki;
+
+import java.io.BufferedReader;
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.StringReader;
+import java.io.UnsupportedEncodingException;
+
+import de.anomic.plasma.plasmaSwitchboard;
+
+/**
+ * Skeleton implementation of {@link wikiParser}: every public <code>transform</code> variant is
+ * mapped onto the single abstract {@link #transform(BufferedReader, int, plasmaSwitchboard)}.
+ */
+public abstract class abstractWikiParser implements wikiParser {
+
+    /** switchboard used by the transform variants that are not given one explicitly */
+    protected plasmaSwitchboard sb;
+
+    /**
+     * @param sb switchboard to fall back on when a transform call does not supply one
+     */
+    public abstractWikiParser(plasmaSwitchboard sb) {
+        this.sb = sb;
+    }
+
+    /**
+     * Produces the transformed text; implemented by the concrete parser.
+     * @param reader supplies the text to transform
+     * @param length size of the input as passed by the public variants: the number of chars of a
+     *               String input or the number of bytes of a byte[] input
+     * @param sb switchboard to use for this call
+     * @return the transformed text
+     * @throws IOException if the implementation fails to read the input
+     */
+    protected abstract String transform(BufferedReader reader, int length, plasmaSwitchboard sb) throws IOException;
+
+    /** Transforms <code>content</code> using the switchboard given to the constructor. */
+    public String transform(String content) {
+        return transform(content, this.sb);
+    }
+
+    /**
+     * Transforms <code>content</code> using the given switchboard.
+     * @return the transformed text, or a text starting with "internal error: " if the parser threw an IOException
+     */
+    public String transform(String content, plasmaSwitchboard sb) {
+        try {
+            return transform(new BufferedReader(new StringReader(content)), content.length(), sb);
+        } catch (IOException e) {
+            return "internal error: " + e.getMessage();
+        }
+    }
+
+    /** Transforms UTF-8 encoded <code>content</code> using the switchboard given to the constructor. */
+    public String transform(byte[] content) throws UnsupportedEncodingException {
+        return transform(content, "UTF-8", this.sb);
+    }
+
+    /** Transforms <code>content</code>, decoded with <code>encoding</code>, using the switchboard given to the constructor. */
+    public String transform(byte[] content, String encoding) throws UnsupportedEncodingException {
+        return transform(content, encoding, this.sb);
+    }
+
+    /**
+     * Transforms <code>content</code>, decoded with <code>encoding</code>, using the given switchboard.
+     * @return the transformed text, or a text starting with "internal error: " if the parser threw an IOException
+     * @throws UnsupportedEncodingException if <code>encoding</code> is not supported
+     */
+    public String transform(byte[] content, String encoding, plasmaSwitchboard switchboard) throws UnsupportedEncodingException {
+        // The reader is built outside of the try block on purpose: UnsupportedEncodingException
+        // is an IOException, so inside the try it would be caught below and turned into an
+        // "internal error" text instead of reaching the caller as declared.
+        BufferedReader reader = new BufferedReader(
+                new InputStreamReader(new ByteArrayInputStream(content), encoding));
+        try {
+            return transform(reader, content.length, switchboard);
+        } catch (IOException e) {
+            return "internal error: " + e.getMessage();
+        }
+    }
+}
diff --git a/source/de/anomic/data/wiki/knwikiParser.java b/source/de/anomic/data/wiki/knwikiParser.java
new file mode 100644
index 000000000..7bbfd66ef
--- /dev/null
+++ b/source/de/anomic/data/wiki/knwikiParser.java
@@ -0,0 +1,285 @@
+// wikiParser.java 
+// ---------
+// part of YaCy
+// (C) by Michael Peter Christen; mc@anomic.de
+// first published on http://www.anomic.de
+// Frankfurt, Germany, 2007
+// Created 22.02.2007
+//
+// This file is contributed by Franz Brauße
+//
+// $LastChangedDate: $
+// $LastChangedRevision: $
+// $LastChangedBy: $
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+//
+// Using this software in any meaning (reading, learning, copying, compiling,
+// running) means that you agree that the Author(s) is (are) not responsible
+// for cost, loss of data or any harm that may be caused directly or indirectly
+// by usage of this softare or this documentation. The usage of this software
+// is on your own risk. The installation and usage (starting/running) of this
+// software may allow other people or application to access your computer and
+// any attached devices and is highly dependent on the configuration of the
+// software which must be done by the user of the software; the author(s) is
+// (are) also not responsible for proper configuration and usage of the
+// software, even if provoked by documentation provided together with
+// the software.
+//
+// Any changes to this file according to the GPL as documented in the file
+// gpl.txt aside this file in the shipment you received can be done to the
+// lines that follows this copyright notice here, but changes must not be
+// done inside the copyright notive above. A re-distribution must contain
+// the intact and unchanged copyright notice.
+// Contributions and changes to the program code must be marked as such.
+
+package de.anomic.data.wiki;
+
+import java.io.UnsupportedEncodingException;
+import java.util.ArrayList;
+import java.util.regex.Matcher;
+
+import de.anomic.data.wiki.tokens.DefinitionListToken;
+import de.anomic.data.wiki.tokens.LinkToken;
+import de.anomic.data.wiki.tokens.ListToken;
+import de.anomic.data.wiki.tokens.SimpleToken;
+import de.anomic.data.wiki.tokens.TableToken;
+import de.anomic.data.wiki.tokens.Token;
+import de.anomic.plasma.plasmaSwitchboard;
+import de.anomic.yacy.yacyCore;
+
+public class knwikiParser implements wikiParser {
+	
+	public final Token[] tokens;
+	private final String[] BEs;
+    
+    public knwikiParser(plasmaSwitchboard sb) {
+        tokens = new Token[] {
+                new SimpleToken('=', '=', new String[][] { null, { "h2" }, { "h3" }, { "h4" } }, true),
+                new SimpleToken('\'', '\'', new String[][] { null, { "i" }, { "b" }, null, { "b", "i" } }, false),
+                new LinkToken(yacyCore.seedDB.mySeed.getPublicAddress(), "Wiki.html?page=", sb),
+                new ListToken('*', "ul"),
+                new ListToken('#', "ol"),
+                new ListToken(':', "blockquote", null),
+                new ListToken(' ', null, "tt", false),
+                new DefinitionListToken(),
+                new TableToken()
+        };
+        ArrayList r = new ArrayList();
+        for (int i=0, k, j; i 1) {
+                        r.add(tokens[i].getBlockElementNames()[j].substring(0, k));
+                    } else {
+                        r.add(tokens[i].getBlockElementNames()[j]);
+                    }
+                }
+        r.add("hr");
+        BEs = (String[])r.toArray(new String[r.size()]);
+    }
+	
+	public static void main(String[] args) {
+		String text = "===T
itle===\n" +
+				"==blubb== was ==ein '''shice'''==...och.bla\n" +
+				"* ein \n" +
+				"*==test=
=\n" + + "** doppelt\n" + + "* ''tess*sst''\n" + + "*** xyz\n" + + "=]*** huch\n" + + "* ehehe***\n" + + "* blubb\n" + + "bliblablo\n\n\n" + + "* blubb\n" + + "{|border=-1\n" + + "|-\n" + + "||bla|| blubb\n" + + "|-\n" + + "||align center|och||huch||\n" + + "|}\n" + + "\n" + + "# bla\n" + + "# blubb\n" + + "'''''ehehehe''''', ne?!\n" + + "[http://www/index.html,ne?!] -\n" + + "[[Image:blubb|BLA]] ---- och\n" + + " blubb1\n" + + " blubb2\n" + + ":doppel-blubb[= huch =]\n" + + ";hier:da\n" + + ";dort:und so\n" + + ";;und:doppelt\n\n\n\n" + + "[[Image:blubb|BLA]]"; + // text = "[=\n=]* bla"; + String t = "[=] ein fucking [= test =]-text[=,ne?!=] joa, [=alles=]wunderbar," + + "[=denk ich=] mal =]"; + long l = System.currentTimeMillis(); + t = new knwikiParser(null).parse((args.length > 0) ? args[0] : text); + System.out.println("parsing time: " + (System.currentTimeMillis() - l) + " ms"); + System.out.println("--- --- ---"); + System.out.println(t); + } + + public String transform(String content) { + return parse(content); + } + + public String transform(String content, plasmaSwitchboard sb) { + return parse(content); + } + + public String transform(byte[] content) throws UnsupportedEncodingException { + return parse(new String(content, "UTF-8")); + } + + public String transform( + byte[] content, String encoding, + plasmaSwitchboard switchboard) throws UnsupportedEncodingException { + return parse(new String(content, encoding)); + } + + public String transform(byte[] content, String encoding) throws UnsupportedEncodingException { + return parse(new String(content, encoding)); + } + + public String parse(String text) { + Text[] tt = Text.split2Texts(text, "[=", "=]"); + for (int i=0; i", "
"); + for (int i=0; i"); + } + + private String replaceBRs(String text) { + StringBuffer sb = new StringBuffer(text.length()); + String[] tt = text.split("\n"); + boolean replace; + for (int i=0, j; i")) { replace = false; break; } + sb.append(tt[i]); + if (i < tt.length - 1) { + if (replace) sb.append("
"); + sb.append("\n"); + } + } + return new String(sb); + } + + private static class Text { + + public static final String escapeNewLine = "@"; + + private String text; + private final boolean escaped; + private final boolean nl; + + public Text(String text, boolean escaped, boolean newLineBefore) { + this.text = text; + this.escaped = escaped; + this.nl = newLineBefore; + } + + public String setTextPlain(String text) { return this.text = text; } + public String setText(String text) { + if (this.nl) + this.text = text.substring(escapeNewLine.length()); + else + this.text = text; + return this.text; + } + + public String getTextPlain() { return this.text; } + public String getText() { + if (this.nl) + return escapeNewLine + this.text; + else + return this.text; + } + + public String toString() { return this.text; } + public boolean isEscaped() { return this.escaped; } + public boolean isNewLineBefore() { return this.nl; } + + private static Text[] split2Texts(String text, String escapeBegin, String escapeEnd) { + if (text == null) return null; + if (text.length() < 2) return new Text[] { new Text(text, false, true) }; + + int startLen = escapeBegin.length(); + int endLen = escapeEnd.length(); + ArrayList r = new ArrayList(); + boolean escaped = text.startsWith(escapeBegin); + if (escaped) r.add(new Text("", false, true)); + int i, j = 0; + while ((i = text.indexOf((escaped) ? escapeEnd : escapeBegin, j)) > -1) { + r.add(resolve2Text(text, escaped, (j > 0) ? j + ((escaped) ? startLen : endLen) : 0, i, escapeEnd)); + j = i; + escaped = !escaped; + } + r.add(resolve2Text(text, escaped, (escaped) ? j : (j > 0) ? 
j + endLen : 0, -1, escapeEnd)); + return (Text[])r.toArray(new Text[r.size()]); + } + + private static Text resolve2Text(String text, boolean escaped, int from, int to, String escapeEnd) { + if (to == -1) to = text.length(); + return new Text( + text.substring(from, to), + escaped, + from < escapeEnd.length() + 2 || (!escaped && text.charAt(from - escapeEnd.length() - 1) == '\n')); + } + + private static String mergeTexts(Text[] texts) { + StringBuffer sb = new StringBuffer(); + for (int n=0; n < texts.length; n++) + sb.append(texts[n].getTextPlain()); + return new String(sb); + } + } +} diff --git a/source/de/anomic/data/wiki/tokens/AbstractToken.java b/source/de/anomic/data/wiki/tokens/AbstractToken.java index dbffa1b6a..231afecb3 100644 --- a/source/de/anomic/data/wiki/tokens/AbstractToken.java +++ b/source/de/anomic/data/wiki/tokens/AbstractToken.java @@ -47,15 +47,17 @@ package de.anomic.data.wiki.tokens; +import de.anomic.data.wiki.wikiParserException; + public abstract class AbstractToken implements Token { protected String text = null; protected String markup = null; protected boolean parsed = false; - protected abstract void parse(); + protected abstract void parse() throws wikiParserException; - public String getMarkup() { + public String getMarkup() throws wikiParserException { if (this.text == null) throw new IllegalArgumentException(); if (!this.parsed) parse(); @@ -64,5 +66,5 @@ public abstract class AbstractToken implements Token { public String getText() { return this.text; } - public String toString() { return getMarkup(); } + public String toString() { try { return getMarkup(); } catch (wikiParserException e) { return null; } } } diff --git a/source/de/anomic/data/wiki/tokens/LinkToken.java b/source/de/anomic/data/wiki/tokens/LinkToken.java index 74e6aa84a..3e27b1bff 100644 --- a/source/de/anomic/data/wiki/tokens/LinkToken.java +++ b/source/de/anomic/data/wiki/tokens/LinkToken.java @@ -55,7 +55,7 @@ import java.util.regex.Pattern; import 
de.anomic.data.bookmarksDB; import de.anomic.data.bookmarksDB.Bookmark; import de.anomic.data.bookmarksDB.Tag; -import de.anomic.data.wiki.WikiParserException; +import de.anomic.data.wiki.wikiParserException; import de.anomic.plasma.plasmaSwitchboard; public class LinkToken extends AbstractToken { @@ -106,13 +106,13 @@ public class LinkToken extends AbstractToken { this.sb = sb; } - protected void parse() { + protected void parse() throws wikiParserException { StringBuffer sb = new StringBuffer(); if (this.patternNr < 0 || this.patternNr >= patterns.length) - throw new WikiParserException("patternNr was not set correctly: " + this.patternNr); + throw new wikiParserException("patternNr was not set correctly: " + this.patternNr); Matcher m = patterns[this.patternNr].matcher(this.text); if (!m.find()) - throw new WikiParserException("Didn't find match for: (" + this.patternNr + ") " + this.text); + throw new wikiParserException("Didn't find match for: (" + this.patternNr + ") " + this.text); switch (this.patternNr) { case IMG: diff --git a/source/de/anomic/data/wiki/tokens/SimpleToken.java b/source/de/anomic/data/wiki/tokens/SimpleToken.java index 2290b9d02..ac701c429 100644 --- a/source/de/anomic/data/wiki/tokens/SimpleToken.java +++ b/source/de/anomic/data/wiki/tokens/SimpleToken.java @@ -51,7 +51,7 @@ import java.util.ArrayList; import java.util.regex.Matcher; import java.util.regex.Pattern; -import de.anomic.data.wiki.WikiParserException; +import de.anomic.data.wiki.wikiParserException; public class SimpleToken extends AbstractToken { @@ -88,7 +88,7 @@ public class SimpleToken extends AbstractToken { "([\\" + lastChar + "]{" + i + "," + definitionList.length + "})")}; } - public String getMarkup() { + public String getMarkup() throws wikiParserException { if (this.content == null) { if (this.text == null) { throw new IllegalArgumentException(); @@ -96,14 +96,14 @@ public class SimpleToken extends AbstractToken { setText(this.text, 0); } } - if (!this.parsed) try { 
parse(); } catch (WikiParserException e) { return this.text; } + if (!this.parsed) parse(); return this.markup; } - protected void parse() { + protected void parse() throws wikiParserException { String[] e; if (this.grade >= this.definitionList.length || (e = this.definitionList[this.grade]) == null) - throw new WikiParserException("Token not defined for grade: " + this.grade); + throw new wikiParserException("Token not defined for grade: " + this.grade); this.markup = getMarkup(e); this.parsed = true; } diff --git a/source/de/anomic/data/wiki/tokens/Token.java b/source/de/anomic/data/wiki/tokens/Token.java index 18393db1c..0d5675e9c 100644 --- a/source/de/anomic/data/wiki/tokens/Token.java +++ b/source/de/anomic/data/wiki/tokens/Token.java @@ -49,11 +49,13 @@ package de.anomic.data.wiki.tokens; import java.util.regex.Pattern; +import de.anomic.data.wiki.wikiParserException; + public interface Token { public Pattern[] getRegex(); public boolean setText(String text, int patternNr); public String getText(); - public String getMarkup(); + public String getMarkup() throws wikiParserException; public String[] getBlockElementNames(); } diff --git a/source/de/anomic/data/wiki/wikiParser.java b/source/de/anomic/data/wiki/wikiParser.java index 3165b07bb..dc2d82038 100644 --- a/source/de/anomic/data/wiki/wikiParser.java +++ b/source/de/anomic/data/wiki/wikiParser.java @@ -1,260 +1,14 @@ -// wikiParser.java -// --------- -// part of YaCy -// (C) by Michael Peter Christen; mc@anomic.de -// first published on http://www.anomic.de -// Frankfurt, Germany, 2007 -// Created 22.02.2007 -// -// This file is contributed by Franz Brauße -// -// $LastChangedDate: $ -// $LastChangedRevision: $ -// $LastChangedBy: $ -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. 
-// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -// -// Using this software in any meaning (reading, learning, copying, compiling, -// running) means that you agree that the Author(s) is (are) not responsible -// for cost, loss of data or any harm that may be caused directly or indirectly -// by usage of this softare or this documentation. The usage of this software -// is on your own risk. The installation and usage (starting/running) of this -// software may allow other people or application to access your computer and -// any attached devices and is highly dependent on the configuration of the -// software which must be done by the user of the software; the author(s) is -// (are) also not responsible for proper configuration and usage of the -// software, even if provoked by documentation provided together with -// the software. -// -// Any changes to this file according to the GPL as documented in the file -// gpl.txt aside this file in the shipment you received can be done to the -// lines that follows this copyright notice here, but changes must not be -// done inside the copyright notive above. A re-distribution must contain -// the intact and unchanged copyright notice. -// Contributions and changes to the program code must be marked as such. 
- package de.anomic.data.wiki; -import java.util.ArrayList; -import java.util.regex.Matcher; +import java.io.UnsupportedEncodingException; -import de.anomic.data.wiki.tokens.DefinitionListToken; -import de.anomic.data.wiki.tokens.LinkToken; -import de.anomic.data.wiki.tokens.ListToken; -import de.anomic.data.wiki.tokens.SimpleToken; -import de.anomic.data.wiki.tokens.TableToken; -import de.anomic.data.wiki.tokens.Token; import de.anomic.plasma.plasmaSwitchboard; -import de.anomic.yacy.yacyCore; -public class wikiParser { - - public final Token[] tokens; - private final String[] BEs; +public interface wikiParser { - public wikiParser(plasmaSwitchboard sb) { - tokens = new Token[] { - new SimpleToken('=', '=', new String[][] { null, { "h2" }, { "h3" }, { "h4" } }, true), - new SimpleToken('\'', '\'', new String[][] { null, { "i" }, { "b" }, null, { "b", "i" } }, false), - new LinkToken(yacyCore.seedDB.mySeed.getPublicAddress(), "Wiki.html?page=", sb), - new ListToken('*', "ul"), - new ListToken('#', "ol"), - new ListToken(':', "blockquote", null), - new ListToken(' ', null, "tt", false), - new DefinitionListToken(), - new TableToken() - }; - ArrayList r = new ArrayList(); - for (int i=0, k, j; i 1) { - r.add(tokens[i].getBlockElementNames()[j].substring(0, k)); - } else { - r.add(tokens[i].getBlockElementNames()[j]); - } - } - r.add("hr"); - BEs = (String[])r.toArray(new String[r.size()]); - } - - public static void main(String[] args) { - String text = "===T
itle===\n" +
-				"==blubb== was ==ein '''shice'''==...och.bla\n" +
-				"* ein \n" +
-				"*==test=
=\n" + - "** doppelt\n" + - "* ''tess*sst''\n" + - "*** xyz\n" + - "=]*** huch\n" + - "* ehehe***\n" + - "* blubb\n" + - "bliblablo\n\n\n" + - "* blubb\n" + - "{|border=-1\n" + - "|-\n" + - "||bla|| blubb\n" + - "|-\n" + - "||align center|och||huch||\n" + - "|}\n" + - "\n" + - "# bla\n" + - "# blubb\n" + - "'''''ehehehe''''', ne?!\n" + - "[http://www/index.html,ne?!] -\n" + - "[[Image:blubb|BLA]] ---- och\n" + - " blubb1\n" + - " blubb2\n" + - ":doppel-blubb[= huch =]\n" + - ";hier:da\n" + - ";dort:und so\n" + - ";;und:doppelt\n\n\n\n" + - "[[Image:blubb|BLA]]"; - // text = "[=\n=]* bla"; - String t = "[=] ein fucking [= test =]-text[=,ne?!=] joa, [=alles=]wunderbar," + - "[=denk ich=] mal =]"; - long l = System.currentTimeMillis(); - t = new wikiParser(null).parse((args.length > 0) ? args[0] : text); - System.out.println("parsing time: " + (System.currentTimeMillis() - l) + " ms"); - System.out.println("--- --- ---"); - System.out.println(t); - } - - public String parse(String text) { - Text[] tt = Text.split2Texts(text, "[=", "=]"); - for (int i=0; i", "
"); - for (int i=0; i"); - } - - private String replaceBRs(String text) { - StringBuffer sb = new StringBuffer(text.length()); - String[] tt = text.split("\n"); - boolean replace; - for (int i=0, j; i")) { replace = false; break; } - sb.append(tt[i]); - if (i < tt.length - 1) { - if (replace) sb.append("
"); - sb.append("\n"); - } - } - return new String(sb); - } - - private static class Text { - - public static final String escapeNewLine = "@"; - - private String text; - private final boolean escaped; - private final boolean nl; - - public Text(String text, boolean escaped, boolean newLineBefore) { - this.text = text; - this.escaped = escaped; - this.nl = newLineBefore; - } - - public String setTextPlain(String text) { return this.text = text; } - public String setText(String text) { - if (this.nl) - this.text = text.substring(escapeNewLine.length()); - else - this.text = text; - return this.text; - } - - public String getTextPlain() { return this.text; } - public String getText() { - if (this.nl) - return escapeNewLine + this.text; - else - return this.text; - } - - public String toString() { return this.text; } - public boolean isEscaped() { return this.escaped; } - public boolean isNewLineBefore() { return this.nl; } - - private static Text[] split2Texts(String text, String escapeBegin, String escapeEnd) { - if (text == null) return null; - if (text.length() < 2) return new Text[] { new Text(text, false, true) }; - - int startLen = escapeBegin.length(); - int endLen = escapeEnd.length(); - ArrayList r = new ArrayList(); - boolean escaped = text.startsWith(escapeBegin); - if (escaped) r.add(new Text("", false, true)); - int i, j = 0; - while ((i = text.indexOf((escaped) ? escapeEnd : escapeBegin, j)) > -1) { - r.add(resolve2Text(text, escaped, (j > 0) ? j + ((escaped) ? startLen : endLen) : 0, i, escapeEnd)); - j = i; - escaped = !escaped; - } - r.add(resolve2Text(text, escaped, (escaped) ? j : (j > 0) ? 
j + endLen : 0, -1, escapeEnd)); - return (Text[])r.toArray(new Text[r.size()]); - } - - private static Text resolve2Text(String text, boolean escaped, int from, int to, String escapeEnd) { - if (to == -1) to = text.length(); - return new Text( - text.substring(from, to), - escaped, - from < escapeEnd.length() + 2 || (!escaped && text.charAt(from - escapeEnd.length() - 1) == '\n')); - } - - private static String mergeTexts(Text[] texts) { - StringBuffer sb = new StringBuffer(); - for (int n=0; n < texts.length; n++) - sb.append(texts[n].getTextPlain()); - return new String(sb); - } - } + public String transform(String text); + public String transform(String text, plasmaSwitchboard switchboard); + public String transform(byte[] text) throws UnsupportedEncodingException; + public String transform(byte[] text, String encoding) throws UnsupportedEncodingException; + public String transform(byte[] text, String encoding, plasmaSwitchboard switchboard) throws UnsupportedEncodingException; } diff --git a/source/de/anomic/data/wiki/wikiParserException.java b/source/de/anomic/data/wiki/wikiParserException.java new file mode 100644 index 000000000..a6c9c560c --- /dev/null +++ b/source/de/anomic/data/wiki/wikiParserException.java @@ -0,0 +1,20 @@ +package de.anomic.data.wiki; + +public class wikiParserException extends Exception { + + private static final long serialVersionUID = 1L; + + public wikiParserException() { } + + public wikiParserException(String message) { + super(message); + } + + public wikiParserException(Throwable cause) { + super(cause); + } + + public wikiParserException(String message, Throwable cause) { + super(message, cause); + } +} diff --git a/source/de/anomic/data/wikiCode.java b/source/de/anomic/data/wikiCode.java index 7e16600eb..24ea8acb2 100644 --- a/source/de/anomic/data/wikiCode.java +++ b/source/de/anomic/data/wikiCode.java @@ -47,12 +47,11 @@ package de.anomic.data; import java.io.BufferedReader; -import java.io.ByteArrayInputStream; import 
java.io.IOException; -import java.io.InputStreamReader; -import java.io.UnsupportedEncodingException; import java.util.ArrayList; +import de.anomic.data.wiki.abstractWikiParser; +import de.anomic.data.wiki.wikiParser; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.server.serverCore; import de.anomic.yacy.yacyCore; @@ -60,11 +59,10 @@ import de.anomic.yacy.yacyCore; /** This class provides methods to handle texts that have been posted in the yacyWiki or other * parts of YaCy that use this class, like the blog or the profile. */ -public class wikiCode { +public class wikiCode extends abstractWikiParser implements wikiParser { private String numListLevel=""; private String ListLevel=""; private String defListLevel=""; - private plasmaSwitchboard sb; private boolean cellprocessing=false; //needed for prevention of double-execution of replaceHTML private boolean defList = false; //needed for definition lists private boolean escape = false; //needed for escape @@ -84,270 +82,19 @@ public class wikiCode { /** Constructor of the class wikiCode */ public wikiCode(plasmaSwitchboard switchboard){ - sb=switchboard; - } - - public String transform(String content){ - try { - return transform(content.getBytes("UTF-8"), sb); - } catch (UnsupportedEncodingException e) { - return transform(content.getBytes(), sb); - } - } - - public String transform(byte[] content){ - return transform(content, sb); - } - - public String transform(byte[] content, plasmaSwitchboard switchboard) { - ByteArrayInputStream bais = new ByteArrayInputStream(content); - try { - BufferedReader br = new BufferedReader(new InputStreamReader(bais, - "UTF-8")); - String line; - StringBuffer out = new StringBuffer(content.length); - try { - while ((line = br.readLine()) != null) { - out.append(transformLine(line, switchboard)).append( - serverCore.crlfString); - } - return directory() + new String(out); - } catch (UnsupportedEncodingException e1) { - // can not happen - return null; - } - } catch 
(IOException e) { - return "internal error: " + e.getMessage(); - } - } - - /** Replaces special characters from a string. Avoids XSS attacks and ensures correct display of - * special characters in non UTF-8 capable browsers. - * @param text a string that possibly contains HTML - * @return the string with all special characters encoded - */ - //[MN] - public static String replaceHTML(String text) { - text = replace(text, xmlentities); - text = replace(text, htmlentities); - return text; - } - - /** Replaces special characters from a string. Ensures correct display of - * special characters in non UTF-8 capable browsers. - * @param text a string that possibly contains special characters - * @return the string with all special characters encoded - */ - //[MN] - public static String replaceHTMLEntities(String text) { - text = replace(text, htmlentities); - return text; - } - - /** Replaces special characters from a string. Avoids XSS attacks. - * @param text a string that possibly contains HTML - * @return the string without any HTML-tags that can be used for XSS - */ - //[MN] - public static String replaceXMLEntities(String text) { - text = replace(text, xmlentities); - return text; - } - - /** Replaces characters in a string with other characters defined in an array. 
- * @param text a string that possibly contains special characters - * @param entities array that contains characters to be replaced and characters it will be replaced by - * @return the string with all characters replaced by the corresponding character from array - */ - //[FB], changes by [MN] - public static String replace(String text, String[] entities) { - if (text==null) { return null; } - for (int x=0;x<=entities.length-1;x=x+2) { - int p=0; - while ((p=text.indexOf(entities[x],p))>=0) { - text=text.substring(0,p)+entities[x+1]+text.substring(p+entities[x].length()); - p+=entities[x+1].length(); - } - } - return text; + super(switchboard); } - public static String deReplaceHTML(String text) { - text = deReplaceHTMLEntities(text); - text = deReplaceXMLEntities(text); - return text; + protected String transform( + BufferedReader reader, + int length, + plasmaSwitchboard switchboard) throws IOException { + StringBuffer out = new StringBuffer(length); + String line; + while ((line = reader.readLine()) != null) + out.append(transformLine(line, switchboard)).append(serverCore.crlfString); + return out.insert(0, directory()).toString(); } - - public static String deReplaceHTMLEntities(String text) { - return deReplace(text, htmlentities); - } - - public static String deReplaceXMLEntities(String text) { - return deReplace(text, xmlentities); - } - - public static String deReplace(String text, String[] entities) { - if (text == null) return null; - for (int i=entities.length-1; i>0; i-=2) { - int p = 0; - while ((p = text.indexOf(entities[i])) >= 0) { - text = text.substring(0, p) + entities[i - 1] + text.substring(p + entities[i].length()); - p += entities[i - 1].length(); - } - } - return text; - } - - //This array contains codes (see http://mindprod.com/jgloss/unicode.html for details) - //that will be replaced. To add new codes or patterns, just put them at the end - //of the list. Codes or patterns in this list can not be escaped with [= or
-    public static String[] xmlentities={
-        // Ampersands _have_ to be replaced first. If they were replaced later,
-        // other replaced characters containing ampersands would get messed up.
-        "\u0026","&",      //ampersand
-        "\"",""",         //quotation mark
-        "\u003C","<",       //less than
-        "\u003E",">",       //greater than
-    };
-
-    //This array contains codes (see http://mindprod.com/jgloss/unicode.html for details) and
-    //patterns that will be replaced. To add new codes or patterns, just put them at the end
-    //of the list. Codes or patterns in this list can not be escaped with [= or 
-    public static String[] htmlentities={
-        "\u005E","^",  // Caret
-
-        "\u0060","`",  // Accent Grave `
-        "\u007B","{",  // {
-        "\u007C","|",  // |
-        "\u007D","}",  // }
-        "\u007E","~",  // ~
-
-        "\u0082","‚",
-        "\u0083","ƒ",
-        "\u0084","„",
-        "\u0085","…",
-        "\u0086","†",
-        "\u0087","‡",
-        "\u0088","ˆ",
-        "\u0089","‰",
-        "\u008A","Š",
-        "\u008B","‹",
-        "\u008C","Œ",
-        "\u008D","",
-        "\u008E","Ž",
-
-        "\u0091","‘",
-        "\u0092","’",
-        "\u0093","“",
-        "\u0094","”",
-        "\u0095","•",
-        "\u0096","–",
-        "\u0097","—",
-        "\u0098","˜",
-        "\u0099","™",
-        "\u009A","š",
-        "\u009B","›",
-        "\u009C","œ",
-        "\u009D","",
-        "\u009E","ž",
-        "\u009F","Ÿ",
-
-        "\u00A1","¡",    //inverted (spanish) exclamation mark
-        "\u00A2","¢",     //cent
-        "\u00A3","£",    //pound
-        "\u00A4","¤",   //currency
-        "\u00A5","¥",      //yen
-        "\u00A6","¦",   //broken vertical bar
-        "\u00A7","§",     //section sign
-        "\u00A8","¨",      //diaeresis (umlaut)
-        "\u00A9","©",     //copyright sign
-        "\u00AA","ª",     //feminine ordinal indicator
-        "\u00AB","«",    //left-pointing double angle quotation mark
-        "\u00AC","¬",      //not sign
-        "\u00AD","­",      //soft hyphen
-        "\u00AE","®",      //registered sign
-        "\u00AF","¯",     //macron
-        "\u00B0","°",      //degree sign
-        "\u00B1","±",   //plus-minus sign
-        "\u00B2","²",     //superscript two
-        "\u00B3","³",     //superscript three
-        "\u00B4","´",    //acute accent
-        "\u00B5","µ",    //micro sign
-        "\u00B6","¶",     //paragraph sign
-        "\u00B7","·",   //middle dot
-        "\u00B8","¸",    //cedilla
-        "\u00B9","¹",     //superscript one
-        "\u00BA","º",     //masculine ordinal indicator
-        "\u00BB","»",    //right-pointing double angle quotation mark
-        "\u00BC","¼",   //fraction 1/4
-        "\u00BD","½",   //fraction 1/2
-        "\u00BE","¾",   //fraction 3/4
-        "\u00BF","¿",   //inverted (spanisch) questionmark
-        "\u00C0","À",
-        "\u00C1","Á",
-        "\u00C2","Â",
-        "\u00C3","Ã",
-        "\u00C4","Ä",
-        "\u00C5","Å",
-        "\u00C6","Æ",
-        "\u00C7","Ç",
-        "\u00C8","È",
-        "\u00C9","É",
-        "\u00CA","Ê",
-        "\u00CB","Ë",
-        "\u00CC","Ì",
-        "\u00CD","Í",
-        "\u00CE","Î",
-        "\u00CF","Ï",
-        "\u00D0","Ð",
-        "\u00D1","Ñ",
-        "\u00D2","Ò",
-        "\u00D3","Ó",
-        "\u00D4","Ô",
-        "\u00D5","Õ",
-        "\u00D6","Ö",
-        "\u00D7","×",
-        "\u00D8","Ø",
-        "\u00D9","Ù",
-        "\u00DA","Ú",
-        "\u00DB","Û",
-        "\u00DC","Ü",
-        "\u00DD","Ý",
-        "\u00DE","Þ",
-        "\u00DF","ß",
-        "\u00E0","à",
-        "\u00E1","á",
-        "\u00E2","â",
-        "\u00E3","ã",
-        "\u00E4","ä",
-        "\u00E5","å",
-        "\u00E6","æ",
-        "\u00E7","ç",
-        "\u00E8","è",
-        "\u00E9","é",
-        "\u00EA","ê",
-        "\u00EB","ë",
-        "\u00EC","ì",
-        "\u00ED","í",
-        "\u00EE","î",
-        "\u00EF","ï",
-        "\u00F0","ð",
-        "\u00F1","ñ",
-        "\u00F2","ò",
-        "\u00F3","ó",
-        "\u00F4","ô",
-        "\u00F5","õ",
-        "\u00F6","ö",
-        "\u00F7","÷",
-        "\u00F8","ø",
-        "\u00F9","ù",
-        "\u00FA","ú",
-        "\u00FB","û",
-        "\u00FC","ü",
-        "\u00FD","ý",
-        "\u00FE","þ",
-        "\u00FF","ÿ"
-    };
 
     /** This method processes tables in the wiki code.
       * @param a string that might contain parts of a table
@@ -1073,12 +820,12 @@ public class wikiCode {
     public String transformLine(String result, plasmaSwitchboard switchboard) {
         //If HTML has not bee replaced yet (can happen if method gets called in recursion), replace now!
         if (!replacedHTML || preformattedSpan){
-            result = replaceXMLEntities(result);
+            result = htmlTools.replaceXMLEntities(result);
             replacedHTML = true;
         }
         //If special characters have not bee replaced yet, replace now!
         if (!replacedCharacters || preformattedSpan){
-            result = replaceHTMLEntities(result);
+            result = htmlTools.replaceHTMLEntities(result);
             replacedCharacters = true;
         }
 
diff --git a/source/de/anomic/http/httpd.java b/source/de/anomic/http/httpd.java
index 5f89ecd34..df7f57676 100644
--- a/source/de/anomic/http/httpd.java
+++ b/source/de/anomic/http/httpd.java
@@ -64,6 +64,7 @@ import java.util.Iterator;
 import java.util.Properties;
 import java.util.StringTokenizer;
 
+import de.anomic.data.htmlTools;
 import de.anomic.data.userDB;
 import de.anomic.data.wikiCode;
 import de.anomic.kelondro.kelondroBase64Order;
@@ -899,11 +900,11 @@ public final class httpd implements serverHandler {
     // 06.01.2007: decode HTML entities by [FB]
     public static String decodeHtmlEntities(String s) {
         // replace all entities defined in wikiCode.characters and htmlentities
-        for (int i=1; i