// htmlTools.java
// -----------------------
// (C) by Michael Peter Christen; mc@anomic.de,
// (C) by Jan Sandbrink (NN), Franz Brausse (FB, karlchenofhell),
// (C) by Bjoern 'fuchs' Krombholz (fuchsi)
// first published on http://www.yacy.net
//
// $LastChangedDate: $
// $LastChangedRevision: $
// $LastChangedBy: $
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this softare or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
// // Any changes to this file according to the GPL as documented in the file // gpl.txt aside this file in the shipment you received can be done to the // lines that follows this copyright notice here, but changes must not be // done inside the copyright notive above. A re-distribution must contain // the intact and unchanged copyright notice. // Contributions and changes to the program code must be marked as such. package de.anomic.data; public class htmlTools { /** Replaces characters in a string with other entities according to HTML standards. * @param text a string that possibly contains special characters * @param includingAmpersand if false ampersands are not encoded * @param forXML if true then only &, ", < and > will * be transcoded. * @return the string with all characters replaced by the corresponding character from array */ public static String encodeUnicode2html(String text, boolean includingAmpersand, boolean forXML) { if (text == null) return null; int spos = (includingAmpersand ? 0 : 2); // if (forXML), then only encode ampersand, quotation mark, less than and // greather than which are the first 4 pairs in default mapping table int epos = (forXML ? 8 : mapping.length); return encode(text, mapping, spos, epos); } /** * Like {@link #encodeUnicode2html(String, boolean, boolean)} with forXML = false */ public static String encodeUnicode2html(String text, boolean includingAmpersand) { return encodeUnicode2html(text, includingAmpersand, false); } /** * Replaces special entities ampersand, quotation marks, and less than/graiter than * by the escaping entities allowed in XML documents. * * Like {@link #encodeUnicode2html(String, boolean, boolean)} with * includingAmpersand = true and foxXML = true. 
* * @param text the original String * @return the encoded String */ public static String encodeUnicode2xml(String text) { return encodeUnicode2html(text, true, true); } /** * Generic method that replaces occurences of special character entities defined in map * array with their corresponding mapping. * @param text The String too process. * @param map An array defining the entity mapping. * @param spos It is possible to use a subset of the map only. This parameter defines the * starting point in the map array. * @param epos The ending point, see above. * @return A copy of the original String with all entities defined in map replaced. */ public static String encode(String text, final String[] map, int spos, int epos) { StringBuffer sb = new StringBuffer(text.length()); int textpos = 0; search: while (textpos < text.length()) { // find a (forward) mapping loop: for (int i = spos; i < epos; i += 2) { if (text.charAt(textpos) != map[i].charAt(0)) continue loop; // found match sb.append(map[i + 1]); textpos++; continue search; } // not found match sb.append(text.charAt(textpos)); textpos++; } return sb.toString(); } public static String decodeHtml2Unicode(String text) { if (text == null) return null; int pos = 0; StringBuffer sb = new StringBuffer(text.length()); search: while (pos < text.length()) { // find a reverse mapping. TODO: replace matching with hashtable(s) loop: for (int i = 0; i < mapping.length; i += 2) { if (pos + mapping[i + 1].length() > text.length()) continue loop; for (int j = mapping[i + 1].length() - 1; j >= 0; j--) { if (text.charAt(pos + j) != mapping[i + 1].charAt(j)) continue loop; } // found match sb.append(mapping[i]); pos = pos + mapping[i + 1].length(); continue search; } // not found match sb.append(text.charAt(pos)); pos++; } return new String(sb); } //This array contains codes (see http://mindprod.com/jgloss/unicode.html for details) //that will be replaced. To add new codes or patterns, just put them at the end //of the list. 
Codes or patterns in this list can not be escaped with [= or
    private static final String[] mapping = {
        // Ampersands _have_ to be replaced first. If they were replaced later,
        // other replaced characters containing ampersands would get messed up.
        "\u0026","&",      //ampersand
        "\"",""",         //quotation mark
        "\u003C","<",       //less than
        "\u003E",">",       //greater than
        "\\",    "\",  // Backslash
        "\u005E","^",  // Caret

        "\u0060","`",  // Accent Grave `
        "\u007B","{",  // {
        "\u007C","|",  // |
        "\u007D","}",  // }
        "\u007E","~",  // ~

        "\u0082","‚",
        "\u0083","ƒ",
        "\u0084","„",
        "\u0085","…",
        "\u0086","†",
        "\u0087","‡",
        "\u0088","ˆ",
        "\u0089","‰",
        "\u008A","Š",
        "\u008B","‹",
        "\u008C","Œ",
        "\u008D","",
        "\u008E","Ž",

        "\u0091","‘",
        "\u0092","’",
        "\u0093","“",
        "\u0094","”",
        "\u0095","•",
        "\u0096","–",
        "\u0097","—",
        "\u0098","˜",
        "\u0099","™",
        "\u009A","š",
        "\u009B","›",
        "\u009C","œ",
        "\u009D","",
        "\u009E","ž",
        "\u009F","Ÿ",

        "\u00A1","¡",    //inverted (spanish) exclamation mark
        "\u00A2","¢",     //cent
        "\u00A3","£",    //pound
        "\u00A4","¤",   //currency
        "\u00A5","¥",      //yen
        "\u00A6","¦",   //broken vertical bar
        "\u00A7","§",     //section sign
        "\u00A8","¨",      //diaeresis (umlaut)
        "\u00A9","©",     //copyright sign
        "\u00AA","ª",     //feminine ordinal indicator
        "\u00AB","«",    //left-pointing double angle quotation mark
        "\u00AC","¬",      //not sign
        "\u00AD","­",      //soft hyphen
        "\u00AE","®",      //registered sign
        "\u00AF","¯",     //macron
        "\u00B0","°",      //degree sign
        "\u00B1","±",   //plus-minus sign
        "\u00B2","²",     //superscript two
        "\u00B3","³",     //superscript three
        "\u00B4","´",    //acute accent
        "\u00B5","µ",    //micro sign
        "\u00B6","¶",     //paragraph sign
        "\u00B7","·",   //middle dot
        "\u00B8","¸",    //cedilla
        "\u00B9","¹",     //superscript one
        "\u00BA","º",     //masculine ordinal indicator
        "\u00BB","»",    //right-pointing double angle quotation mark
        "\u00BC","¼",   //fraction 1/4
        "\u00BD","½",   //fraction 1/2
        "\u00BE","¾",   //fraction 3/4
        "\u00BF","¿",   //inverted (spanisch) questionmark
        "\u00C0","À",
        "\u00C1","Á",
        "\u00C2","Â",
        "\u00C3","Ã",
        "\u00C4","Ä",
        "\u00C5","Å",
        "\u00C6","Æ",
        "\u00C7","Ç",
        "\u00C8","È",
        "\u00C9","É",
        "\u00CA","Ê",
        "\u00CB","Ë",
        "\u00CC","Ì",
        "\u00CD","Í",
        "\u00CE","Î",
        "\u00CF","Ï",
        "\u00D0","Ð",
        "\u00D1","Ñ",
        "\u00D2","Ò",
        "\u00D3","Ó",
        "\u00D4","Ô",
        "\u00D5","Õ",
        "\u00D6","Ö",
        "\u00D7","×",
        "\u00D8","Ø",
        "\u00D9","Ù",
        "\u00DA","Ú",
        "\u00DB","Û",
        "\u00DC","Ü",
        "\u00DD","Ý",
        "\u00DE","Þ",
        "\u00DF","ß",
        "\u00E0","à",
        "\u00E1","á",
        "\u00E2","â",
        "\u00E3","ã",
        "\u00E4","ä",
        "\u00E5","å",
        "\u00E6","æ",
        "\u00E7","ç",
        "\u00E8","è",
        "\u00E9","é",
        "\u00EA","ê",
        "\u00EB","ë",
        "\u00EC","ì",
        "\u00ED","í",
        "\u00EE","î",
        "\u00EF","ï",
        "\u00F0","ð",
        "\u00F1","ñ",
        "\u00F2","ò",
        "\u00F3","ó",
        "\u00F4","ô",
        "\u00F5","õ",
        "\u00F6","ö",
        "\u00F7","÷",
        "\u00F8","ø",
        "\u00F9","ù",
        "\u00FA","ú",
        "\u00FB","û",
        "\u00FC","ü",
        "\u00FD","ý",
        "\u00FE","þ",
        "\u00FF","ÿ"
    };
    
    public static void main(String[] args) {
        String text = "Test-Text mit & um zyklische ü & Ersetzungen auszuschliessen";
        String txet = encodeUnicode2html(text, true);
        System.out.println(txet);
        System.out.println(decodeHtml2Unicode(txet));
        if (decodeHtml2Unicode(txet).equals(text)) System.out.println("correct");
    }
}