// CharacterCoding.java // ---------------------------------- // (C) 22.10.2008 by Michael Peter Christen; mc@yacy.net // first published on http://yacy.net // Frankfurt, Germany, 2008 // // $LastChangedDate$ // $LastChangedRevision$ // $LastChangedBy$ // // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 2 of the License, or // (at your option) any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA package net.yacy.document.parser.html; import java.util.HashMap; import java.util.Map; public class CharacterCoding { private static final char amp_unicode = "\u0026".charAt(0); private static final String amp_html = "&"; private static final String space_html = " "; private static final String[] mapping4xml = { "\"",""", //quotation mark "\u003C","<", //less than "\u003E",">", //greater than }; private static final String[] mapping4html = { "\\", "\", // Backslash "\u005E","^", // Caret "\u0060","`", // Accent Grave ` "\u007B","{", // { "\u007C","|", // | "\u007D","}", // } "\u007E","~", // ~ "\u0082","‚", "\u0083","ƒ", "\u0084","„", "\u0085","…", "\u0086","†", "\u0087","‡", "\u0088","ˆ", "\u0089","‰", "\u008A","Š", "\u008B","‹", "\u008C","Œ", "\u008D","", "\u008E","Ž", "\u0091","‘", "\u0092","’", "\u0093","“", "\u0094","”", "\u0095","•", "\u0096","–", "\u0097","—", "\u0098","˜", "\u0099","™", "\u009A","š", "\u009B","›", "\u009C","œ", "\u009D","", "\u009E","ž", "\u009F","Ÿ", "\u00A1","¡", //inverted (spanish) exclamation mark "\u00A2","¢", //cent "\u00A3","£", //pound "\u00A4","¤", //currency "\u00A5","¥", //yen "\u00A6","¦", //broken vertical bar "\u00A7","§", //section sign "\u00A8","¨", //diaeresis (umlaut) "\u00A9","©", //copyright sign "\u00AA","ª", //feminine ordinal indicator "\u00AB","«", //left-pointing double angle quotation mark "\u00AC","¬", //not sign "\u00AD","­", //soft hyphen "\u00AE","®", //registered sign "\u00AF","¯", //macron "\u00B0","°", //degree sign "\u00B1","±", //plus-minus sign "\u00B2","²", //superscript two "\u00B3","³", //superscript three "\u00B4","´", //acute accent "\u00B5","µ", //micro sign "\u00B6","¶", //paragraph sign "\u00B7","·", //middle dot "\u00B8","¸", //cedilla "\u00B9","¹", //superscript one "\u00BA","º", //masculine ordinal indicator "\u00BB","»", //right-pointing double angle quotation mark "\u00BC","¼", //fraction 1/4 "\u00BD","½", //fraction 1/2 "\u00BE","¾", //fraction 3/4 "\u00BF","¿", //inverted (spanisch) questionmark "\u00C0","À", "\u00C1","Á", "\u00C2","Â", "\u00C3","Ã", "\u00C4","Ä", "\u00C5","Å", "\u00C6","Æ", "\u00C7","Ç", "\u00C8","È", "\u00C9","É", "\u00CA","Ê", "\u00CB","Ë", "\u00CC","Ì", "\u00CD","Í", "\u00CE","Î", "\u00CF","Ï", "\u00D0","Ð", "\u00D1","Ñ", "\u00D2","Ò", "\u00D3","Ó", "\u00D4","Ô", "\u00D5","Õ", "\u00D6","Ö", "\u00D7","×", "\u00D8","Ø", "\u00D9","Ù", "\u00DA","Ú", "\u00DB","Û", "\u00DC","Ü", "\u00DD","Ý", "\u00DE","Þ", "\u00DF","ß", "\u00E0","à", "\u00E1","á", "\u00E2","â", "\u00E3","ã", "\u00E4","ä", "\u00E5","å", "\u00E6","æ", "\u00E7","ç", "\u00E8","è", "\u00E9","é", "\u00EA","ê", "\u00EB","ë", "\u00EC","ì", "\u00ED","í", "\u00EE","î", "\u00EF","ï", "\u00F0","ð", "\u00F1","ñ", "\u00F2","ò", "\u00F3","ó", "\u00F4","ô", "\u00F5","õ", "\u00F6","ö", "\u00F7","÷", "\u00F8","ø", "\u00F9","ù", "\u00FA","ú", "\u00FB","û", "\u00FC","ü", "\u00FD","ý", "\u00FE","þ", "\u00FF","ÿ" }; private final static Map html2unicode4xml = new HashMap(mapping4xml.length * 2); private final static Map html2unicode4html = new HashMap(mapping4html.length * 2); private final static Map unicode2html4xml = new HashMap(mapping4xml.length * 2); private final static Map unicode2html4html = new HashMap(mapping4html.length * 2); static { Character c; for (int i = 0; i < mapping4html.length; i += 2) { c = Character.valueOf(mapping4html[i].charAt(0)); html2unicode4html.put(mapping4html[i + 1], c); unicode2html4html.put(c, mapping4html[i + 1]); } for (int i = 0; i < mapping4xml.length; i += 2) { c = Character.valueOf(mapping4xml[i].charAt(0)); html2unicode4xml.put(mapping4xml[i + 1], c); unicode2html4xml.put(c, mapping4xml[i + 1]); } } public static String unicode2xml(final String text, boolean amp) { return unicode2html(text, amp, false); } public static String unicode2html(final String text, boolean amp) { return unicode2html(text, amp, true); } private static String unicode2html(final String text, boolean amp, boolean html) { if (text == null) return null; final StringBuilder sb = new StringBuilder(text.length() * 12 / 10); int textpos = 0; String r; char c; while (textpos < text.length()) { // find a (forward) mapping c = text.charAt(textpos); if (amp && c == amp_unicode) { sb.append(amp_html); textpos++; continue; } if ((r = unicode2html4xml.get(c)) != null) { sb.append(r); textpos++; continue; } if (html && (r = unicode2html4html.get(c)) != null) { sb.append(r); textpos++; continue; } sb.append(c); textpos++; } return sb.toString(); } public static String html2unicode(final String text) { if (text == null) return null; int p = 0, p1, q; final StringBuilder sb = new StringBuilder(text.length()); String s; Character r; while (p < text.length()) { p1 = text.indexOf('&', p); if (p1 < 0) { sb.append(text, p, text.length()); break; } sb.append(text, p, p1); p = p1; if (p >= text.length()) break; q = text.indexOf(';', p); if (q < 0) { // if there is now no semicolon, then this will also fail when another ampersand is found afterwards // we are finished here sb.append(text, p, text.length()); break; } s = text.substring(p, q + 1); p = q + 1; if (s.equals(amp_html)) { sb.append(amp_unicode); continue; } if (s.equals(space_html)) { sb.append(" "); continue; } if ((r = html2unicode4xml.get(s)) != null) { sb.append(r.charValue()); continue; } if ((r = html2unicode4html.get(s)) != null) { sb.append(r); continue; } if (s.charAt(1) == '#') { if (s.charAt(2) == 'x' || s.charAt(2) == 'X') { sb.append(new char[] {(char) Integer.parseInt(s.substring(3, s.length() - 1), 16)}); continue; } String ucs = s.substring(2, s.length() - 1); try { int uc = Integer.parseInt(ucs); sb.append(new char[] {(char) uc}); } catch (NumberFormatException e) {} continue; } // the entity is unknown, skip it } return sb.toString(); } public static void main(final String[] args) { final String text = "Test-Text mit & um zyklische ü & Ersetzungen auszuschliessen"; final String txet = unicode2html(text, true); System.out.println(txet); System.out.println(html2unicode(txet)); if (html2unicode(txet).equals(text)) System.out.println("correct"); final String text2 = "encodeUnicode2xml: & \" < >"; System.out.println(text2); System.out.println(unicode2xml(text2, true)); final String text3 = "space täst"; System.out.println(text3); System.out.println(html2unicode(text3)); } }