// Origin: yacy_search_server/source/net/yacy/document/parser/html/CharacterCoding.java
// (repository-viewer listing metadata: 302 lines, 10 KiB, Java)

// CharacterCoding.java
// ----------------------------------
// (C) 22.10.2008 by Michael Peter Christen; mc@yacy.net
// first published on http://yacy.net
// Frankfurt, Germany, 2008
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.document.parser.html;
import java.util.HashMap;
/**
 * Converts between plain Unicode text and text in which selected characters
 * are written as HTML/XML character entities.
 *
 * <p>Two static tables drive the conversion: {@code mapping4xml} holds the
 * characters escaped in both XML and HTML output (quote, less-than,
 * greater-than) and {@code mapping4html} holds the additional characters that
 * are escaped for HTML output only. The ampersand is handled separately so
 * callers can decide whether it is escaped.
 *
 * <p>All methods are static; the lookup maps are filled once in the static
 * initializer and only read afterwards.
 */
public class CharacterCoding {

    /** The ampersand, which introduces every entity. */
    private static final char amp_unicode = '&';
    /** Entity written for an ampersand when ampersand escaping is requested. */
    private static final String amp_html = "&amp;";
    /** Non-breaking-space entity; decoded to an ordinary space by {@link #html2unicode(String)}. */
    private static final String space_html = "&nbsp;";

    /** Pairs of (character, entity) escaped for XML as well as HTML output. */
    private static final String[] mapping4xml = {
        "\"", "&quot;",     // quotation mark
        "\u003C", "&lt;",   // less than
        "\u003E", "&gt;",   // greater than
    };

    /** Pairs of (character, entity) escaped in addition to {@code mapping4xml} for HTML output. */
    private static final String[] mapping4html = {
        "\\", "&#092;",     // backslash
        "\u005E", "&#094;", // caret
        "\u0060", "&#096;", // grave accent
        "\u007B", "&#123;", // left curly brace
        "\u007C", "&#124;", // vertical bar
        "\u007D", "&#125;", // right curly brace
        "\u007E", "&#126;", // tilde

        // C1 control range, written as decimal numeric references
        "\u0082", "&#130;",
        "\u0083", "&#131;",
        "\u0084", "&#132;",
        "\u0085", "&#133;",
        "\u0086", "&#134;",
        "\u0087", "&#135;",
        "\u0088", "&#136;",
        "\u0089", "&#137;",
        "\u008A", "&#138;",
        "\u008B", "&#139;",
        "\u008C", "&#140;",
        "\u008D", "&#141;",
        "\u008E", "&#142;",
        "\u0091", "&#145;",
        "\u0092", "&#146;",
        "\u0093", "&#147;",
        "\u0094", "&#148;",
        "\u0095", "&#149;",
        "\u0096", "&#150;",
        "\u0097", "&#151;",
        "\u0098", "&#152;",
        "\u0099", "&#153;",
        "\u009A", "&#154;",
        "\u009B", "&#155;",
        "\u009C", "&#156;",
        "\u009D", "&#157;",
        "\u009E", "&#158;",
        "\u009F", "&#159;",

        // Latin-1 symbols
        "\u00A1", "&iexcl;",  // inverted (Spanish) exclamation mark
        "\u00A2", "&cent;",   // cent
        "\u00A3", "&pound;",  // pound
        "\u00A4", "&curren;", // currency
        "\u00A5", "&yen;",    // yen
        "\u00A6", "&brvbar;", // broken vertical bar
        "\u00A7", "&sect;",   // section sign
        "\u00A8", "&uml;",    // diaeresis (umlaut)
        "\u00A9", "&copy;",   // copyright sign
        "\u00AA", "&ordf;",   // feminine ordinal indicator
        "\u00AB", "&laquo;",  // left-pointing double angle quotation mark
        "\u00AC", "&not;",    // not sign
        "\u00AD", "&shy;",    // soft hyphen
        "\u00AE", "&reg;",    // registered sign
        "\u00AF", "&macr;",   // macron
        "\u00B0", "&deg;",    // degree sign
        "\u00B1", "&plusmn;", // plus-minus sign
        "\u00B2", "&sup2;",   // superscript two
        "\u00B3", "&sup3;",   // superscript three
        "\u00B4", "&acute;",  // acute accent
        "\u00B5", "&micro;",  // micro sign
        "\u00B6", "&para;",   // paragraph sign
        "\u00B7", "&middot;", // middle dot
        "\u00B8", "&cedil;",  // cedilla
        "\u00B9", "&sup1;",   // superscript one
        "\u00BA", "&ordm;",   // masculine ordinal indicator
        "\u00BB", "&raquo;",  // right-pointing double angle quotation mark
        "\u00BC", "&frac14;", // fraction 1/4
        "\u00BD", "&frac12;", // fraction 1/2
        "\u00BE", "&frac34;", // fraction 3/4
        "\u00BF", "&iquest;", // inverted (Spanish) question mark

        // Latin-1 letters
        "\u00C0", "&Agrave;",
        "\u00C1", "&Aacute;",
        "\u00C2", "&Acirc;",
        "\u00C3", "&Atilde;",
        "\u00C4", "&Auml;",
        "\u00C5", "&Aring;",
        "\u00C6", "&AElig;",
        "\u00C7", "&Ccedil;",
        "\u00C8", "&Egrave;",
        "\u00C9", "&Eacute;",
        "\u00CA", "&Ecirc;",
        "\u00CB", "&Euml;",
        "\u00CC", "&Igrave;",
        "\u00CD", "&Iacute;",
        "\u00CE", "&Icirc;",
        "\u00CF", "&Iuml;",
        "\u00D0", "&ETH;",
        "\u00D1", "&Ntilde;",
        "\u00D2", "&Ograve;",
        "\u00D3", "&Oacute;",
        "\u00D4", "&Ocirc;",
        "\u00D5", "&Otilde;",
        "\u00D6", "&Ouml;",
        "\u00D7", "&times;",
        "\u00D8", "&Oslash;",
        "\u00D9", "&Ugrave;",
        "\u00DA", "&Uacute;",
        "\u00DB", "&Ucirc;",
        "\u00DC", "&Uuml;",
        "\u00DD", "&Yacute;",
        "\u00DE", "&THORN;",
        "\u00DF", "&szlig;",
        "\u00E0", "&agrave;",
        "\u00E1", "&aacute;",
        "\u00E2", "&acirc;",
        "\u00E3", "&atilde;",
        "\u00E4", "&auml;",
        "\u00E5", "&aring;",
        "\u00E6", "&aelig;",
        "\u00E7", "&ccedil;",
        "\u00E8", "&egrave;",
        "\u00E9", "&eacute;",
        "\u00EA", "&ecirc;",
        "\u00EB", "&euml;",
        "\u00EC", "&igrave;",
        "\u00ED", "&iacute;",
        "\u00EE", "&icirc;",
        "\u00EF", "&iuml;",
        "\u00F0", "&eth;",
        "\u00F1", "&ntilde;",
        "\u00F2", "&ograve;",
        "\u00F3", "&oacute;",
        "\u00F4", "&ocirc;",
        "\u00F5", "&otilde;",
        "\u00F6", "&ouml;",
        "\u00F7", "&divide;",
        "\u00F8", "&oslash;",
        "\u00F9", "&ugrave;",
        "\u00FA", "&uacute;",
        "\u00FB", "&ucirc;",
        "\u00FC", "&uuml;",
        "\u00FD", "&yacute;",
        "\u00FE", "&thorn;",
        "\u00FF", "&yuml;"
    };

    // Lookup maps built from the tables above: entity -> character and character -> entity.
    private final static HashMap<String, Character> html2unicode4xml = new HashMap<String, Character>(mapping4xml.length * 2);
    private final static HashMap<String, Character> html2unicode4html = new HashMap<String, Character>(mapping4html.length * 2);
    private final static HashMap<Character, String> unicode2html4xml = new HashMap<Character, String>(mapping4xml.length * 2);
    private final static HashMap<Character, String> unicode2html4html = new HashMap<Character, String>(mapping4html.length * 2);

    static {
        register(mapping4html, html2unicode4html, unicode2html4html);
        register(mapping4xml, html2unicode4xml, unicode2html4xml);
    }

    /** Fills both lookup directions from a flat (character, entity) pair table. */
    private static void register(
            final String[] table,
            final HashMap<String, Character> entity2char,
            final HashMap<Character, String> char2entity) {
        for (int i = 0; i < table.length; i += 2) {
            final Character c = Character.valueOf(table[i].charAt(0));
            final String entity = table[i + 1];
            entity2char.put(entity, c);
            char2entity.put(c, entity);
        }
    }

    /**
     * Escapes the characters listed in {@code mapping4xml}.
     *
     * @param text the text to escape; may be {@code null}
     * @param amp  if {@code true}, every ampersand is written as {@code &amp;}
     * @return the escaped text, or {@code null} if {@code text} is {@code null}
     */
    public static String unicode2xml(final String text, boolean amp) {
        return unicode2html(text, amp, false);
    }

    /**
     * Escapes the characters listed in {@code mapping4xml} and {@code mapping4html}.
     *
     * @param text the text to escape; may be {@code null}
     * @param amp  if {@code true}, every ampersand is written as {@code &amp;}
     * @return the escaped text, or {@code null} if {@code text} is {@code null}
     */
    public static String unicode2html(final String text, boolean amp) {
        return unicode2html(text, amp, true);
    }

    /**
     * Shared implementation of the two public escaping methods.
     *
     * @param text the text to escape; may be {@code null}
     * @param amp  whether ampersands are escaped
     * @param html whether the HTML-only table is consulted after the XML table
     * @return the escaped text, or {@code null} if {@code text} is {@code null}
     */
    private static String unicode2html(final String text, boolean amp, boolean html) {
        if (text == null) {
            return null;
        }
        final int len = text.length();
        // Entities are longer than the characters they replace, so reserve some head-room.
        final StringBuilder sb = new StringBuilder(len + len / 5);
        for (int i = 0; i < len; i++) {
            final char c = text.charAt(i);
            String entity = null;
            if (c == amp_unicode) {
                // The ampersand is in neither table; it is escaped only on request.
                if (amp) {
                    entity = amp_html;
                }
            } else {
                final Character key = Character.valueOf(c);
                entity = unicode2html4xml.get(key);
                if (entity == null && html) {
                    entity = unicode2html4html.get(key);
                }
            }
            if (entity == null) {
                sb.append(c);
            } else {
                sb.append(entity);
            }
        }
        return sb.toString();
    }

    /**
     * Replaces character entities in {@code text} by the characters they denote.
     *
     * <p>Recognized are {@code &amp;}, {@code &nbsp;} (decoded to an ordinary
     * space), every entity of the two mapping tables, and decimal or
     * hexadecimal numeric references including those beyond the Basic
     * Multilingual Plane. An ampersand that cannot start an entity, because no
     * semicolon follows or another ampersand comes first, is copied
     * unchanged. Unknown named entities and malformed or out-of-range numeric
     * references are removed from the output.
     *
     * @param text the text to decode; may be {@code null}
     * @return the decoded text, or {@code null} if {@code text} is {@code null}
     */
    public static String html2unicode(final String text) {
        if (text == null) {
            return null;
        }
        final int len = text.length();
        final StringBuilder sb = new StringBuilder(len);
        int pos = 0;
        int semi = -1; // position of the next ';' at or after the current '&', reused while still ahead
        while (pos < len) {
            final int amp = text.indexOf(amp_unicode, pos);
            if (amp < 0) {
                sb.append(text, pos, len);
                break;
            }
            sb.append(text, pos, amp);
            if (semi < amp) {
                semi = text.indexOf(';', amp);
            }
            if (semi < 0) {
                // Without any further semicolon nothing after this point can be an entity.
                sb.append(text, amp, len);
                break;
            }
            final int nextAmp = text.indexOf(amp_unicode, amp + 1);
            if (nextAmp >= 0 && nextAmp < semi) {
                // Another '&' precedes the ';', so this one is a literal ampersand.
                sb.append(text, amp, nextAmp);
                pos = nextAmp;
                continue;
            }
            final String entity = text.substring(amp, semi + 1);
            pos = semi + 1;
            if (entity.equals(amp_html)) {
                sb.append(amp_unicode);
                continue;
            }
            if (entity.equals(space_html)) {
                sb.append(" ");
                continue;
            }
            Character mapped = html2unicode4xml.get(entity);
            if (mapped == null) {
                mapped = html2unicode4html.get(entity);
            }
            if (mapped != null) {
                sb.append(mapped.charValue());
                continue;
            }
            if (entity.charAt(1) == '#') {
                final int codePoint = parseNumericEntity(entity);
                if (codePoint >= 0) {
                    sb.appendCodePoint(codePoint);
                }
                continue;
            }
            // the entity is unknown, skip it
        }
        return sb.toString();
    }

    /**
     * Parses a numeric character reference such as {@code &#228;} or {@code &#xE4;}.
     *
     * @param entity the complete reference, starting with an ampersand and a hash sign and ending with a semicolon
     * @return the referenced code point, or {@code -1} if the digits are malformed
     *         or do not denote a valid Unicode code point
     */
    private static int parseNumericEntity(final String entity) {
        final int end = entity.length() - 1;
        final char marker = entity.charAt(2);
        final boolean hex = marker == 'x' || marker == 'X';
        final String digits = entity.substring(hex ? 3 : 2, end);
        try {
            final int codePoint = Integer.parseInt(digits, hex ? 16 : 10);
            return Character.isValidCodePoint(codePoint) ? codePoint : -1;
        } catch (final NumberFormatException ignored) {
            // Malformed digits: the caller drops the reference like any unknown entity.
            return -1;
        }
    }

    /** Small manual demonstration of the escaping and decoding round trip. */
    public static void main(final String[] args) {
        final String text = "Test-Text mit & um zyklische &uuml; &amp; Ersetzungen auszuschliessen";
        final String txet = unicode2html(text, true);
        System.out.println(txet);
        System.out.println(html2unicode(txet));
        if (html2unicode(txet).equals(text)) System.out.println("correct");

        final String text2 = "encodeUnicode2xml: & \" < >";
        System.out.println(text2);
        System.out.println(unicode2xml(text2, true));

        final String text3 = "space&nbsp;t&auml;st";
        System.out.println(text3);
        System.out.println(html2unicode(text3));
    }
}