Merge pull request #405 from jfhs/jfhs/support-all-html-entities

Improve HTML entities support
This commit is contained in:
Michael Christen 2021-03-31 01:44:54 +02:00 committed by GitHub
commit 42ea2a1c6f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 2780 additions and 154 deletions

2597
defaults/htmlEntities.json Normal file

File diff suppressed because it is too large Load Diff

View File

@ -24,7 +24,20 @@
package net.yacy.document.parser.html;
import net.yacy.search.Switchboard;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.regex.Pattern;
@ -44,171 +57,187 @@ public final class CharacterCoding {
/** Special characters which have to be mapped for XML. */
private static final String[] MAPPING4XML = {
"\"", """, //quotation mark
"\u003C", "<", //less than
"\u003E", ">", //greater than
"\"", """, //quotation mark
"\u003C", "<", //less than
"\u003E", ">", //greater than
};
/** Special characters which have to be mapped for HTML. */
private static final String[] MAPPING4HTML = {
"\\", "\", // Backslash
"\u005E", "^", // Caret
"\\", "\", // Backslash
"\u005E", "^", // Caret
"\u0060", "`", // Accent Grave `
"\u007B", "{", // {
"\u007C", "|", // |
"\u007D", "}", // }
"\u007E", "~", // ~
"\u0060", "`", // Accent Grave `
"\u007B", "{", // {
"\u007C", "|", // |
"\u007D", "}", // }
"\u007E", "~", // ~
"\u0082", "‚",
"\u0083", "ƒ",
"\u0084", "„",
"\u0085", "…",
"\u0086", "†",
"\u0087", "‡",
"\u0088", "ˆ",
"\u0089", "‰",
"\u008A", "Š",
"\u008B", "‹",
"\u008C", "Œ",
"\u008D", "",
"\u008E", "Ž",
"\u0082", "‚",
"\u0083", "ƒ",
"\u0084", "„",
"\u0085", "…",
"\u0086", "†",
"\u0087", "‡",
"\u0088", "ˆ",
"\u0089", "‰",
"\u008A", "Š",
"\u008B", "‹",
"\u008C", "Œ",
"\u008D", "",
"\u008E", "Ž",
"\u0091", "‘",
"\u0092", "’",
"\u0093", "“",
"\u0094", "”",
"\u0095", "•",
"\u0096", "–",
"\u0097", "—",
"\u0098", "˜",
"\u0099", "™",
"\u009A", "š",
"\u009B", "›",
"\u009C", "œ",
"\u009D", "",
"\u009E", "ž",
"\u009F", "Ÿ",
"\u0091", "‘",
"\u0092", "’",
"\u0093", "“",
"\u0094", "”",
"\u0095", "•",
"\u0096", "–",
"\u0097", "—",
"\u0098", "˜",
"\u0099", "™",
"\u009A", "š",
"\u009B", "›",
"\u009C", "œ",
"\u009D", "",
"\u009E", "ž",
"\u009F", "Ÿ",
"\u00A1", "¡", //inverted (spanish) exclamation mark
"\u00A2", "¢", //cent
"\u00A3", "£", //pound
"\u00A4", "¤", //currency
"\u00A5", "¥", //yen
"\u00A6", "¦", //broken vertical bar
"\u00A7", "§", //section sign
"\u00A8", "¨", //diaeresis (umlaut)
"\u00A9", "©", //copyright sign
"\u00AA", "ª", //feminine ordinal indicator
"\u00AB", "«", //left-pointing double angle quotation mark
"\u00AC", "¬", //not sign
"\u00AD", "­", //soft hyphen
"\u00AE", "®", //registered sign
"\u00AF", "¯", //macron
"\u00B0", "°", //degree sign
"\u00B1", "±", //plus-minus sign
"\u00B2", "²", //superscript two
"\u00B3", "³", //superscript three
"\u00B4", "´", //acute accent
"\u00B5", "µ", //micro sign
"\u00B6", "¶", //paragraph sign
"\u00B7", "·", //middle dot
"\u00B8", "¸", //cedilla
"\u00B9", "¹", //superscript one
"\u00BA", "º", //masculine ordinal indicator
"\u00BB", "»", //right-pointing double angle quotation mark
"\u00BC", "¼", //fraction 1/4
"\u00BD", "½", //fraction 1/2
"\u00BE", "¾", //fraction 3/4
"\u00BF", "¿", //inverted (spanisch) questionmark
"\u00C0", "À",
"\u00C1", "Á",
"\u00C2", "Â",
"\u00C3", "Ã",
"\u00C4", "Ä",
"\u00C5", "Å",
"\u00C6", "Æ",
"\u00C7", "Ç",
"\u00C8", "È",
"\u00C9", "É",
"\u00CA", "Ê",
"\u00CB", "Ë",
"\u00CC", "Ì",
"\u00CD", "Í",
"\u00CE", "Î",
"\u00CF", "Ï",
"\u00D0", "Ð",
"\u00D1", "Ñ",
"\u00D2", "Ò",
"\u00D3", "Ó",
"\u00D4", "Ô",
"\u00D5", "Õ",
"\u00D6", "Ö",
"\u00D7", "×",
"\u00D8", "Ø",
"\u00D9", "Ù",
"\u00DA", "Ú",
"\u00DB", "Û",
"\u00DC", "Ü",
"\u00DD", "Ý",
"\u00DE", "Þ",
"\u00DF", "ß",
"\u00E0", "à",
"\u00E1", "á",
"\u00E2", "â",
"\u00E3", "ã",
"\u00E4", "ä",
"\u00E5", "å",
"\u00E6", "æ",
"\u00E7", "ç",
"\u00E8", "è",
"\u00E9", "é",
"\u00EA", "ê",
"\u00EB", "ë",
"\u00EC", "ì",
"\u00ED", "í",
"\u00EE", "î",
"\u00EF", "ï",
"\u00F0", "ð",
"\u00F1", "ñ",
"\u00F2", "ò",
"\u00F3", "ó",
"\u00F4", "ô",
"\u00F5", "õ",
"\u00F6", "ö",
"\u00F7", "÷",
"\u00F8", "ø",
"\u00F9", "ù",
"\u00FA", "ú",
"\u00FB", "û",
"\u00FC", "ü",
"\u00FD", "ý",
"\u00FE", "þ",
"\u00FF", "ÿ"
"\u00A1", "¡", //inverted (spanish) exclamation mark
"\u00A2", "¢", //cent
"\u00A3", "£", //pound
"\u00A4", "¤", //currency
"\u00A5", "¥", //yen
"\u00A6", "¦", //broken vertical bar
"\u00A7", "§", //section sign
"\u00A8", "¨", //diaeresis (umlaut)
"\u00A9", "©", //copyright sign
"\u00AA", "ª", //feminine ordinal indicator
"\u00AB", "«", //left-pointing double angle quotation mark
"\u00AC", "¬", //not sign
"\u00AD", "­", //soft hyphen
"\u00AE", "®", //registered sign
"\u00AF", "¯", //macron
"\u00B0", "°", //degree sign
"\u00B1", "±", //plus-minus sign
"\u00B2", "²", //superscript two
"\u00B3", "³", //superscript three
"\u00B4", "´", //acute accent
"\u00B5", "µ", //micro sign
"\u00B6", "¶", //paragraph sign
"\u00B7", "·", //middle dot
"\u00B8", "¸", //cedilla
"\u00B9", "¹", //superscript one
"\u00BA", "º", //masculine ordinal indicator
"\u00BB", "»", //right-pointing double angle quotation mark
"\u00BC", "¼", //fraction 1/4
"\u00BD", "½", //fraction 1/2
"\u00BE", "¾", //fraction 3/4
"\u00BF", "¿", //inverted (spanisch) questionmark
"\u00C0", "À",
"\u00C1", "Á",
"\u00C2", "Â",
"\u00C3", "Ã",
"\u00C4", "Ä",
"\u00C5", "Å",
"\u00C6", "Æ",
"\u00C7", "Ç",
"\u00C8", "È",
"\u00C9", "É",
"\u00CA", "Ê",
"\u00CB", "Ë",
"\u00CC", "Ì",
"\u00CD", "Í",
"\u00CE", "Î",
"\u00CF", "Ï",
"\u00D0", "Ð",
"\u00D1", "Ñ",
"\u00D2", "Ò",
"\u00D3", "Ó",
"\u00D4", "Ô",
"\u00D5", "Õ",
"\u00D6", "Ö",
"\u00D7", "×",
"\u00D8", "Ø",
"\u00D9", "Ù",
"\u00DA", "Ú",
"\u00DB", "Û",
"\u00DC", "Ü",
"\u00DD", "Ý",
"\u00DE", "Þ",
"\u00DF", "ß",
"\u00E0", "à",
"\u00E1", "á",
"\u00E2", "â",
"\u00E3", "ã",
"\u00E4", "ä",
"\u00E5", "å",
"\u00E6", "æ",
"\u00E7", "ç",
"\u00E8", "è",
"\u00E9", "é",
"\u00EA", "ê",
"\u00EB", "ë",
"\u00EC", "ì",
"\u00ED", "í",
"\u00EE", "î",
"\u00EF", "ï",
"\u00F0", "ð",
"\u00F1", "ñ",
"\u00F2", "ò",
"\u00F3", "ó",
"\u00F4", "ô",
"\u00F5", "õ",
"\u00F6", "ö",
"\u00F7", "÷",
"\u00F8", "ø",
"\u00F9", "ù",
"\u00FA", "ú",
"\u00FB", "û",
"\u00FC", "ü",
"\u00FD", "ý",
"\u00FE", "þ",
"\u00FF", "ÿ"
};
/** Mapping for XML to unicode. */
private static final Map<String, Character> HTML2UNICODE4XML =
new HashMap<String, Character>(MAPPING4XML.length * 2);
private static final Map<String, String> HTML2UNICODE4XML =
new HashMap<String, String>();
/** Mapping for HTML to unicode. */
private static final Map<String, Character> HTML2UNICODE4HTML =
new HashMap<String, Character>(MAPPING4HTML.length * 2);
private static final Map<String, String> HTML2UNICODE4HTML =
new HashMap<String, String>();
/** Mapping for unicode to XML. */
private static final Map<Character, String> UNICODE2HTML4XML =
new HashMap<Character, String>(MAPPING4XML.length * 2);
/** Mapping for unicode to HTML. */
private static final Map<Character, String> UNICODE2HTML4HTML =
new HashMap<Character, String>(MAPPING4HTML.length * 2);
new HashMap<Character, String>(MAPPING4XML.length * 2);
/**
 * Copies an entity table given as JSON into a lookup map.
 * Every key of {@code entities} is used as the map key; the mapped value is the
 * {@code "characters"} string of the JSON object stored under that key.
 * Existing map entries with the same key are overwritten.
 *
 * @param entities JSON object whose members are objects carrying a "characters" string
 * @param entityToChar target map receiving one entry per key of {@code entities}
 * @throws JSONException propagated from the JSON accessors, e.g. when a member
 *         is not an object or has no "characters" string
 */
static void parseJsonEntities(JSONObject entities, Map<String, String> entityToChar) throws JSONException {
    final Iterator<String> names = entities.keys();
    while (names.hasNext()) {
        final String name = names.next();
        final JSONObject definition = entities.getJSONObject(name);
        entityToChar.put(name, definition.getString("characters"));
    }
}
// Fills the entity lookup tables when the class is initialized:
// first from defaults/htmlEntities.json, then from the built-in MAPPING4* arrays.
static {
try {
// The file is looked up below the application path of the Switchboard,
// or below the current directory (".") when no Switchboard instance exists.
byte[] encoded = Files.readAllBytes(Paths.get(Switchboard.getSwitchboard() != null ? Switchboard.getSwitchboard().appPath.getAbsolutePath() : ".", "defaults", "htmlEntities.json"));
// The file content is decoded as UTF-8 before it is parsed as JSON.
JSONObject json = new JSONObject(new String(encoded, StandardCharsets.UTF_8));
// "xml" feeds the XML table; "html4" and "html5" both feed the HTML table,
// so an html5 entry replaces an html4 entry stored under the same key.
parseJsonEntities(json.getJSONObject("xml"), HTML2UNICODE4XML);
parseJsonEntities(json.getJSONObject("html4"), HTML2UNICODE4HTML);
parseJsonEntities(json.getJSONObject("html5"), HTML2UNICODE4HTML);
} catch (IOException | JSONException e) {
// A read or parse failure is only printed; initialization continues with
// whatever entries were stored before the failure.
e.printStackTrace();
}
Character c;
// The MAPPING4* arrays are read as pairs: index i is the character,
// index i + 1 the entity text that represents it.
for (int i = 0; i < MAPPING4HTML.length; i += 2) {
// NOTE(review): the next two statements store a Character in HTML2UNICODE4HTML,
// which is also declared as Map<String, String> in this view; they look like the
// pre-change lines of the diff -- confirm against the repository.
c = Character.valueOf(MAPPING4HTML[i].charAt(0));
HTML2UNICODE4HTML.put(MAPPING4HTML[i + 1], c);
c = MAPPING4HTML[i].charAt(0);
UNICODE2HTML4HTML.put(c, MAPPING4HTML[i + 1]);
}
for (int i = 0; i < MAPPING4XML.length; i += 2) {
// NOTE(review): same as above for HTML2UNICODE4XML -- presumably the removed
// side of the diff; confirm.
c = Character.valueOf(MAPPING4XML[i].charAt(0));
HTML2UNICODE4XML.put(MAPPING4XML[i + 1], c);
c = MAPPING4XML[i].charAt(0);
UNICODE2HTML4XML.put(c, MAPPING4XML[i + 1]);
}
}
@ -220,7 +249,6 @@ public final class CharacterCoding {
/**
* Replaces characters which have special representation in XML.
* @see #MAPPING4XML
* @param text text with character to replace
* @param amp true if ampersands shall be replaced, else false
* @return text with replaced characters
@ -231,7 +259,6 @@ public final class CharacterCoding {
/**
* Replaces characters which have special representation in HTML.
* @see #MAPPING4HTML
* @param text text with character to replace
* @param amp true if ampersands shall be replaced, else false
* @return text with replaced characters
@ -246,7 +273,7 @@ public final class CharacterCoding {
* @param amp true if ampersands shall be replaced, else false
* @param html true if characters shall be replaced for embedding in
* HTML, false for XML (far more characters are replaced for HTML,
* compare {@link #MAPPING4HTML} with {@link #MAPPING4XML}
* see defaults/htmlEntities.json
* @return text with replaced characters
*/
private static String unicode2html(
@ -291,7 +318,7 @@ public final class CharacterCoding {
int p = 0, p1, q;
final StringBuilder sb = new StringBuilder(text.length());
String s;
Character r;
String r;
while (p < text.length()) {
p1 = text.indexOf('&', p);
if (p1 < 0) {
@ -328,7 +355,7 @@ public final class CharacterCoding {
continue;
}
if ((r = HTML2UNICODE4XML.get(s)) != null) {
sb.append(r.charValue());
sb.append(r);
continue;
}
if ((r = HTML2UNICODE4HTML.get(s)) != null) {

View File

@ -769,18 +769,18 @@ public class ContentScraper extends AbstractScraper implements Scraper {
final String content = tag.opts.getProperty("content", EMPTY_STRING);
String name = tag.opts.getProperty("name", EMPTY_STRING);
if (name.length() > 0) {
this.metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content));
this.metas.put(name.toLowerCase(), content);
if (name.toLowerCase().equals("generator")) {
this.evaluationScores.match(Element.metagenerator, content);
}
}
name = tag.opts.getProperty("http-equiv", EMPTY_STRING);
if (name.length() > 0) {
this.metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content));
this.metas.put(name.toLowerCase(), content);
}
name = tag.opts.getProperty("property", EMPTY_STRING);
if (name.length() > 0) {
this.metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content));
this.metas.put(name.toLowerCase(), content);
}
} else if (tag.name.equalsIgnoreCase("area")) {
final String areatitle = cleanLine(tag.opts.getProperty("title", EMPTY_STRING));
@ -904,7 +904,6 @@ public class ContentScraper extends AbstractScraper implements Scraper {
// System.out.println("ScrapeTag1: tag.tagname=" + tag.tagname + ", opts=" + tag.opts.toString() + ", text=" + UTF8.String(text));
if (tag.name.equalsIgnoreCase("a") && tag.content.length() < 2048) {
String href = tag.opts.getProperty("href", EMPTY_STRING);
href = CharacterCoding.html2unicode(href);
AnchorURL url;
if ((href.length() > 0) && ((url = absolutePath(href)) != null)) {
if (followDenied()) {

View File

@ -32,6 +32,7 @@ import java.io.Writer;
import java.util.Properties;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.document.parser.html.CharacterCoding;
public final class CharBuffer extends Writer {
@ -444,6 +445,7 @@ public final class CharBuffer extends Writer {
while ((pos < this.length) && (this.buffer[pos] <= 32)) pos++;
// double quotes are obligatory. However, we want to be fuzzy if they
// are omitted
String value = null;
if (pos >= this.length) {
// error case: input ended too early
break;
@ -453,7 +455,7 @@ public final class CharBuffer extends Writer {
start = pos;
while ((pos < this.length) && (this.buffer[pos] != doublequote)) pos++;
if (pos >= this.length) break; // this is the case if we found no parent doublequote
p.setProperty(key, new String(this.buffer, start, pos - start).trim());
value = new String(this.buffer, start, pos - start).trim();
pos++;
} else if (this.buffer[pos] == singlequote) {
// search next singlequote
@ -461,14 +463,15 @@ public final class CharBuffer extends Writer {
start = pos;
while ((pos < this.length) && (this.buffer[pos] != singlequote)) pos++;
if (pos >= this.length) break; // this is the case if we found no parent singlequote
p.setProperty(key, new String(this.buffer, start, pos - start).trim());
value = new String(this.buffer, start, pos - start).trim();
pos++;
} else {
// search next whitespace
start = pos;
while ((pos < this.length) && (this.buffer[pos] > 32)) pos++;
p.setProperty(key, new String(this.buffer, start, pos - start).trim());
value = new String(this.buffer, start, pos - start).trim();
}
p.setProperty(key, CharacterCoding.html2unicode(value));
// pos should point now to a whitespace: eat up spaces
while ((pos < this.length) && (this.buffer[pos] <= 32)) pos++;
// go on with next loop