Merge pull request #405 from jfhs/jfhs/support-all-html-entities

Improve HTML entities support
2024-09-19 00:01:41 +02:00 · 2021-03-31 01:44:54 +02:00 · 2021-03-31 01:44:54 +02:00 · 42ea2a1c6f
commit 42ea2a1c6f
parent b2af745dd6 10bddc2c2d
4 changed files with 2780 additions and 154 deletions
--- a/defaults/htmlEntities.json
+++ b/defaults/htmlEntities.json
--- a/source/net/yacy/document/parser/html/CharacterCoding.java
+++ b/source/net/yacy/document/parser/html/CharacterCoding.java
@ -24,7 +24,20 @@

 package net.yacy.document.parser.html;

+import net.yacy.search.Switchboard;
+import org.json.JSONArray;
+import org.json.JSONException;
+import org.json.JSONObject;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Paths;
 import java.util.HashMap;
+import java.util.Iterator;
 import java.util.Map;
 import java.util.regex.Pattern;

@ -188,27 +201,43 @@ public final class CharacterCoding {
    };

    /** Mapping for XML to unicode. */
-    private static final Map<String, Character> HTML2UNICODE4XML =
-            new HashMap<String, Character>(MAPPING4XML.length * 2);
+    private static final Map<String, String> HTML2UNICODE4XML =
+            new HashMap<String, String>();
    /** Mapping for HTML to unicode. */
-    private static final Map<String, Character> HTML2UNICODE4HTML =
-            new HashMap<String, Character>(MAPPING4HTML.length * 2);
+    private static final Map<String, String> HTML2UNICODE4HTML =
+            new HashMap<String, String>();
    /** Mapping for unicode to XML. */
    private static final Map<Character, String> UNICODE2HTML4XML =
            new HashMap<Character, String>(MAPPING4XML.length * 2);
    /** Mapping for unicode to HTML. */
    private static final Map<Character, String> UNICODE2HTML4HTML =
-            new HashMap<Character, String>(MAPPING4HTML.length * 2);
+            new HashMap<Character, String>(MAPPING4XML.length * 2);
+
+    static void parseJsonEntities(JSONObject entities, Map<String, String> entityToChar) throws JSONException {
+        for (Iterator<String> it = entities.keys(); it.hasNext(); ) {
+            String entity = it.next();
+            String c = entities.getJSONObject(entity).getString("characters");
+            entityToChar.put(entity, c);
+        }
+    }
+
    static {
+        try {
+            byte[] encoded = Files.readAllBytes(Paths.get(Switchboard.getSwitchboard() != null ? Switchboard.getSwitchboard().appPath.getAbsolutePath() : ".", "defaults", "htmlEntities.json"));
+            JSONObject json = new JSONObject(new String(encoded, StandardCharsets.UTF_8));
+            parseJsonEntities(json.getJSONObject("xml"), HTML2UNICODE4XML);
+            parseJsonEntities(json.getJSONObject("html4"), HTML2UNICODE4HTML);
+            parseJsonEntities(json.getJSONObject("html5"), HTML2UNICODE4HTML);
+        } catch (IOException | JSONException e) {
+            e.printStackTrace();
+        }
        Character c;
        for (int i = 0; i < MAPPING4HTML.length; i += 2) {
-            c = Character.valueOf(MAPPING4HTML[i].charAt(0));
-            HTML2UNICODE4HTML.put(MAPPING4HTML[i + 1], c);
+            c = MAPPING4HTML[i].charAt(0);
            UNICODE2HTML4HTML.put(c, MAPPING4HTML[i + 1]);
        }
        for (int i = 0; i < MAPPING4XML.length; i += 2) {
-            c = Character.valueOf(MAPPING4XML[i].charAt(0));
-            HTML2UNICODE4XML.put(MAPPING4XML[i + 1], c);
+            c = MAPPING4XML[i].charAt(0);
            UNICODE2HTML4XML.put(c, MAPPING4XML[i + 1]);
        }
    }
@ -220,7 +249,6 @@ public final class CharacterCoding {

    /**
     * Replaces characters which have special representation in XML.
-     * @see #MAPPING4XML
     * @param text text with character to replace
     * @param amp true if ampersands shall be replaced, else false
     * @return text with replaced characters
@ -231,7 +259,6 @@ public final class CharacterCoding {

    /**
     * Replaces characters which have special representation in HTML.
-     * @see #MAPPING4HTML
     * @param text text with character to replace
     * @param amp true if ampersands shall be replaced, else false
     * @return text with replaced characters
@ -246,7 +273,7 @@ public final class CharacterCoding {
     * @param amp true if ampersands shall be replaced, else false
     * @param html true if characters shall be replaced for embedding in
     * HTML, false for XML (far more characters are replaced for HTML,
-     * compare {@link #MAPPING4HTML} with {@link #MAPPING4XML}
+     * see defaults/htmlEntities.json)
     * @return text with replaced characters
     */
    private static String unicode2html(
@ -291,7 +318,7 @@ public final class CharacterCoding {
        int p = 0, p1, q;
        final StringBuilder sb = new StringBuilder(text.length());
        String s;
-        Character r;
+        String r;
        while (p < text.length()) {
            p1 = text.indexOf('&', p);
            if (p1 < 0) {
@ -328,7 +355,7 @@ public final class CharacterCoding {
                continue;
            }
            if ((r = HTML2UNICODE4XML.get(s)) != null) {
-                sb.append(r.charValue());
+                sb.append(r);
                continue;
            }
            if ((r = HTML2UNICODE4HTML.get(s)) != null) {
--- a/source/net/yacy/document/parser/html/ContentScraper.java
+++ b/source/net/yacy/document/parser/html/ContentScraper.java
@ -769,18 +769,18 @@ public class ContentScraper extends AbstractScraper implements Scraper {
            final String content = tag.opts.getProperty("content", EMPTY_STRING);
            String name = tag.opts.getProperty("name", EMPTY_STRING);
            if (name.length() > 0) {
-                this.metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content));
+                this.metas.put(name.toLowerCase(), content);
                if (name.toLowerCase().equals("generator")) {
                    this.evaluationScores.match(Element.metagenerator, content);
                }
            }
            name = tag.opts.getProperty("http-equiv", EMPTY_STRING);
            if (name.length() > 0) {
-                this.metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content));
+                this.metas.put(name.toLowerCase(), content);
            }
            name = tag.opts.getProperty("property", EMPTY_STRING);
            if (name.length() > 0) {
-                this.metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content));
+                this.metas.put(name.toLowerCase(), content);
            }
        } else if (tag.name.equalsIgnoreCase("area")) {
            final String areatitle = cleanLine(tag.opts.getProperty("title", EMPTY_STRING));
@ -904,7 +904,6 @@ public class ContentScraper extends AbstractScraper implements Scraper {
        // System.out.println("ScrapeTag1: tag.tagname=" + tag.tagname + ", opts=" + tag.opts.toString() + ", text=" + UTF8.String(text));
        if (tag.name.equalsIgnoreCase("a") && tag.content.length() < 2048) {
            String href = tag.opts.getProperty("href", EMPTY_STRING);
-            href = CharacterCoding.html2unicode(href);
            AnchorURL url;
            if ((href.length() > 0) && ((url = absolutePath(href)) != null)) {
                if (followDenied()) {
--- a/source/net/yacy/kelondro/io/CharBuffer.java
+++ b/source/net/yacy/kelondro/io/CharBuffer.java
@ -32,6 +32,7 @@ import java.io.Writer;
 import java.util.Properties;

 import net.yacy.cora.document.encoding.UTF8;
+import net.yacy.document.parser.html.CharacterCoding;

 public final class CharBuffer extends Writer {

@ -444,6 +445,7 @@ public final class CharBuffer extends Writer {
            while ((pos < this.length) && (this.buffer[pos] <= 32)) pos++;
            // doublequotes are obligatory. However, we want to be fuzzy if they
            // are omitted
+            String value = null;
            if (pos >= this.length) {
                // error case: input ended too early
                break;
@ -453,7 +455,7 @@ public final class CharBuffer extends Writer {
                start = pos;
                while ((pos < this.length) && (this.buffer[pos] != doublequote)) pos++;
                if (pos >= this.length) break; // this is the case if we found no parent doublequote
-                p.setProperty(key, new String(this.buffer, start, pos - start).trim());
+                value = new String(this.buffer, start, pos - start).trim();
                pos++;
            } else if (this.buffer[pos] == singlequote) {
                // search next singlequote
@ -461,14 +463,15 @@ public final class CharBuffer extends Writer {
                start = pos;
                while ((pos < this.length) && (this.buffer[pos] != singlequote)) pos++;
                if (pos >= this.length) break; // this is the case if we found no parent singlequote
-                p.setProperty(key, new String(this.buffer, start, pos - start).trim());
+                value = new String(this.buffer, start, pos - start).trim();
                pos++;
            } else {
                // search next whitespace
                start = pos;
                while ((pos < this.length) && (this.buffer[pos] > 32)) pos++;
-                p.setProperty(key, new String(this.buffer, start, pos - start).trim());
+                value = new String(this.buffer, start, pos - start).trim();
            }
+            p.setProperty(key, CharacterCoding.html2unicode(value));
            // pos should point now to a whitespace: eat up spaces
            while ((pos < this.length) && (this.buffer[pos] <= 32)) pos++;
            // go on with next loop