mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
Merge pull request #405 from jfhs/jfhs/support-all-html-entities
Improve HTML entities support
This commit is contained in:
commit
42ea2a1c6f
2597
defaults/htmlEntities.json
Normal file
2597
defaults/htmlEntities.json
Normal file
File diff suppressed because it is too large
Load Diff
|
@ -24,7 +24,20 @@
|
|||
|
||||
package net.yacy.document.parser.html;
|
||||
|
||||
import net.yacy.search.Switchboard;
|
||||
import org.json.JSONArray;
|
||||
import org.json.JSONException;
|
||||
import org.json.JSONObject;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.Charset;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
|
@ -44,171 +57,187 @@ public final class CharacterCoding {
|
|||
|
||||
/** Special characters which have to be mapped for XML. */
|
||||
private static final String[] MAPPING4XML = {
|
||||
"\"", """, //quotation mark
|
||||
"\u003C", "<", //less than
|
||||
"\u003E", ">", //greater than
|
||||
"\"", """, //quotation mark
|
||||
"\u003C", "<", //less than
|
||||
"\u003E", ">", //greater than
|
||||
};
|
||||
|
||||
/** Special characters which have to be mapped for HTML. */
|
||||
private static final String[] MAPPING4HTML = {
|
||||
"\\", "\", // Backslash
|
||||
"\u005E", "^", // Caret
|
||||
"\\", "\", // Backslash
|
||||
"\u005E", "^", // Caret
|
||||
|
||||
"\u0060", "`", // Accent Grave `
|
||||
"\u007B", "{", // {
|
||||
"\u007C", "|", // |
|
||||
"\u007D", "}", // }
|
||||
"\u007E", "~", // ~
|
||||
"\u0060", "`", // Accent Grave `
|
||||
"\u007B", "{", // {
|
||||
"\u007C", "|", // |
|
||||
"\u007D", "}", // }
|
||||
"\u007E", "~", // ~
|
||||
|
||||
"\u0082", "‚",
|
||||
"\u0083", "ƒ",
|
||||
"\u0084", "„",
|
||||
"\u0085", "…",
|
||||
"\u0086", "†",
|
||||
"\u0087", "‡",
|
||||
"\u0088", "ˆ",
|
||||
"\u0089", "‰",
|
||||
"\u008A", "Š",
|
||||
"\u008B", "‹",
|
||||
"\u008C", "Œ",
|
||||
"\u008D", "",
|
||||
"\u008E", "Ž",
|
||||
"\u0082", "‚",
|
||||
"\u0083", "ƒ",
|
||||
"\u0084", "„",
|
||||
"\u0085", "…",
|
||||
"\u0086", "†",
|
||||
"\u0087", "‡",
|
||||
"\u0088", "ˆ",
|
||||
"\u0089", "‰",
|
||||
"\u008A", "Š",
|
||||
"\u008B", "‹",
|
||||
"\u008C", "Œ",
|
||||
"\u008D", "",
|
||||
"\u008E", "Ž",
|
||||
|
||||
"\u0091", "‘",
|
||||
"\u0092", "’",
|
||||
"\u0093", "“",
|
||||
"\u0094", "”",
|
||||
"\u0095", "•",
|
||||
"\u0096", "–",
|
||||
"\u0097", "—",
|
||||
"\u0098", "˜",
|
||||
"\u0099", "™",
|
||||
"\u009A", "š",
|
||||
"\u009B", "›",
|
||||
"\u009C", "œ",
|
||||
"\u009D", "",
|
||||
"\u009E", "ž",
|
||||
"\u009F", "Ÿ",
|
||||
"\u0091", "‘",
|
||||
"\u0092", "’",
|
||||
"\u0093", "“",
|
||||
"\u0094", "”",
|
||||
"\u0095", "•",
|
||||
"\u0096", "–",
|
||||
"\u0097", "—",
|
||||
"\u0098", "˜",
|
||||
"\u0099", "™",
|
||||
"\u009A", "š",
|
||||
"\u009B", "›",
|
||||
"\u009C", "œ",
|
||||
"\u009D", "",
|
||||
"\u009E", "ž",
|
||||
"\u009F", "Ÿ",
|
||||
|
||||
"\u00A1", "¡", //inverted (spanish) exclamation mark
|
||||
"\u00A2", "¢", //cent
|
||||
"\u00A3", "£", //pound
|
||||
"\u00A4", "¤", //currency
|
||||
"\u00A5", "¥", //yen
|
||||
"\u00A6", "¦", //broken vertical bar
|
||||
"\u00A7", "§", //section sign
|
||||
"\u00A8", "¨", //diaeresis (umlaut)
|
||||
"\u00A9", "©", //copyright sign
|
||||
"\u00AA", "ª", //feminine ordinal indicator
|
||||
"\u00AB", "«", //left-pointing double angle quotation mark
|
||||
"\u00AC", "¬", //not sign
|
||||
"\u00AD", "­", //soft hyphen
|
||||
"\u00AE", "®", //registered sign
|
||||
"\u00AF", "¯", //macron
|
||||
"\u00B0", "°", //degree sign
|
||||
"\u00B1", "±", //plus-minus sign
|
||||
"\u00B2", "²", //superscript two
|
||||
"\u00B3", "³", //superscript three
|
||||
"\u00B4", "´", //acute accent
|
||||
"\u00B5", "µ", //micro sign
|
||||
"\u00B6", "¶", //paragraph sign
|
||||
"\u00B7", "·", //middle dot
|
||||
"\u00B8", "¸", //cedilla
|
||||
"\u00B9", "¹", //superscript one
|
||||
"\u00BA", "º", //masculine ordinal indicator
|
||||
"\u00BB", "»", //right-pointing double angle quotation mark
|
||||
"\u00BC", "¼", //fraction 1/4
|
||||
"\u00BD", "½", //fraction 1/2
|
||||
"\u00BE", "¾", //fraction 3/4
|
||||
"\u00BF", "¿", //inverted (spanish) question mark
|
||||
"\u00C0", "À",
|
||||
"\u00C1", "Á",
|
||||
"\u00C2", "Â",
|
||||
"\u00C3", "Ã",
|
||||
"\u00C4", "Ä",
|
||||
"\u00C5", "Å",
|
||||
"\u00C6", "Æ",
|
||||
"\u00C7", "Ç",
|
||||
"\u00C8", "È",
|
||||
"\u00C9", "É",
|
||||
"\u00CA", "Ê",
|
||||
"\u00CB", "Ë",
|
||||
"\u00CC", "Ì",
|
||||
"\u00CD", "Í",
|
||||
"\u00CE", "Î",
|
||||
"\u00CF", "Ï",
|
||||
"\u00D0", "Ð",
|
||||
"\u00D1", "Ñ",
|
||||
"\u00D2", "Ò",
|
||||
"\u00D3", "Ó",
|
||||
"\u00D4", "Ô",
|
||||
"\u00D5", "Õ",
|
||||
"\u00D6", "Ö",
|
||||
"\u00D7", "×",
|
||||
"\u00D8", "Ø",
|
||||
"\u00D9", "Ù",
|
||||
"\u00DA", "Ú",
|
||||
"\u00DB", "Û",
|
||||
"\u00DC", "Ü",
|
||||
"\u00DD", "Ý",
|
||||
"\u00DE", "Þ",
|
||||
"\u00DF", "ß",
|
||||
"\u00E0", "à",
|
||||
"\u00E1", "á",
|
||||
"\u00E2", "â",
|
||||
"\u00E3", "ã",
|
||||
"\u00E4", "ä",
|
||||
"\u00E5", "å",
|
||||
"\u00E6", "æ",
|
||||
"\u00E7", "ç",
|
||||
"\u00E8", "è",
|
||||
"\u00E9", "é",
|
||||
"\u00EA", "ê",
|
||||
"\u00EB", "ë",
|
||||
"\u00EC", "ì",
|
||||
"\u00ED", "í",
|
||||
"\u00EE", "î",
|
||||
"\u00EF", "ï",
|
||||
"\u00F0", "ð",
|
||||
"\u00F1", "ñ",
|
||||
"\u00F2", "ò",
|
||||
"\u00F3", "ó",
|
||||
"\u00F4", "ô",
|
||||
"\u00F5", "õ",
|
||||
"\u00F6", "ö",
|
||||
"\u00F7", "÷",
|
||||
"\u00F8", "ø",
|
||||
"\u00F9", "ù",
|
||||
"\u00FA", "ú",
|
||||
"\u00FB", "û",
|
||||
"\u00FC", "ü",
|
||||
"\u00FD", "ý",
|
||||
"\u00FE", "þ",
|
||||
"\u00FF", "ÿ"
|
||||
"\u00A1", "¡", //inverted (spanish) exclamation mark
|
||||
"\u00A2", "¢", //cent
|
||||
"\u00A3", "£", //pound
|
||||
"\u00A4", "¤", //currency
|
||||
"\u00A5", "¥", //yen
|
||||
"\u00A6", "¦", //broken vertical bar
|
||||
"\u00A7", "§", //section sign
|
||||
"\u00A8", "¨", //diaeresis (umlaut)
|
||||
"\u00A9", "©", //copyright sign
|
||||
"\u00AA", "ª", //feminine ordinal indicator
|
||||
"\u00AB", "«", //left-pointing double angle quotation mark
|
||||
"\u00AC", "¬", //not sign
|
||||
"\u00AD", "­", //soft hyphen
|
||||
"\u00AE", "®", //registered sign
|
||||
"\u00AF", "¯", //macron
|
||||
"\u00B0", "°", //degree sign
|
||||
"\u00B1", "±", //plus-minus sign
|
||||
"\u00B2", "²", //superscript two
|
||||
"\u00B3", "³", //superscript three
|
||||
"\u00B4", "´", //acute accent
|
||||
"\u00B5", "µ", //micro sign
|
||||
"\u00B6", "¶", //paragraph sign
|
||||
"\u00B7", "·", //middle dot
|
||||
"\u00B8", "¸", //cedilla
|
||||
"\u00B9", "¹", //superscript one
|
||||
"\u00BA", "º", //masculine ordinal indicator
|
||||
"\u00BB", "»", //right-pointing double angle quotation mark
|
||||
"\u00BC", "¼", //fraction 1/4
|
||||
"\u00BD", "½", //fraction 1/2
|
||||
"\u00BE", "¾", //fraction 3/4
|
||||
"\u00BF", "¿", //inverted (spanish) question mark
|
||||
"\u00C0", "À",
|
||||
"\u00C1", "Á",
|
||||
"\u00C2", "Â",
|
||||
"\u00C3", "Ã",
|
||||
"\u00C4", "Ä",
|
||||
"\u00C5", "Å",
|
||||
"\u00C6", "Æ",
|
||||
"\u00C7", "Ç",
|
||||
"\u00C8", "È",
|
||||
"\u00C9", "É",
|
||||
"\u00CA", "Ê",
|
||||
"\u00CB", "Ë",
|
||||
"\u00CC", "Ì",
|
||||
"\u00CD", "Í",
|
||||
"\u00CE", "Î",
|
||||
"\u00CF", "Ï",
|
||||
"\u00D0", "Ð",
|
||||
"\u00D1", "Ñ",
|
||||
"\u00D2", "Ò",
|
||||
"\u00D3", "Ó",
|
||||
"\u00D4", "Ô",
|
||||
"\u00D5", "Õ",
|
||||
"\u00D6", "Ö",
|
||||
"\u00D7", "×",
|
||||
"\u00D8", "Ø",
|
||||
"\u00D9", "Ù",
|
||||
"\u00DA", "Ú",
|
||||
"\u00DB", "Û",
|
||||
"\u00DC", "Ü",
|
||||
"\u00DD", "Ý",
|
||||
"\u00DE", "Þ",
|
||||
"\u00DF", "ß",
|
||||
"\u00E0", "à",
|
||||
"\u00E1", "á",
|
||||
"\u00E2", "â",
|
||||
"\u00E3", "ã",
|
||||
"\u00E4", "ä",
|
||||
"\u00E5", "å",
|
||||
"\u00E6", "æ",
|
||||
"\u00E7", "ç",
|
||||
"\u00E8", "è",
|
||||
"\u00E9", "é",
|
||||
"\u00EA", "ê",
|
||||
"\u00EB", "ë",
|
||||
"\u00EC", "ì",
|
||||
"\u00ED", "í",
|
||||
"\u00EE", "î",
|
||||
"\u00EF", "ï",
|
||||
"\u00F0", "ð",
|
||||
"\u00F1", "ñ",
|
||||
"\u00F2", "ò",
|
||||
"\u00F3", "ó",
|
||||
"\u00F4", "ô",
|
||||
"\u00F5", "õ",
|
||||
"\u00F6", "ö",
|
||||
"\u00F7", "÷",
|
||||
"\u00F8", "ø",
|
||||
"\u00F9", "ù",
|
||||
"\u00FA", "ú",
|
||||
"\u00FB", "û",
|
||||
"\u00FC", "ü",
|
||||
"\u00FD", "ý",
|
||||
"\u00FE", "þ",
|
||||
"\u00FF", "ÿ"
|
||||
};
|
||||
|
||||
/** Mapping for XML to unicode. */
|
||||
private static final Map<String, Character> HTML2UNICODE4XML =
|
||||
new HashMap<String, Character>(MAPPING4XML.length * 2);
|
||||
private static final Map<String, String> HTML2UNICODE4XML =
|
||||
new HashMap<String, String>();
|
||||
/** Mapping for HTML to unicode. */
|
||||
private static final Map<String, Character> HTML2UNICODE4HTML =
|
||||
new HashMap<String, Character>(MAPPING4HTML.length * 2);
|
||||
private static final Map<String, String> HTML2UNICODE4HTML =
|
||||
new HashMap<String, String>();
|
||||
/** Mapping for unicode to XML. */
|
||||
private static final Map<Character, String> UNICODE2HTML4XML =
|
||||
new HashMap<Character, String>(MAPPING4XML.length * 2);
|
||||
/** Mapping for unicode to HTML. */
|
||||
private static final Map<Character, String> UNICODE2HTML4HTML =
|
||||
new HashMap<Character, String>(MAPPING4HTML.length * 2);
|
||||
new HashMap<Character, String>(MAPPING4XML.length * 2);
|
||||
|
||||
/**
 * Copies entity definitions from a JSON object into an entity-to-text map.
 * <p>
 * Every key of {@code entities} is treated as an entity name. Its value is
 * read as a nested JSON object whose {@code "characters"} member holds the
 * replacement text. The key is stored verbatim (no normalisation is done
 * here) together with that text. An existing entry with the same key is
 * overwritten, so when several tables are loaded into one map the last one
 * loaded wins.
 *
 * @param entities JSON object mapping each entity name to an object that
 *        has a {@code "characters"} string member
 * @param entityToChar destination map, modified in place; the value is a
 *        String, so a replacement is not limited to a single char
 * @throws JSONException propagated from the JSON accessors; presumably
 *         raised when a value is not an object or has no
 *         {@code "characters"} string - confirm against the org.json docs
 */
static void parseJsonEntities(JSONObject entities, Map<String, String> entityToChar) throws JSONException {
|
||||
for (Iterator<String> it = entities.keys(); it.hasNext(); ) {
|
||||
String entity = it.next();
|
||||
// replacement text for this entity, taken from its "characters" member
String c = entities.getJSONObject(entity).getString("characters");
|
||||
entityToChar.put(entity, c);
|
||||
}
|
||||
}
|
||||
|
||||
static {
|
||||
try {
|
||||
byte[] encoded = Files.readAllBytes(Paths.get(Switchboard.getSwitchboard() != null ? Switchboard.getSwitchboard().appPath.getAbsolutePath() : ".", "defaults", "htmlEntities.json"));
|
||||
JSONObject json = new JSONObject(new String(encoded, StandardCharsets.UTF_8));
|
||||
parseJsonEntities(json.getJSONObject("xml"), HTML2UNICODE4XML);
|
||||
parseJsonEntities(json.getJSONObject("html4"), HTML2UNICODE4HTML);
|
||||
parseJsonEntities(json.getJSONObject("html5"), HTML2UNICODE4HTML);
|
||||
} catch (IOException | JSONException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
Character c;
|
||||
for (int i = 0; i < MAPPING4HTML.length; i += 2) {
|
||||
c = Character.valueOf(MAPPING4HTML[i].charAt(0));
|
||||
HTML2UNICODE4HTML.put(MAPPING4HTML[i + 1], c);
|
||||
c = MAPPING4HTML[i].charAt(0);
|
||||
UNICODE2HTML4HTML.put(c, MAPPING4HTML[i + 1]);
|
||||
}
|
||||
for (int i = 0; i < MAPPING4XML.length; i += 2) {
|
||||
c = Character.valueOf(MAPPING4XML[i].charAt(0));
|
||||
HTML2UNICODE4XML.put(MAPPING4XML[i + 1], c);
|
||||
c = MAPPING4XML[i].charAt(0);
|
||||
UNICODE2HTML4XML.put(c, MAPPING4XML[i + 1]);
|
||||
}
|
||||
}
|
||||
|
@ -220,7 +249,6 @@ public final class CharacterCoding {
|
|||
|
||||
/**
|
||||
* Replaces characters which have special representation in XML.
|
||||
* @see #MAPPING4XML
|
||||
* @param text text with character to replace
|
||||
* @param amp true if ampersands shall be replaced, else false
|
||||
* @return text with replaced characters
|
||||
|
@ -231,7 +259,6 @@ public final class CharacterCoding {
|
|||
|
||||
/**
|
||||
* Replaces characters which have special representation in HTML.
|
||||
* @see #MAPPING4HTML
|
||||
* @param text text with character to replace
|
||||
* @param amp true if ampersands shall be replaced, else false
|
||||
* @return text with replaced characters
|
||||
|
@ -246,7 +273,7 @@ public final class CharacterCoding {
|
|||
* @param amp true if ampersands shall be replaced, else false
|
||||
* @param html true if characters shall be replaced for embedding in
|
||||
* HTML, false for XML (far more characters are replaced for HTML,
|
||||
* compare {@link #MAPPING4HTML} with {@link #MAPPING4XML}
|
||||
* see defaults/htmlEntities.json
|
||||
* @return text with replaced characters
|
||||
*/
|
||||
private static String unicode2html(
|
||||
|
@ -291,7 +318,7 @@ public final class CharacterCoding {
|
|||
int p = 0, p1, q;
|
||||
final StringBuilder sb = new StringBuilder(text.length());
|
||||
String s;
|
||||
Character r;
|
||||
String r;
|
||||
while (p < text.length()) {
|
||||
p1 = text.indexOf('&', p);
|
||||
if (p1 < 0) {
|
||||
|
@ -328,7 +355,7 @@ public final class CharacterCoding {
|
|||
continue;
|
||||
}
|
||||
if ((r = HTML2UNICODE4XML.get(s)) != null) {
|
||||
sb.append(r.charValue());
|
||||
sb.append(r);
|
||||
continue;
|
||||
}
|
||||
if ((r = HTML2UNICODE4HTML.get(s)) != null) {
|
||||
|
|
|
@ -769,18 +769,18 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
final String content = tag.opts.getProperty("content", EMPTY_STRING);
|
||||
String name = tag.opts.getProperty("name", EMPTY_STRING);
|
||||
if (name.length() > 0) {
|
||||
this.metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content));
|
||||
this.metas.put(name.toLowerCase(), content);
|
||||
if (name.toLowerCase().equals("generator")) {
|
||||
this.evaluationScores.match(Element.metagenerator, content);
|
||||
}
|
||||
}
|
||||
name = tag.opts.getProperty("http-equiv", EMPTY_STRING);
|
||||
if (name.length() > 0) {
|
||||
this.metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content));
|
||||
this.metas.put(name.toLowerCase(), content);
|
||||
}
|
||||
name = tag.opts.getProperty("property", EMPTY_STRING);
|
||||
if (name.length() > 0) {
|
||||
this.metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content));
|
||||
this.metas.put(name.toLowerCase(), content);
|
||||
}
|
||||
} else if (tag.name.equalsIgnoreCase("area")) {
|
||||
final String areatitle = cleanLine(tag.opts.getProperty("title", EMPTY_STRING));
|
||||
|
@ -904,7 +904,6 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
// System.out.println("ScrapeTag1: tag.tagname=" + tag.tagname + ", opts=" + tag.opts.toString() + ", text=" + UTF8.String(text));
|
||||
if (tag.name.equalsIgnoreCase("a") && tag.content.length() < 2048) {
|
||||
String href = tag.opts.getProperty("href", EMPTY_STRING);
|
||||
href = CharacterCoding.html2unicode(href);
|
||||
AnchorURL url;
|
||||
if ((href.length() > 0) && ((url = absolutePath(href)) != null)) {
|
||||
if (followDenied()) {
|
||||
|
|
|
@ -32,6 +32,7 @@ import java.io.Writer;
|
|||
import java.util.Properties;
|
||||
|
||||
import net.yacy.cora.document.encoding.UTF8;
|
||||
import net.yacy.document.parser.html.CharacterCoding;
|
||||
|
||||
public final class CharBuffer extends Writer {
|
||||
|
||||
|
@ -444,6 +445,7 @@ public final class CharBuffer extends Writer {
|
|||
while ((pos < this.length) && (this.buffer[pos] <= 32)) pos++;
|
||||
// doublequotes are obligatory. However, we want to be fuzzy if they
|
||||
// are omitted
|
||||
String value = null;
|
||||
if (pos >= this.length) {
|
||||
// error case: input ended too early
|
||||
break;
|
||||
|
@ -453,7 +455,7 @@ public final class CharBuffer extends Writer {
|
|||
start = pos;
|
||||
while ((pos < this.length) && (this.buffer[pos] != doublequote)) pos++;
|
||||
if (pos >= this.length) break; // this is the case if we found no parent doublequote
|
||||
p.setProperty(key, new String(this.buffer, start, pos - start).trim());
|
||||
value = new String(this.buffer, start, pos - start).trim();
|
||||
pos++;
|
||||
} else if (this.buffer[pos] == singlequote) {
|
||||
// search next singlequote
|
||||
|
@ -461,14 +463,15 @@ public final class CharBuffer extends Writer {
|
|||
start = pos;
|
||||
while ((pos < this.length) && (this.buffer[pos] != singlequote)) pos++;
|
||||
if (pos >= this.length) break; // this is the case if we found no parent singlequote
|
||||
p.setProperty(key, new String(this.buffer, start, pos - start).trim());
|
||||
value = new String(this.buffer, start, pos - start).trim();
|
||||
pos++;
|
||||
} else {
|
||||
// search next whitespace
|
||||
start = pos;
|
||||
while ((pos < this.length) && (this.buffer[pos] > 32)) pos++;
|
||||
p.setProperty(key, new String(this.buffer, start, pos - start).trim());
|
||||
value = new String(this.buffer, start, pos - start).trim();
|
||||
}
|
||||
p.setProperty(key, CharacterCoding.html2unicode(value));
|
||||
// pos should point now to a whitespace: eat up spaces
|
||||
while ((pos < this.length) && (this.buffer[pos] <= 32)) pos++;
|
||||
// go on with next loop
|
||||
|
|
Loading…
Reference in New Issue
Block a user