From 5159a090b0162fb40be5891a526e3c220f24224c Mon Sep 17 00:00:00 2001 From: orbiter Date: Sun, 3 Jul 2005 23:33:25 +0000 Subject: [PATCH] fixed parser bug with lowercase force (appeared in: http://spellbound.sourceforge.net/) git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@367 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/Settings_p.html | 3 +-- .../htmlFilter/htmlFilterAbstractScraper.java | 2 +- .../htmlFilterAbstractTransformer.java | 8 ++++---- .../htmlFilter/htmlFilterContentScraper.java | 19 +++++++++++++------ .../htmlFilterContentTransformer.java | 19 +++++++++++++------ .../htmlFilter/htmlFilterOutputStream.java | 4 ++-- source/de/anomic/http/httpd.java | 3 +++ .../de/anomic/plasma/plasmaSnippetCache.java | 2 +- yacy.blue | 1 + 9 files changed, 39 insertions(+), 22 deletions(-) diff --git a/htroot/Settings_p.html b/htroot/Settings_p.html index 97f4c0c87..7462df4db 100644 --- a/htroot/Settings_p.html +++ b/htroot/Settings_p.html @@ -139,8 +139,7 @@ Alternatively, you can simply set a virtual server port on your NAT/Server to en

This is the account that restricts access to the proxy function. You probably don't want to share the proxy to the internet, so you should set the IP-Number Access Domain to a pattern that corresponds to you local intranet. The default setting should be right in most cases. If you want, you can also set a proxy account so that every proxy user must authenticate first, but this is rather unusual.

- + diff --git a/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java b/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java index 591ca5881..bbf87c0e7 100644 --- a/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java +++ b/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java @@ -1 +1 @@ -// htmlFilterAbstractScraper.java // --------------------------- // (C) by Michael Peter Christen; mc@anomic.de // first published on http://www.anomic.de // Frankfurt, Germany, 2004 // last major change: 18.02.2004 // // You agree that the Author(s) is (are) not responsible for cost, // loss of data or any harm that may be caused by usage of this softare or // this documentation. The usage of this software is on your own risk. The // installation and usage (starting/running) of this software may allow other // people or application to access your computer and any attached devices and // is highly dependent on the configuration of the software which must be // done by the user of the software;the author(s) is (are) also // not responsible for proper configuration and usage of the software, even // if provoked by documentation provided together with the software. // // THE SOFTWARE THAT FOLLOWS AS ART OF PROGRAMMING BELOW THIS SECTION // IS PUBLISHED UNDER THE GPL AS DOCUMENTED IN THE FILE gpl.txt ASIDE THIS // FILE AND AS IN http://www.gnu.org/licenses/gpl.txt // ANY CHANGES TO THIS FILE ACCORDING TO THE GPL CAN BE DONE TO THE // LINES THAT FOLLOWS THIS COPYRIGHT NOTICE HERE, BUT CHANGES MUST NOT // BE DONE ABOVE OR INSIDE THE COPYRIGHT NOTICE. A RE-DISTRIBUTION // MUST CONTAIN THE INTACT AND UNCHANGED COPYRIGHT NOTICE. // CONTRIBUTIONS AND CHANGES TO THE PROGRAM CODE SHOULD BE MARKED AS SUCH. package de.anomic.htmlFilter; import java.util.HashSet; import java.util.HashMap; import java.util.Properties; import de.anomic.server.serverByteBuffer; public abstract class htmlFilterAbstractScraper implements htmlFilterScraper { public static final byte lb = (byte) '<'; public static final byte rb = (byte) '>'; public static final byte sl = (byte) '/'; private HashSet tags0; private HashSet tags1; // define a translation table for html character codings private static HashMap trans = new HashMap(300); static { trans.put(""", "\""); //Anführungszeichen oben trans.put("&", "&"); //Ampersand-Zeichen, kaufmännisches Und trans.put("<", "<"); //öffnende spitze Klammer trans.put(">", ">"); //schließende spitze Klammer trans.put(" ", " "); //Erzwungenes Leerzeichen trans.put("¡", "!"); //umgekehrtes Ausrufezeichen trans.put("¢", " cent "); //Cent-Zeichen trans.put("£", " pound "); //Pfund-Zeichen trans.put("¤", " currency "); //Währungs-Zeichen trans.put("¥", " yen "); //Yen-Zeichen trans.put("¦", " "); //durchbrochener Strich trans.put("§", " paragraph "); //Paragraph-Zeichen trans.put("¨", " "); //Pünktchen oben trans.put("©", " copyright "); //Copyright-Zeichen trans.put("ª", " "); //Ordinal-Zeichen weiblich trans.put("«", " "); //angewinkelte Anführungszeichen links trans.put("¬", " not "); //Verneinungs-Zeichen trans.put("­", "-"); //kurzer Trennstrich trans.put("®", " trademark "); //Registriermarke-Zeichen trans.put("¯", " "); //Überstrich trans.put("°", " degree "); //Grad-Zeichen trans.put("±", " +/- "); //Plusminus-Zeichen trans.put("²", " square "); //Hoch-2-Zeichen trans.put("³", " 3 "); //Hoch-3-Zeichen trans.put("´", " "); //Acute-Zeichen trans.put("µ", " micro "); //Mikro-Zeichen trans.put("¶", " paragraph "); //Absatz-Zeichen trans.put("·", " "); //Mittelpunkt trans.put("¸", " "); //Häkchen unten trans.put("¹", " "); //Hoch-1-Zeichen trans.put("º", " degree "); //Ordinal-Zeichen männlich trans.put("»", " "); //angewinkelte Anführungszeichen rechts trans.put("¼", " quarter "); //ein Viertel trans.put("½", " half "); //ein Halb trans.put("¾", " 3/4 "); //drei Viertel trans.put("¿", "?"); //umgekehrtes Fragezeichen trans.put("À", "A"); //A mit Accent grave trans.put("Á", "A"); //A mit Accent acute trans.put("Â", "A"); //A mit Circumflex trans.put("Ã", "A"); //A mit Tilde trans.put("Ä", "Ae"); //A Umlaut trans.put("Å", "A"); //A mit Ring trans.put("Æ", "A"); //A mit legiertem E trans.put("Ç", "C"); //C mit Häkchen trans.put("È", "E"); //E mit Accent grave trans.put("É", "E"); //E mit Accent acute trans.put("Ê", "E"); //E mit Circumflex trans.put("Ë", "E"); //E Umlaut trans.put("Ì", "I"); //I mit Accent grave trans.put("Í", "I"); //I mit Accent acute trans.put("Î", "I"); //I mit Circumflex trans.put("Ï", "I"); //I Umlaut trans.put("Ð", "D"); //Eth (isländisch) trans.put("Ñ", "N"); //N mit Tilde trans.put("Ò", "O"); //O mit Accent grave trans.put("Ó", "O"); //O mit Accent acute trans.put("Ô", "O"); //O mit Circumflex trans.put("Õ", "O"); //O mit Tilde trans.put("Ö", "Oe"); //O Umlaut trans.put("×", " times "); //Mal-Zeichen trans.put("Ø", "O"); //O mit Schrägstrich trans.put("Ù", "U"); //U mit Accent grave trans.put("Ú", "U"); //U mit Accent acute trans.put("Û", "U"); //U mit Circumflex trans.put("Ü", "Ue"); //U Umlaut trans.put("Ý", "Y"); //Y mit Accent acute trans.put("Þ", "P"); //THORN (isländisch) trans.put("ß", "ss"); //scharfes S trans.put("à", "a"); //a mit Accent grave trans.put("á", "a"); //a mit Accent acute trans.put("â", "a"); //a mit Circumflex trans.put("ã", "a"); //a mit Tilde trans.put("ä", "ae"); //a Umlaut trans.put("å", "a"); //a mit Ring trans.put("æ", "a"); //a mit legiertem e trans.put("ç", "c"); //c mit Häkchen trans.put("è", "e"); //e mit Accent grave trans.put("é", "e"); //e mit Accent acute trans.put("ê", "e"); //e mit Circumflex trans.put("ë", "e"); //e Umlaut trans.put("ì", "i"); //i mit Accent grave trans.put("í", "i"); //i mit Accent acute trans.put("î", "i"); //i mit Circumflex trans.put("ï", "i"); //i Umlaut trans.put("ð", "d"); //eth (isländisch) trans.put("ñ", "n"); //n mit Tilde trans.put("ò", "o"); //o mit Accent grave trans.put("ó", "o"); //o mit Accent acute trans.put("ô", "o"); //o mit Circumflex trans.put("õ", "o"); //o mit Tilde trans.put("ö", "oe"); //o Umlaut trans.put("÷", "%"); //Divisions-Zeichen trans.put("ø", "o"); //o mit Schrägstrich trans.put("ù", "u"); //u mit Accent grave trans.put("ú", "u"); //u mit Accent acute trans.put("û", "u"); //u mit Circumflex trans.put("ü", "ue"); //u Umlaut trans.put("ý", "y"); //y mit Accent acute trans.put("þ", "p"); //thorn (isländisch) trans.put("ÿ", "y"); //y Umlaut trans.put("Α", " Alpha "); //Alpha groß trans.put("α", " alpha "); //alpha klein trans.put("Β", " Beta "); //Beta groß trans.put("β", " beta "); //beta klein trans.put("Γ", " Gamma "); //Gamma groß trans.put("γ", " gamma "); //gamma klein trans.put("Δ", " Delta "); //Delta groß trans.put("δ", " delta "); //delta klein trans.put("Ε", " Epsilon "); //Epsilon groß trans.put("ε", " epsilon "); //epsilon klein trans.put("Ζ", " Zeta "); //Zeta groß trans.put("ζ", " zeta "); //zeta klein trans.put("Η", " Eta "); //Eta groß trans.put("η", " eta "); //eta klein trans.put("Θ", " Theta "); //Theta groß trans.put("θ", " theta "); //theta klein trans.put("Ι", " Iota "); //Iota groß trans.put("ι", " iota "); //iota klein trans.put("Κ", " Kappa "); //Kappa groß trans.put("κ", " kappa "); //kappa klein trans.put("Λ", " Lambda "); //Lambda groß trans.put("λ", " lambda "); //lambda klein trans.put("Μ", " Mu "); //Mu groß trans.put("μ", " mu "); //mu klein trans.put("Ν", " Nu "); //Nu groß trans.put("ν", " nu "); //nu klein trans.put("Ξ", " Xi "); //Xi groß trans.put("ξ", " xi "); //xi klein trans.put("Ο", " Omicron "); //Omicron groß trans.put("ο", " omicron "); //omicron klein trans.put("Π", " Pi "); //Pi groß trans.put("π", " pi "); //pi klein trans.put("Ρ", " Rho "); //Rho groß trans.put("ρ", " rho "); //rho klein trans.put("Σ", " Sigma "); //Sigma groß trans.put("ς", " sigma "); //sigmaf klein trans.put("σ", " sigma "); //sigma klein trans.put("Τ", " Tau "); //Tau groß trans.put("τ", " tau "); //tau klein trans.put("Υ", " Ypsilon "); //Upsilon groß trans.put("υ", " ypsilon "); //upsilon klein trans.put("Φ", " Phi "); //Phi groß trans.put("φ", " phi "); //phi klein trans.put("Χ", " Chi "); //Chi groß trans.put("χ", " chi "); //chi klein trans.put("Ψ", " Psi "); //Psi groß trans.put("ψ", " psi "); //psi klein trans.put("Ω", " Omega "); //Omega groß trans.put("ω", " omega "); //omega klein trans.put("ϑ", " theta "); //theta Symbol trans.put("ϒ", " ypsilon "); //upsilon mit Haken trans.put("ϖ", " pi "); //pi Symbol trans.put("∀", " for all "); //für alle trans.put("∂", " part of "); //teilweise trans.put("∃", " exists "); //existiert trans.put("∅", " null "); //leer trans.put("∇", " nabla "); //nabla trans.put("∈", " element of "); //Element von trans.put("∉", " not element of "); //kein Element von trans.put("∋", " contains "); //enthält als Element trans.put("∏", " product "); //Produkt trans.put("∑", " sum "); //Summe trans.put("−", " minus "); //minus trans.put("∗", " times "); //Asterisk trans.put("√", " sqare root "); //Quadratwurzel trans.put("∝", " proportional to "); //proportional zu trans.put("∞", " unlimited "); //unendlich trans.put("∠", " angle "); //Winkel trans.put("∧", " and "); //und trans.put("∨", " or "); //oder trans.put("∩", " "); //Schnittpunkt trans.put("∪", " unity "); //Einheit trans.put("∫", " integral "); //Integral trans.put("∴", " cause "); //deshalb trans.put("∼", " similar to "); //ähnlich wie trans.put("≅", " equal "); //annähernd gleich trans.put("≈", " equal "); //beinahe gleich trans.put("≠", " not equal "); //ungleich trans.put("≡", " identical "); //identisch mit trans.put("≤", " smaller or equal than "); //kleiner gleich trans.put("≥", " greater or equal than "); //größer gleich trans.put("⊂", " subset of "); //Untermenge von trans.put("⊃", " superset of "); //Obermenge von trans.put("⊄", " not subset of "); //keine Untermenge von trans.put("⊆", ""); //Untermenge von oder gleich mit trans.put("⊇", ""); //Obermenge von oder gleich mit trans.put("⊕", ""); //Direktsumme trans.put("⊗", ""); //Vektorprodukt trans.put("⊥", ""); //senkrecht zu trans.put("⋅", ""); //Punkt-Operator trans.put("◊", ""); //Raute trans.put("⌈", ""); //links oben trans.put("⌉", ""); //rechts oben trans.put("⌊", ""); //links unten trans.put("⌋", ""); //rechts unten trans.put("⟨", ""); //spitze Klammer links trans.put("⟩", ""); //spitze Klammer rechts trans.put("←", ""); //Pfeil links trans.put("↑", ""); //Pfeil oben trans.put("→", ""); //Pfeil rechts trans.put("↓", ""); //Pfeil unten trans.put("↔", ""); //Pfeil links/rechts trans.put("↵", ""); //Pfeil unten-Knick-links trans.put("⇐", ""); //Doppelpfeil links trans.put("⇑", ""); //Doppelpfeil oben trans.put("⇒", ""); //Doppelpfeil rechts trans.put("⇓", ""); //Doppelpfeil unten trans.put("⇔", ""); //Doppelpfeil links/rechts trans.put("•", ""); //Bullet-Zeichen trans.put("…", ""); //Horizontale Ellipse trans.put("′", ""); //Minutenzeichen trans.put("‾", ""); //Überstrich trans.put("⁄", ""); //Bruchstrich trans.put("℘", ""); //Weierstrass p trans.put("ℑ", ""); //Zeichen für "imaginär" trans.put("ℜ", ""); //Zeichen für "real" trans.put("™", ""); //Trademark-Zeichen trans.put("€", ""); //Euro-Zeichen trans.put("ℵ", ""); //Alef-Symbol trans.put("♠", ""); //Pik-Zeichen trans.put("♣", ""); //Kreuz-Zeichen trans.put("♥", ""); //Herz-Zeichen trans.put("♦", ""); //Karo-Zeichen trans.put(" ", ""); //Leerzeichen Breite n trans.put(" ", ""); //Leerzeichen Breite m trans.put(" ", ""); //Schmales Leerzeichen trans.put("‌", ""); //null breiter Nichtverbinder trans.put("‍", ""); //null breiter Verbinder trans.put("‎", ""); //links-nach-rechts-Zeichen trans.put("‏", ""); //rechts-nach-links-Zeichen trans.put("–", ""); //Gedankenstrich Breite n trans.put("—", ""); //Gedankenstrich Breite m trans.put("‘", ""); //einfaches Anführungszeichen links trans.put("’", ""); //einfaches Anführungszeichen rechts trans.put("‚", ""); //einfaches low-9-Zeichen trans.put("“", ""); //doppeltes Anführungszeichen links trans.put("”", ""); //doppeltes Anführungszeichen rechts trans.put("„", ""); //doppeltes low-9-Zeichen rechts trans.put("†", ""); //Kreuz trans.put("‡", ""); //Doppelkreuz trans.put("‰", ""); //zu tausend trans.put("‹", ""); //angewinkeltes einzelnes Anf.zeichen links trans.put("›", ""); //angewinkeltes einzelnes Anf.zeichen rechts } public htmlFilterAbstractScraper(HashSet tags0, HashSet tags1) { this.tags0 = tags0; this.tags1 = tags1; } public boolean isTag0(String tag) { return (tags0 != null) && (tags0.contains(tag)); } public boolean isTag1(String tag) { return (tags1 != null) && (tags1.contains(tag)); } //the 'missing' method that shall be implemented: public abstract void scrapeText(byte[] text); // the other methods must take into account to construct the return value correctly public void scrapeTag0(String tagname, Properties tagopts) { } public void scrapeTag1(String tagname, Properties tagopts, byte[] text) { } // string conversions private static String code_iso8859s(int c) { switch ((int) c & 0xff) { // german umlaute and ligaturen case 0xc4: return "AE"; case 0xd6: return "OE"; case 0xdc: return "UE"; case 0xe4: return "ae"; case 0xf6: return "oe"; case 0xfc: return "ue"; case 0xdf: return "ss"; // accent on letters; i.e. french characters case 0xc0: case 0xc1: case 0xc2: case 0xc3: case 0xc5: return "A"; case 0xc6: return "AE"; case 0xc7: return "C"; case 0xc8: case 0xc9: case 0xca: return "E"; case 0xcc: case 0xcd: case 0xce: case 0xcf: return "I"; case 0xd0: return "D"; case 0xd1: return "N"; case 0xd2: case 0xd3: case 0xd4: case 0xd5: case 0xd8: return "O"; case 0xd7: return "x"; case 0xd9: case 0xda: case 0xdb: return "U"; case 0xdd: return "Y"; case 0xde: return "p"; case 0xe0: case 0xe1: case 0xe2: case 0xe3: case 0xe5: return "a"; case 0xe6: return "ae"; case 0xe7: return "c"; case 0xe8: case 0xe9: case 0xea: return "e"; case 0xec: case 0xed: case 0xee: case 0xef: return "i"; case 0xf0: return "d"; case 0xf1: return "n"; case 0xf2: case 0xf3: case 0xf4: case 0xf5: case 0xf8: return "o"; case 0xf7: return "%"; case 0xf9: case 0xfa: case 0xfb: return "u"; case 0xfd: case 0xff: return "y"; case 0xfe: return "p"; // special characters case 0xa4: return " euro "; default: return null; } } public static serverByteBuffer convertUmlaute(serverByteBuffer bb) { serverByteBuffer t = new serverByteBuffer(bb.length() + 20); int b0, b1, b2; String z; int i = 0; while (i < bb.length()) { b0 = bb.byteAt(i) & 0xff; // check utf-8 encoding if ((b0 < 128) || (i + 1 == bb.length())) { t.append(b0); i++; } else { b1 = bb.byteAt(i + 1) & 0xff; if (b1 > 0x3f) { z = code_iso8859s(b0); i++; } else if ((b0 > 0xbf) && (b0 < 0xe0)) { z = code_iso8859s(((b0 & 0x1f) << 0x6) | (b1 & 0x3f)); i += 2; } else { if (i + 2 >= bb.length()) { z = null; i++; } else { b2 = bb.byteAt(i + 2) & 0xff; if (b2 > 0x3f) { z = code_iso8859s(b0); i++; } else { z = code_iso8859s(((b0 & 0xf) << 0xc) | ((b1 & 0x3f) << 0x6) | (b2 & 0x3f)); i += 3; } } } if (z == null) t.append(b0); else t.append(z); } } return t; } private static byte[] transscript(byte[] code) { String t = (String) trans.get(new String(code)); if (t == null) return new byte[0]; else return t.getBytes(); } protected static serverByteBuffer transscriptAll(serverByteBuffer bb) { int p0 = 0, p1; byte[] t; while ((p0 = bb.indexOf((byte) '&', p0)) >= 0) { p1 = bb.indexOf((byte) ';', p0); if (p1 >= 0) { t = transscript(bb.getBytes(p0, p1 + 1)); bb = new serverByteBuffer(bb.getBytes(0, p0), bb.length() + p0 - p1 + t.length).append(t).append(bb.getBytes(p1 + 1)); } else { bb = new serverByteBuffer(bb.getBytes(0, p0), bb.length()).append(bb.getBytes(p0 + 1)); } } t = null; return bb; } protected static serverByteBuffer stripAllTags(serverByteBuffer bb) { int p0 = 0, p1; while ((p0 = bb.indexOf(lb, p0)) >= 0) { p1 = bb.indexOf(rb, p0); if (p1 >= 0) { bb = new serverByteBuffer(bb.getBytes(0, p0), bb.length() + p0 - p1 + 1).trim().append((byte) 32).append(new serverByteBuffer(bb.getBytes(p1 + 1)).trim()); } else { bb = new serverByteBuffer(bb.getBytes(0, p0), bb.length()).trim().append(new serverByteBuffer(bb.getBytes(p0 + 1)).trim()); } } return bb.trim(); } public static serverByteBuffer stripAll(serverByteBuffer bb) { //return stripAllTags(s); return convertUmlaute(transscriptAll(stripAllTags(bb))); } public void close() { // free resources tags0 = null; tags1 = null; } public void finalize() { close(); } } \ No newline at end of file +// htmlFilterAbstractScraper.java // --------------------------- // (C) by Michael Peter Christen; mc@anomic.de // first published on http://www.anomic.de // Frankfurt, Germany, 2004 // last major change: 18.02.2004 // // You agree that the Author(s) is (are) not responsible for cost, // loss of data or any harm that may be caused by usage of this softare or // this documentation. The usage of this software is on your own risk. The // installation and usage (starting/running) of this software may allow other // people or application to access your computer and any attached devices and // is highly dependent on the configuration of the software which must be // done by the user of the software;the author(s) is (are) also // not responsible for proper configuration and usage of the software, even // if provoked by documentation provided together with the software. // // THE SOFTWARE THAT FOLLOWS AS ART OF PROGRAMMING BELOW THIS SECTION // IS PUBLISHED UNDER THE GPL AS DOCUMENTED IN THE FILE gpl.txt ASIDE THIS // FILE AND AS IN http://www.gnu.org/licenses/gpl.txt // ANY CHANGES TO THIS FILE ACCORDING TO THE GPL CAN BE DONE TO THE // LINES THAT FOLLOWS THIS COPYRIGHT NOTICE HERE, BUT CHANGES MUST NOT // BE DONE ABOVE OR INSIDE THE COPYRIGHT NOTICE. A RE-DISTRIBUTION // MUST CONTAIN THE INTACT AND UNCHANGED COPYRIGHT NOTICE. // CONTRIBUTIONS AND CHANGES TO THE PROGRAM CODE SHOULD BE MARKED AS SUCH. package de.anomic.htmlFilter; import java.util.TreeSet; import java.util.HashMap; import java.util.Properties; import de.anomic.server.serverByteBuffer; public abstract class htmlFilterAbstractScraper implements htmlFilterScraper { public static final byte lb = (byte) '<'; public static final byte rb = (byte) '>'; public static final byte sl = (byte) '/'; private TreeSet tags0; private TreeSet tags1; // define a translation table for html character codings private static HashMap trans = new HashMap(300); static { trans.put(""", "\""); //Anführungszeichen oben trans.put("&", "&"); //Ampersand-Zeichen, kaufmännisches Und trans.put("<", "<"); //öffnende spitze Klammer trans.put(">", ">"); //schließende spitze Klammer trans.put(" ", " "); //Erzwungenes Leerzeichen trans.put("¡", "!"); //umgekehrtes Ausrufezeichen trans.put("¢", " cent "); //Cent-Zeichen trans.put("£", " pound "); //Pfund-Zeichen trans.put("¤", " currency "); //Währungs-Zeichen trans.put("¥", " yen "); //Yen-Zeichen trans.put("¦", " "); //durchbrochener Strich trans.put("§", " paragraph "); //Paragraph-Zeichen trans.put("¨", " "); //Pünktchen oben trans.put("©", " copyright "); //Copyright-Zeichen trans.put("ª", " "); //Ordinal-Zeichen weiblich trans.put("«", " "); //angewinkelte Anführungszeichen links trans.put("¬", " not "); //Verneinungs-Zeichen trans.put("­", "-"); //kurzer Trennstrich trans.put("®", " trademark "); //Registriermarke-Zeichen trans.put("¯", " "); //Überstrich trans.put("°", " degree "); //Grad-Zeichen trans.put("±", " +/- "); //Plusminus-Zeichen trans.put("²", " square "); //Hoch-2-Zeichen trans.put("³", " 3 "); //Hoch-3-Zeichen trans.put("´", " "); //Acute-Zeichen trans.put("µ", " micro "); //Mikro-Zeichen trans.put("¶", " paragraph "); //Absatz-Zeichen trans.put("·", " "); //Mittelpunkt trans.put("¸", " "); //Häkchen unten trans.put("¹", " "); //Hoch-1-Zeichen trans.put("º", " degree "); //Ordinal-Zeichen männlich trans.put("»", " "); //angewinkelte Anführungszeichen rechts trans.put("¼", " quarter "); //ein Viertel trans.put("½", " half "); //ein Halb trans.put("¾", " 3/4 "); //drei Viertel trans.put("¿", "?"); //umgekehrtes Fragezeichen trans.put("À", "A"); //A mit Accent grave trans.put("Á", "A"); //A mit Accent acute trans.put("Â", "A"); //A mit Circumflex trans.put("Ã", "A"); //A mit Tilde trans.put("Ä", "Ae"); //A Umlaut trans.put("Å", "A"); //A mit Ring trans.put("Æ", "A"); //A mit legiertem E trans.put("Ç", "C"); //C mit Häkchen trans.put("È", "E"); //E mit Accent grave trans.put("É", "E"); //E mit Accent acute trans.put("Ê", "E"); //E mit Circumflex trans.put("Ë", "E"); //E Umlaut trans.put("Ì", "I"); //I mit Accent grave trans.put("Í", "I"); //I mit Accent acute trans.put("Î", "I"); //I mit Circumflex trans.put("Ï", "I"); //I Umlaut trans.put("Ð", "D"); //Eth (isländisch) trans.put("Ñ", "N"); //N mit Tilde trans.put("Ò", "O"); //O mit Accent grave trans.put("Ó", "O"); //O mit Accent acute trans.put("Ô", "O"); //O mit Circumflex trans.put("Õ", "O"); //O mit Tilde trans.put("Ö", "Oe"); //O Umlaut trans.put("×", " times "); //Mal-Zeichen trans.put("Ø", "O"); //O mit Schrägstrich trans.put("Ù", "U"); //U mit Accent grave trans.put("Ú", "U"); //U mit Accent acute trans.put("Û", "U"); //U mit Circumflex trans.put("Ü", "Ue"); //U Umlaut trans.put("Ý", "Y"); //Y mit Accent acute trans.put("Þ", "P"); //THORN (isländisch) trans.put("ß", "ss"); //scharfes S trans.put("à", "a"); //a mit Accent grave trans.put("á", "a"); //a mit Accent acute trans.put("â", "a"); //a mit Circumflex trans.put("ã", "a"); //a mit Tilde trans.put("ä", "ae"); //a Umlaut trans.put("å", "a"); //a mit Ring trans.put("æ", "a"); //a mit legiertem e trans.put("ç", "c"); //c mit Häkchen trans.put("è", "e"); //e mit Accent grave trans.put("é", "e"); //e mit Accent acute trans.put("ê", "e"); //e mit Circumflex trans.put("ë", "e"); //e Umlaut trans.put("ì", "i"); //i mit Accent grave trans.put("í", "i"); //i mit Accent acute trans.put("î", "i"); //i mit Circumflex trans.put("ï", "i"); //i Umlaut trans.put("ð", "d"); //eth (isländisch) trans.put("ñ", "n"); //n mit Tilde trans.put("ò", "o"); //o mit Accent grave trans.put("ó", "o"); //o mit Accent acute trans.put("ô", "o"); //o mit Circumflex trans.put("õ", "o"); //o mit Tilde trans.put("ö", "oe"); //o Umlaut trans.put("÷", "%"); //Divisions-Zeichen trans.put("ø", "o"); //o mit Schrägstrich trans.put("ù", "u"); //u mit Accent grave trans.put("ú", "u"); //u mit Accent acute trans.put("û", "u"); //u mit Circumflex trans.put("ü", "ue"); //u Umlaut trans.put("ý", "y"); //y mit Accent acute trans.put("þ", "p"); //thorn (isländisch) trans.put("ÿ", "y"); //y Umlaut trans.put("Α", " Alpha "); //Alpha groß trans.put("α", " alpha "); //alpha klein trans.put("Β", " Beta "); //Beta groß trans.put("β", " beta "); //beta klein trans.put("Γ", " Gamma "); //Gamma groß trans.put("γ", " gamma "); //gamma klein trans.put("Δ", " Delta "); //Delta groß trans.put("δ", " delta "); //delta klein trans.put("Ε", " Epsilon "); //Epsilon groß trans.put("ε", " epsilon "); //epsilon klein trans.put("Ζ", " Zeta "); //Zeta groß trans.put("ζ", " zeta "); //zeta klein trans.put("Η", " Eta "); //Eta groß trans.put("η", " eta "); //eta klein trans.put("Θ", " Theta "); //Theta groß trans.put("θ", " theta "); //theta klein trans.put("Ι", " Iota "); //Iota groß trans.put("ι", " iota "); //iota klein trans.put("Κ", " Kappa "); //Kappa groß trans.put("κ", " kappa "); //kappa klein trans.put("Λ", " Lambda "); //Lambda groß trans.put("λ", " lambda "); //lambda klein trans.put("Μ", " Mu "); //Mu groß trans.put("μ", " mu "); //mu klein trans.put("Ν", " Nu "); //Nu groß trans.put("ν", " nu "); //nu klein trans.put("Ξ", " Xi "); //Xi groß trans.put("ξ", " xi "); //xi klein trans.put("Ο", " Omicron "); //Omicron groß trans.put("ο", " omicron "); //omicron klein trans.put("Π", " Pi "); //Pi groß trans.put("π", " pi "); //pi klein trans.put("Ρ", " Rho "); //Rho groß trans.put("ρ", " rho "); //rho klein trans.put("Σ", " Sigma "); //Sigma groß trans.put("ς", " sigma "); //sigmaf klein trans.put("σ", " sigma "); //sigma klein trans.put("Τ", " Tau "); //Tau groß trans.put("τ", " tau "); //tau klein trans.put("Υ", " Ypsilon "); //Upsilon groß trans.put("υ", " ypsilon "); //upsilon klein trans.put("Φ", " Phi "); //Phi groß trans.put("φ", " phi "); //phi klein trans.put("Χ", " Chi "); //Chi groß trans.put("χ", " chi "); //chi klein trans.put("Ψ", " Psi "); //Psi groß trans.put("ψ", " psi "); //psi klein trans.put("Ω", " Omega "); //Omega groß trans.put("ω", " omega "); //omega klein trans.put("ϑ", " theta "); //theta Symbol trans.put("ϒ", " ypsilon "); //upsilon mit Haken trans.put("ϖ", " pi "); //pi Symbol trans.put("∀", " for all "); //für alle trans.put("∂", " part of "); //teilweise trans.put("∃", " exists "); //existiert trans.put("∅", " null "); //leer trans.put("∇", " nabla "); //nabla trans.put("∈", " element of "); //Element von trans.put("∉", " not element of "); //kein Element von trans.put("∋", " contains "); //enthält als Element trans.put("∏", " product "); //Produkt trans.put("∑", " sum "); //Summe trans.put("−", " minus "); //minus trans.put("∗", " times "); //Asterisk trans.put("√", " sqare root "); //Quadratwurzel trans.put("∝", " proportional to "); //proportional zu trans.put("∞", " unlimited "); //unendlich trans.put("∠", " angle "); //Winkel trans.put("∧", " and "); //und trans.put("∨", " or "); //oder trans.put("∩", " "); //Schnittpunkt trans.put("∪", " unity "); //Einheit trans.put("∫", " integral "); //Integral trans.put("∴", " cause "); //deshalb trans.put("∼", " similar to "); //ähnlich wie trans.put("≅", " equal "); //annähernd gleich trans.put("≈", " equal "); //beinahe gleich trans.put("≠", " not equal "); //ungleich trans.put("≡", " identical "); //identisch mit trans.put("≤", " smaller or equal than "); //kleiner gleich trans.put("≥", " greater or equal than "); //größer gleich trans.put("⊂", " subset of "); //Untermenge von trans.put("⊃", " superset of "); //Obermenge von trans.put("⊄", " not subset of "); //keine Untermenge von trans.put("⊆", ""); //Untermenge von oder gleich mit trans.put("⊇", ""); //Obermenge von oder gleich mit trans.put("⊕", ""); //Direktsumme trans.put("⊗", ""); //Vektorprodukt trans.put("⊥", ""); //senkrecht zu trans.put("⋅", ""); //Punkt-Operator trans.put("◊", ""); //Raute trans.put("⌈", ""); //links oben trans.put("⌉", ""); //rechts oben trans.put("⌊", ""); //links unten trans.put("⌋", ""); //rechts unten trans.put("⟨", ""); //spitze Klammer links trans.put("⟩", ""); //spitze Klammer rechts trans.put("←", ""); //Pfeil links trans.put("↑", ""); //Pfeil oben trans.put("→", ""); //Pfeil rechts trans.put("↓", ""); //Pfeil unten trans.put("↔", ""); //Pfeil links/rechts trans.put("↵", ""); //Pfeil unten-Knick-links trans.put("⇐", ""); //Doppelpfeil links trans.put("⇑", ""); //Doppelpfeil oben trans.put("⇒", ""); //Doppelpfeil rechts trans.put("⇓", ""); //Doppelpfeil unten trans.put("⇔", ""); //Doppelpfeil links/rechts trans.put("•", ""); //Bullet-Zeichen trans.put("…", ""); //Horizontale Ellipse trans.put("′", ""); //Minutenzeichen trans.put("‾", ""); //Überstrich trans.put("⁄", ""); //Bruchstrich trans.put("℘", ""); //Weierstrass p trans.put("ℑ", ""); //Zeichen für "imaginär" trans.put("ℜ", ""); //Zeichen für "real" trans.put("™", ""); //Trademark-Zeichen trans.put("€", ""); //Euro-Zeichen trans.put("ℵ", ""); //Alef-Symbol trans.put("♠", ""); //Pik-Zeichen trans.put("♣", ""); //Kreuz-Zeichen trans.put("♥", ""); //Herz-Zeichen trans.put("♦", ""); //Karo-Zeichen trans.put(" ", ""); //Leerzeichen Breite n trans.put(" ", ""); //Leerzeichen Breite m trans.put(" ", ""); //Schmales Leerzeichen trans.put("‌", ""); //null breiter Nichtverbinder trans.put("‍", ""); //null breiter Verbinder trans.put("‎", ""); //links-nach-rechts-Zeichen trans.put("‏", ""); //rechts-nach-links-Zeichen trans.put("–", ""); //Gedankenstrich Breite n trans.put("—", ""); //Gedankenstrich Breite m trans.put("‘", ""); //einfaches Anführungszeichen links trans.put("’", ""); //einfaches Anführungszeichen rechts trans.put("‚", ""); //einfaches low-9-Zeichen trans.put("“", ""); //doppeltes Anführungszeichen links trans.put("”", ""); //doppeltes Anführungszeichen rechts trans.put("„", ""); //doppeltes low-9-Zeichen rechts trans.put("†", ""); //Kreuz trans.put("‡", ""); //Doppelkreuz trans.put("‰", ""); //zu tausend trans.put("‹", ""); //angewinkeltes einzelnes Anf.zeichen links trans.put("›", ""); //angewinkeltes einzelnes Anf.zeichen rechts } public htmlFilterAbstractScraper(TreeSet tags0, TreeSet tags1) { this.tags0 = tags0; this.tags1 = tags1; } public boolean isTag0(String tag) { return (tags0 != null) && (tags0.contains(tag)); } public boolean isTag1(String tag) { return (tags1 != null) && (tags1.contains(tag)); } //the 'missing' method that shall be implemented: public abstract void scrapeText(byte[] text); // the other methods must take into account to construct the return value correctly public void scrapeTag0(String tagname, Properties tagopts) { } public void scrapeTag1(String tagname, Properties tagopts, byte[] text) { } // string conversions private static String code_iso8859s(int c) { switch ((int) c & 0xff) { // german umlaute and ligaturen case 0xc4: return "AE"; case 0xd6: return "OE"; case 0xdc: return "UE"; case 0xe4: return "ae"; case 0xf6: return "oe"; case 0xfc: return "ue"; case 0xdf: return "ss"; // accent on letters; i.e. french characters case 0xc0: case 0xc1: case 0xc2: case 0xc3: case 0xc5: return "A"; case 0xc6: return "AE"; case 0xc7: return "C"; case 0xc8: case 0xc9: case 0xca: return "E"; case 0xcc: case 0xcd: case 0xce: case 0xcf: return "I"; case 0xd0: return "D"; case 0xd1: return "N"; case 0xd2: case 0xd3: case 0xd4: case 0xd5: case 0xd8: return "O"; case 0xd7: return "x"; case 0xd9: case 0xda: case 0xdb: return "U"; case 0xdd: return "Y"; case 0xde: return "p"; case 0xe0: case 0xe1: case 0xe2: case 0xe3: case 0xe5: return "a"; case 0xe6: return "ae"; case 0xe7: return "c"; case 0xe8: case 0xe9: case 0xea: return "e"; case 0xec: case 0xed: case 0xee: case 0xef: return "i"; case 0xf0: return "d"; case 0xf1: return "n"; case 0xf2: case 0xf3: case 0xf4: case 0xf5: case 0xf8: return "o"; case 0xf7: return "%"; case 0xf9: case 0xfa: case 0xfb: return "u"; case 0xfd: case 0xff: return "y"; case 0xfe: return "p"; // special characters case 0xa4: return " euro "; default: return null; } } public static serverByteBuffer convertUmlaute(serverByteBuffer bb) { serverByteBuffer t = new serverByteBuffer(bb.length() + 20); int b0, b1, b2; String z; int i = 0; while (i < bb.length()) { b0 = bb.byteAt(i) & 0xff; // check utf-8 encoding if ((b0 < 128) || (i + 1 == bb.length())) { t.append(b0); i++; } else { b1 = bb.byteAt(i + 1) & 0xff; if (b1 > 0x3f) { z = code_iso8859s(b0); i++; } else if ((b0 > 0xbf) && (b0 < 0xe0)) { z = code_iso8859s(((b0 & 0x1f) << 0x6) | (b1 & 0x3f)); i += 2; } else { if (i + 2 >= bb.length()) { z = null; i++; } else { b2 = bb.byteAt(i + 2) & 0xff; if (b2 > 0x3f) { z = code_iso8859s(b0); i++; } else { z = code_iso8859s(((b0 & 0xf) << 0xc) | ((b1 & 0x3f) << 0x6) | (b2 & 0x3f)); i += 3; } } } if (z == null) t.append(b0); else t.append(z); } } return t; } private static byte[] transscript(byte[] code) { String t = (String) trans.get(new String(code)); if (t == null) return new byte[0]; else return t.getBytes(); } protected static serverByteBuffer transscriptAll(serverByteBuffer bb) { int p0 = 0, p1; byte[] t; while ((p0 = bb.indexOf((byte) '&', p0)) >= 0) { p1 = bb.indexOf((byte) ';', p0); if (p1 >= 0) { t = transscript(bb.getBytes(p0, p1 + 1)); bb = new serverByteBuffer(bb.getBytes(0, p0), bb.length() + p0 - p1 + t.length).append(t).append(bb.getBytes(p1 + 1)); } else { bb = new serverByteBuffer(bb.getBytes(0, p0), bb.length()).append(bb.getBytes(p0 + 1)); } } t = null; return bb; } protected static serverByteBuffer stripAllTags(serverByteBuffer bb) { int p0 = 0, p1; while ((p0 = bb.indexOf(lb, p0)) >= 0) { p1 = bb.indexOf(rb, p0); if (p1 >= 0) { bb = new serverByteBuffer(bb.getBytes(0, p0), bb.length() + p0 - p1 + 1).trim().append((byte) 32).append(new serverByteBuffer(bb.getBytes(p1 + 1)).trim()); } else { bb = new serverByteBuffer(bb.getBytes(0, p0), bb.length()).trim().append(new serverByteBuffer(bb.getBytes(p0 + 1)).trim()); } } return bb.trim(); } public static serverByteBuffer stripAll(serverByteBuffer bb) { //return stripAllTags(s); return convertUmlaute(transscriptAll(stripAllTags(bb))); } public void close() { // free resources tags0 = null; tags1 = null; } public void finalize() { close(); } } \ No newline at end of file diff --git a/source/de/anomic/htmlFilter/htmlFilterAbstractTransformer.java b/source/de/anomic/htmlFilter/htmlFilterAbstractTransformer.java index a22e84e27..3d36e4a36 100644 --- a/source/de/anomic/htmlFilter/htmlFilterAbstractTransformer.java +++ b/source/de/anomic/htmlFilter/htmlFilterAbstractTransformer.java @@ -40,15 +40,15 @@ package de.anomic.htmlFilter; -import java.util.HashSet; +import java.util.TreeSet; import java.util.Properties; public abstract class htmlFilterAbstractTransformer implements htmlFilterTransformer { - private HashSet tags0; - private HashSet tags1; + private TreeSet tags0; + private TreeSet tags1; - public htmlFilterAbstractTransformer(HashSet tags0, HashSet tags1) { + public htmlFilterAbstractTransformer(TreeSet tags0, TreeSet tags1) { this.tags0 = tags0; this.tags1 = tags1; } diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java index 89d5e6783..7fe778d79 100644 --- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java +++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java @@ -43,9 +43,11 @@ package de.anomic.htmlFilter; import java.net.URL; import java.net.MalformedURLException; import java.util.HashMap; -import java.util.HashSet; +import java.util.TreeSet; import java.util.Map; import java.util.Properties; +import java.util.Locale; +import java.text.Collator; import de.anomic.server.serverByteBuffer; @@ -54,15 +56,20 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen // statics: for initialisation of the HTMLFilterAbstractScraper - private static HashSet linkTags0; - private static HashSet linkTags1; - + private static TreeSet linkTags0; + private static TreeSet linkTags1; + private static final Collator insensitiveCollator = Collator.getInstance(Locale.US); static { - linkTags0 = new HashSet(); + insensitiveCollator.setStrength(Collator.SECONDARY); + insensitiveCollator.setDecomposition(Collator.NO_DECOMPOSITION); + } + + static { + linkTags0 = new TreeSet(insensitiveCollator); linkTags0.add("img"); linkTags0.add("base"); - linkTags1 = new HashSet(); + linkTags1 = new TreeSet(insensitiveCollator); linkTags1.add("a"); linkTags1.add("h1"); linkTags1.add("title"); diff --git a/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java b/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java index a744c0b9d..0d9358131 100644 --- a/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java +++ b/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java @@ -43,23 +43,30 @@ package de.anomic.htmlFilter; import java.io.BufferedReader; import java.io.File; import java.io.FileReader; -import java.util.HashSet; +import java.util.TreeSet; import java.util.Properties; import java.util.Vector; +import java.util.Locale; +import java.text.Collator; import de.anomic.server.serverByteBuffer; public class htmlFilterContentTransformer extends htmlFilterAbstractTransformer implements htmlFilterTransformer { // statics: for initialisation of the HTMLFilterAbstractTransformer - private static HashSet linkTags0; - private static HashSet linkTags1; - + private static TreeSet linkTags0; + private static TreeSet linkTags1; + private static final Collator insensitiveCollator = Collator.getInstance(Locale.US); static { - linkTags0 = new HashSet(); + insensitiveCollator.setStrength(Collator.SECONDARY); + insensitiveCollator.setDecomposition(Collator.NO_DECOMPOSITION); + } + + static { + linkTags0 = new TreeSet(insensitiveCollator); linkTags0.add("img"); - linkTags1 = new HashSet(); + linkTags1 = new TreeSet(insensitiveCollator); linkTags1.add("a"); } diff --git a/source/de/anomic/htmlFilter/htmlFilterOutputStream.java b/source/de/anomic/htmlFilter/htmlFilterOutputStream.java index 5408f9612..df6369973 100644 --- a/source/de/anomic/htmlFilter/htmlFilterOutputStream.java +++ b/source/de/anomic/htmlFilter/htmlFilterOutputStream.java @@ -254,14 +254,14 @@ public final class htmlFilterOutputStream extends OutputStream { if (in[1] == '/') { // a closing tag tagend = tagEnd(in, 2); - tag = new String(in, 2, tagend - 2).toLowerCase(); + tag = new String(in, 2, tagend - 2); byte[] text = new byte[in.length - tagend - 1]; System.arraycopy(in, tagend, text, 0, in.length - tagend - 1); return filterTag(tag, false, text, quotechar); } else { // an opening tag tagend = tagEnd(in, 1); - tag = new String(in, 1, tagend - 1).toLowerCase(); + tag = new String(in, 1, tagend - 1); byte[] text = new byte[in.length - tagend - 1]; System.arraycopy(in, tagend, text, 0, in.length - tagend - 1); return filterTag(tag, true, text, quotechar); diff --git a/source/de/anomic/http/httpd.java b/source/de/anomic/http/httpd.java index d7f509ce0..46e951125 100644 --- a/source/de/anomic/http/httpd.java +++ b/source/de/anomic/http/httpd.java @@ -223,6 +223,8 @@ public final class httpd implements serverHandler { int pos; while (st.hasMoreTokens()) { pattern = st.nextToken(); + if (key.matches(pattern)) return true; + /* pos = pattern.indexOf("*"); if (pos < 0) { // no wild card: exact match @@ -232,6 +234,7 @@ public final class httpd implements serverHandler { if ((key.startsWith(pattern.substring(0, pos))) && (key.endsWith(pattern.substring(pos + 1)))) return true; } + */ } return false; } diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java index 64fa0dd16..97fbe0a98 100644 --- a/source/de/anomic/plasma/plasmaSnippetCache.java +++ b/source/de/anomic/plasma/plasmaSnippetCache.java @@ -244,7 +244,7 @@ public class plasmaSnippetCache { hash = (String) j.next(); pos = (Integer) hs.get(hash); if (pos == null) { - remaininghashes.add(hash); + remaininghashes.add(new String(hash)); } else { p = pos.intValue(); if (p > maxpos) maxpos = p; diff --git a/yacy.blue b/yacy.blue index e69de29bb..7d64d50e8 100644 --- a/yacy.blue +++ b/yacy.blue @@ -0,0 +1 @@ +ebcblue
IP-Number filter:
IP-Number filter:
Account Name:      
Password: