mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
cleaned, Properties
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@867 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
parent
68aa215479
commit
afc5ef2819
|
@ -1,9 +1,12 @@
|
|||
// htmlFilterContentTransformer.java
|
||||
// htmlFilterContentTransformer.java
|
||||
// ---------------------------------
|
||||
// (C) by Michael Peter Christen; mc@anomic.de
|
||||
// first published on http://www.anomic.de
|
||||
// Frankfurt, Germany, 2004
|
||||
// last major change: 18.02.2004
|
||||
//
|
||||
// $LastChangedDate$
|
||||
// $LastChangedRevision$
|
||||
// $LastChangedBy$
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
|
@ -48,7 +51,6 @@ import java.util.ArrayList;
|
|||
import java.util.Locale;
|
||||
import java.util.Properties;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import de.anomic.server.serverByteBuffer;
|
||||
|
||||
public class htmlFilterContentTransformer extends htmlFilterAbstractTransformer implements htmlFilterTransformer {
|
||||
|
@ -56,88 +58,96 @@ public class htmlFilterContentTransformer extends htmlFilterAbstractTransformer
|
|||
// statics: for initialisation of the HTMLFilterAbstractTransformer
private static TreeSet linkTags0;
private static TreeSet linkTags1;

// Comparator for the tag-name sets below. SECONDARY strength makes the
// collator ignore tertiary differences (for Locale.US: upper/lower case).
private static final Collator insensitiveCollator = Collator.getInstance(Locale.US);
static {
    insensitiveCollator.setStrength(Collator.SECONDARY);
    insensitiveCollator.setDecomposition(Collator.NO_DECOMPOSITION);
}

static {
    // handed to the super constructor as first argument
    linkTags0 = new TreeSet(insensitiveCollator);
    linkTags0.add("img");

    // handed to the super constructor as second argument
    linkTags1 = new TreeSet(insensitiveCollator);
    linkTags1.add("a");
}

// lower-cased bluelist words; stays null until init(String) has been called
private static ArrayList bluelist = null;
|
||||
|
||||
/**
 * Creates a transformer; the statically configured tag sets
 * (linkTags0 and linkTags1) are passed on to the super constructor.
 */
public htmlFilterContentTransformer() {
    super(linkTags0, linkTags1);
}
|
||||
|
||||
public void init(String initarg) {
|
||||
//System.out.println("Transformer init: " + initarg);
|
||||
if (bluelist == null) {
|
||||
// here, the initarg is used to load a list of bluelisted words
|
||||
bluelist = new ArrayList();
|
||||
File f = new File(initarg);
|
||||
if ((f.exists()) && (f.canRead())) try {
|
||||
BufferedReader r = new BufferedReader(new FileReader(f));
|
||||
String s;
|
||||
while ((s = r.readLine()) != null) {
|
||||
if ((!(s.startsWith("#"))) && (s.length() > 0)) bluelist.add(s.toLowerCase());
|
||||
}
|
||||
r.close();
|
||||
} catch (Exception e) {
|
||||
}
|
||||
//if (bluelist.size() == 0) System.out.println("BLUELIST is empty");
|
||||
}
|
||||
// System.out.println("Transformer init: " + initarg);
|
||||
if (bluelist == null) {
|
||||
// here, the initarg is used to load a list of bluelisted words
|
||||
bluelist = new ArrayList();
|
||||
File f = new File(initarg);
|
||||
if (f.canRead()) {
|
||||
try {
|
||||
BufferedReader r = new BufferedReader(new FileReader(f));
|
||||
String s;
|
||||
while ((s = r.readLine()) != null) {
|
||||
if (!s.startsWith("#") && s.length() > 0) bluelist.add(s.toLowerCase());
|
||||
}
|
||||
r.close();
|
||||
} catch (Exception e) {
|
||||
}
|
||||
// if (bluelist.size() == 0) System.out.println("BLUELIST is empty");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public boolean isIdentityTransformer() {
|
||||
return bluelist.size() == 0;
|
||||
}
|
||||
|
||||
|
||||
private static byte[] genBlueLetters(int length) {
|
||||
serverByteBuffer bb = new serverByteBuffer(" <FONT COLOR=#0000FF>".getBytes());
|
||||
length = length / 2;
|
||||
if (length > 10) length = 7;
|
||||
while (length-- > 0) bb.append((byte) 'X');
|
||||
bb.append("</FONT> ".getBytes());
|
||||
return bb.getBytes();
|
||||
serverByteBuffer bb = new serverByteBuffer(" <FONT COLOR=#0000FF>".getBytes());
|
||||
length = length / 2;
|
||||
if (length > 10) length = 7;
|
||||
while (length-- > 0) {
|
||||
bb.append((byte) 'X');
|
||||
}
|
||||
bb.append("</FONT> ".getBytes());
|
||||
return bb.getBytes();
|
||||
}
|
||||
|
||||
private boolean hit(byte[] text) {
|
||||
if ((text == null) || (bluelist == null)) return false;
|
||||
String lc = new String(text).toLowerCase();
|
||||
for (int i = 0; i < bluelist.size(); i++) if (lc.indexOf((String) bluelist.get(i)) >= 0) return true;
|
||||
return false;
|
||||
if (text == null || bluelist == null) return false;
|
||||
String lc = new String(text).toLowerCase();
|
||||
for (int i = 0; i < bluelist.size(); i++) {
|
||||
if (lc.indexOf((String) bluelist.get(i)) >= 0) return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public byte[] transformText(byte[] text) {
|
||||
if (hit(text)) {
|
||||
//System.out.println("FILTERHIT: " + text);
|
||||
return genBlueLetters(text.length);
|
||||
} else
|
||||
return text;
|
||||
if (hit(text)) {
|
||||
// System.out.println("FILTERHIT: " + text);
|
||||
return genBlueLetters(text.length);
|
||||
} else {
|
||||
return text;
|
||||
}
|
||||
}
|
||||
|
||||
public byte[] transformTag0(String tagname, Properties tagopts, byte quotechar) {
|
||||
if (hit(tagopts.getProperty("src","").getBytes())) return genBlueLetters(5);
|
||||
if (hit(tagopts.getProperty("alt","").getBytes())) return genBlueLetters(5);
|
||||
return htmlFilterOutputStream.genTag0(tagname, tagopts, quotechar);
|
||||
if (hit(tagopts.getProperty("src","").getBytes())) return genBlueLetters(5);
|
||||
if (hit(tagopts.getProperty("alt","").getBytes())) return genBlueLetters(5);
|
||||
return htmlFilterOutputStream.genTag0(tagname, tagopts, quotechar);
|
||||
}
|
||||
|
||||
public byte[] transformTag1(String tagname, Properties tagopts, byte[] text, byte quotechar) {
|
||||
if (hit(tagopts.getProperty("href","").getBytes())) return genBlueLetters(text.length);
|
||||
if (hit(text)) return genBlueLetters(text.length);
|
||||
return htmlFilterOutputStream.genTag1(tagname, tagopts, text, quotechar);
|
||||
if (hit(tagopts.getProperty("href","").getBytes())) return genBlueLetters(text.length);
|
||||
if (hit(text)) return genBlueLetters(text.length);
|
||||
return htmlFilterOutputStream.genTag1(tagname, tagopts, text, quotechar);
|
||||
}
|
||||
|
||||
public void close() {
|
||||
// free resources
|
||||
super.close();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -1,9 +1,12 @@
|
|||
// htmlFilterOutputStream.java
|
||||
// htmlFilterOutputStream.java
|
||||
// ---------------------------
|
||||
// (C) by Michael Peter Christen; mc@anomic.de
|
||||
// first published on http://www.anomic.de
|
||||
// Frankfurt, Germany, 2004, 2005
|
||||
// last major change: 16.02.2005
|
||||
//
|
||||
// $LastChangedDate$
|
||||
// $LastChangedRevision$
|
||||
// $LastChangedBy$
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
|
@ -42,9 +45,8 @@
|
|||
This class implements an output stream. Any data written to that output
|
||||
is automatically parsed.
|
||||
After finishing with writing, the htmlFilter can be read out.
|
||||
|
||||
*/
|
||||
|
||||
*/
|
||||
|
||||
package de.anomic.htmlFilter;
|
||||
|
||||
|
@ -59,7 +61,6 @@ import java.net.URL;
|
|||
import java.util.ArrayList;
|
||||
import java.util.Enumeration;
|
||||
import java.util.Properties;
|
||||
|
||||
import de.anomic.server.serverByteBuffer;
|
||||
|
||||
public final class htmlFilterOutputStream extends OutputStream {
|
||||
|
@ -84,219 +85,225 @@ public final class htmlFilterOutputStream extends OutputStream {
|
|||
private boolean inScript;
|
||||
private boolean binaryUnsuspect;
|
||||
private boolean passbyIfBinarySuspect;
|
||||
|
||||
/**
 * Creates a filtering output stream.
 *
 * @param out                   stream that receives the filtered bytes;
 *                              the visible code checks it for null before
 *                              writing, flushing and closing
 * @param scraper               receives scraped text and tags, may be null
 * @param transformer           rewrites text and tags, may be null
 * @param passbyIfBinarySuspect stored flag that is consulted once a byte
 *                              hints at binary content
 */
public htmlFilterOutputStream(OutputStream out, htmlFilterScraper scraper,
                              htmlFilterTransformer transformer,
                              boolean passbyIfBinarySuspect) {
    this.out           = out;
    this.scraper       = scraper;
    this.transformer   = transformer;
    this.buffer        = new serverByteBuffer(1024);

    // no tag is being collected yet
    this.filterTag     = null;
    this.filterOpts    = null;
    this.filterCont    = null;

    // parser state
    this.inSingleQuote = false;
    this.inDoubleQuote = false;
    this.inComment     = false;
    this.inScript      = false;

    // nothing written so far, so nothing looks binary
    this.binaryUnsuspect = true;
    this.passbyIfBinarySuspect = passbyIfBinarySuspect;
}
|
||||
|
||||
|
||||
public static byte[] genTag0raw(String tagname, boolean opening, byte[] tagopts) {
|
||||
serverByteBuffer bb = new serverByteBuffer(tagname.length() + tagopts.length + 3);
|
||||
bb.append((byte) '<');
|
||||
if (!(opening)) bb.append((byte) '/');
|
||||
bb.append(tagname.getBytes());
|
||||
if (tagopts.length > 0) {
|
||||
//if (tagopts[0] == (byte) 32)
|
||||
bb.append(tagopts);
|
||||
//else bb.append((byte) 32).append(tagopts);
|
||||
}
|
||||
bb.append((byte) '>');
|
||||
return bb.getBytes();
|
||||
serverByteBuffer bb = new serverByteBuffer(tagname.length() + tagopts.length + 3);
|
||||
bb.append((byte) '<');
|
||||
if (!opening) {
|
||||
bb.append((byte) '/');
|
||||
}
|
||||
bb.append(tagname.getBytes());
|
||||
if (tagopts.length > 0) {
|
||||
// if (tagopts[0] == (byte) 32)
|
||||
bb.append(tagopts);
|
||||
// else bb.append((byte) 32).append(tagopts);
|
||||
}
|
||||
bb.append((byte) '>');
|
||||
return bb.getBytes();
|
||||
}
|
||||
|
||||
public static byte[] genTag1raw(String tagname, byte[] tagopts, byte[] text) {
|
||||
serverByteBuffer bb = new serverByteBuffer(2 * tagname.length() + tagopts.length + text.length + 5);
|
||||
bb.append((byte) '<').append(tagname.getBytes());
|
||||
if (tagopts.length > 0) {
|
||||
//if (tagopts[0] == (byte) 32)
|
||||
bb.append(tagopts);
|
||||
//else bb.append((byte) 32).append(tagopts);
|
||||
}
|
||||
bb.append((byte) '>');
|
||||
bb.append(text);
|
||||
bb.append((byte) '<').append((byte) '/').append(tagname.getBytes()).append((byte) '>');
|
||||
return bb.getBytes();
|
||||
serverByteBuffer bb = new serverByteBuffer(2 * tagname.length() + tagopts.length + text.length + 5);
|
||||
bb.append((byte) '<').append(tagname.getBytes());
|
||||
if (tagopts.length > 0) {
|
||||
// if (tagopts[0] == (byte) 32)
|
||||
bb.append(tagopts);
|
||||
// else bb.append((byte) 32).append(tagopts);
|
||||
}
|
||||
bb.append((byte) '>');
|
||||
bb.append(text);
|
||||
bb.append((byte) '<').append((byte) '/').append(tagname.getBytes()).append((byte) '>');
|
||||
return bb.getBytes();
|
||||
}
|
||||
|
||||
public static byte[] genTag0(String tagname, Properties tagopts, byte quotechar) {
|
||||
byte[] tagoptsx = (tagopts.size() == 0) ? null : genOpts(tagopts, quotechar);
|
||||
serverByteBuffer bb = new serverByteBuffer(tagname.length() + ((tagoptsx == null) ? 0 : (tagoptsx.length + 1)) + tagname.length() + 2).append((byte) '<').append(tagname.getBytes());
|
||||
if (tagoptsx != null) bb = bb.append((byte) 32).append(tagoptsx);
|
||||
bb = bb.append((byte) '>');
|
||||
return bb.getBytes();
|
||||
serverByteBuffer bb = new serverByteBuffer(tagname.length() + ((tagoptsx == null) ? 0 : (tagoptsx.length + 1)) + tagname.length() + 2).append((byte) '<').append(tagname.getBytes());
|
||||
if (tagoptsx != null) {
|
||||
bb = bb.append((byte) 32).append(tagoptsx);
|
||||
}
|
||||
bb = bb.append((byte) '>');
|
||||
return bb.getBytes();
|
||||
}
|
||||
|
||||
public static byte[] genTag1(String tagname, Properties tagopts, byte[] text, byte quotechar) {
|
||||
byte[] gt0 = genTag0(tagname, tagopts, quotechar);
|
||||
return new serverByteBuffer(gt0, gt0.length + text.length + tagname.length() + 3).append(text).append(("</" + tagname + ">").getBytes()).getBytes();
|
||||
return new serverByteBuffer(gt0, gt0.length + text.length + tagname.length() + 3).append(text).append(("</" + tagname + ">").getBytes()).getBytes();
|
||||
}
|
||||
|
||||
// a helper method for pretty-printing of properties for html tags
|
||||
public static byte[] genOpts(Properties prop, byte quotechar) {
|
||||
Enumeration e = prop.propertyNames();
|
||||
serverByteBuffer bb = new serverByteBuffer(prop.size() * 40);
|
||||
String key;
|
||||
while (e.hasMoreElements()) {
|
||||
key = (String) e.nextElement();
|
||||
bb = bb.append((byte) 32).append(key.getBytes()).append((byte) '=');
|
||||
bb = bb.append(quotechar).append(prop.getProperty(key).getBytes()).append(quotechar);
|
||||
}
|
||||
if (bb.length() > 0) return bb.getBytes(1); else return bb.getBytes();
|
||||
Enumeration e = prop.propertyNames();
|
||||
serverByteBuffer bb = new serverByteBuffer(prop.size() * 40);
|
||||
String key;
|
||||
while (e.hasMoreElements()) {
|
||||
key = (String) e.nextElement();
|
||||
bb = bb.append((byte) 32).append(key.getBytes()).append((byte) '=');
|
||||
bb = bb.append(quotechar).append(prop.getProperty(key).getBytes()).append(quotechar);
|
||||
}
|
||||
if (bb.length() > 0) return bb.getBytes(1); else return bb.getBytes();
|
||||
}
|
||||
|
||||
private byte[] filterTag(String tag, boolean opening, byte[] content, byte quotechar) {
|
||||
//System.out.println("FILTER1: filterTag=" + ((filterTag == null) ? "null" : filterTag) + ", tag=" + tag + ", opening=" + ((opening) ? "true" : "false") + ", content=" + new String(content)); // debug
|
||||
if (filterTag == null) {
|
||||
// we are not collection tag text
|
||||
if (tag == null) {
|
||||
// and this is not a tag opener/closer
|
||||
if (scraper != null) scraper.scrapeText(content);
|
||||
if (transformer != null) return transformer.transformText(content); else return content;
|
||||
} else {
|
||||
// we have a new tag
|
||||
if (opening) {
|
||||
if ((scraper != null) && (scraper.isTag0(tag))) {
|
||||
// this single tag is collected at once here
|
||||
scraper.scrapeTag0(tag, new serverByteBuffer(content).propParser());
|
||||
}
|
||||
if ((transformer != null) && (transformer.isTag0(tag))) {
|
||||
// this single tag is collected at once here
|
||||
return transformer.transformTag0(tag, new serverByteBuffer(content).propParser(), quotechar);
|
||||
} else if (((scraper != null) && (scraper.isTag1(tag))) ||
|
||||
((transformer != null) && (transformer.isTag1(tag)))) {
|
||||
// ok, start collecting
|
||||
filterTag = tag;
|
||||
filterOpts = new serverByteBuffer(content).propParser();
|
||||
filterCont = new serverByteBuffer();
|
||||
return new byte[0];
|
||||
} else {
|
||||
// we ignore that thing and return it again
|
||||
return genTag0raw(tag, true, content);
|
||||
}
|
||||
} else {
|
||||
// we ignore that thing and return it again
|
||||
return genTag0raw(tag, false, content);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// we are collection tag text for the tag 'filterTag'
|
||||
if (tag == null) {
|
||||
// go on collecting content
|
||||
if (scraper != null) scraper.scrapeText(content);
|
||||
if (transformer != null)
|
||||
filterCont.append(transformer.transformText(content));
|
||||
else
|
||||
filterCont.append(content);
|
||||
return new byte[0];
|
||||
} else {
|
||||
// it's a tag! which one?
|
||||
if ((opening) || (!(tag.equals(filterTag)))) {
|
||||
// this tag is not our concern. just add it
|
||||
filterCont.append(genTag0raw(tag, opening, content));
|
||||
return new byte[0];
|
||||
} else {
|
||||
// it's our closing tag! return complete result.
|
||||
byte[] ret;
|
||||
if (scraper != null) scraper.scrapeTag1(filterTag, filterOpts, filterCont.getBytes());
|
||||
if (transformer != null)
|
||||
ret = transformer.transformTag1(filterTag, filterOpts, filterCont.getBytes(), quotechar);
|
||||
else
|
||||
ret = genTag1(filterTag, filterOpts, filterCont.getBytes(), quotechar);
|
||||
filterTag = null;
|
||||
filterOpts = null;
|
||||
filterCont = null;
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
}
|
||||
// System.out.println("FILTER1: filterTag=" + ((filterTag == null) ? "null" : filterTag) + ", tag=" + tag + ", opening=" + ((opening) ? "true" : "false") + ", content=" + new String(content)); // debug
|
||||
if (filterTag == null) {
|
||||
// we are not collection tag text
|
||||
if (tag == null) {
|
||||
// and this is not a tag opener/closer
|
||||
if (scraper != null) scraper.scrapeText(content);
|
||||
if (transformer != null) return transformer.transformText(content); else return content;
|
||||
} else {
|
||||
// we have a new tag
|
||||
if (opening) {
|
||||
if ((scraper != null) && (scraper.isTag0(tag))) {
|
||||
// this single tag is collected at once here
|
||||
scraper.scrapeTag0(tag, new serverByteBuffer(content).propParser());
|
||||
}
|
||||
if ((transformer != null) && (transformer.isTag0(tag))) {
|
||||
// this single tag is collected at once here
|
||||
return transformer.transformTag0(tag, new serverByteBuffer(content).propParser(), quotechar);
|
||||
} else if (((scraper != null) && (scraper.isTag1(tag))) ||
|
||||
((transformer != null) && (transformer.isTag1(tag)))) {
|
||||
// ok, start collecting
|
||||
filterTag = tag;
|
||||
filterOpts = new serverByteBuffer(content).propParser();
|
||||
filterCont = new serverByteBuffer();
|
||||
return new byte[0];
|
||||
} else {
|
||||
// we ignore that thing and return it again
|
||||
return genTag0raw(tag, true, content);
|
||||
}
|
||||
} else {
|
||||
// we ignore that thing and return it again
|
||||
return genTag0raw(tag, false, content);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// we are collection tag text for the tag 'filterTag'
|
||||
if (tag == null) {
|
||||
// go on collecting content
|
||||
if (scraper != null) scraper.scrapeText(content);
|
||||
if (transformer != null) {
|
||||
filterCont.append(transformer.transformText(content));
|
||||
} else {
|
||||
filterCont.append(content);
|
||||
}
|
||||
return new byte[0];
|
||||
} else {
|
||||
// it's a tag! which one?
|
||||
if ((opening) || (!(tag.equals(filterTag)))) {
|
||||
// this tag is not our concern. just add it
|
||||
filterCont.append(genTag0raw(tag, opening, content));
|
||||
return new byte[0];
|
||||
} else {
|
||||
// it's our closing tag! return complete result.
|
||||
byte[] ret;
|
||||
if (scraper != null) scraper.scrapeTag1(filterTag, filterOpts, filterCont.getBytes());
|
||||
if (transformer != null) {
|
||||
ret = transformer.transformTag1(filterTag, filterOpts, filterCont.getBytes(), quotechar);
|
||||
} else {
|
||||
ret = genTag1(filterTag, filterOpts, filterCont.getBytes(), quotechar);
|
||||
}
|
||||
filterTag = null;
|
||||
filterOpts = null;
|
||||
filterCont = null;
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private byte[] filterFinalize(byte quotechar) {
|
||||
if (filterTag == null) {
|
||||
return new byte[0];
|
||||
} else {
|
||||
// it's our closing tag! return complete result.
|
||||
byte[] ret;
|
||||
if (scraper != null) scraper.scrapeTag1(filterTag, filterOpts, filterCont.getBytes());
|
||||
if (transformer != null)
|
||||
ret = transformer.transformTag1(filterTag, filterOpts, filterCont.getBytes(), quotechar);
|
||||
else
|
||||
ret = genTag1(filterTag, filterOpts, filterCont.getBytes(), quotechar);
|
||||
filterTag = null;
|
||||
filterOpts = null;
|
||||
filterCont = null;
|
||||
return ret;
|
||||
}
|
||||
if (filterTag == null) {
|
||||
return new byte[0];
|
||||
} else {
|
||||
// it's our closing tag! return complete result.
|
||||
byte[] ret;
|
||||
if (scraper != null) scraper.scrapeTag1(filterTag, filterOpts, filterCont.getBytes());
|
||||
if (transformer != null) {
|
||||
ret = transformer.transformTag1(filterTag, filterOpts, filterCont.getBytes(), quotechar);
|
||||
} else {
|
||||
ret = genTag1(filterTag, filterOpts, filterCont.getBytes(), quotechar);
|
||||
}
|
||||
filterTag = null;
|
||||
filterOpts = null;
|
||||
filterCont = null;
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
private byte[] filterSentence(byte[] in, byte quotechar) {
|
||||
if (in.length == 0) return in;
|
||||
//System.out.println("FILTER0: " + new String(in)); // debug
|
||||
// scan the string and parse structure
|
||||
if ((in.length > 2) && (in[0] == lb)) {
|
||||
// a tag
|
||||
String tag;
|
||||
int tagend;
|
||||
if (in[1] == '/') {
|
||||
// a closing tag
|
||||
tagend = tagEnd(in, 2);
|
||||
tag = new String(in, 2, tagend - 2);
|
||||
byte[] text = new byte[in.length - tagend - 1];
|
||||
System.arraycopy(in, tagend, text, 0, in.length - tagend - 1);
|
||||
return filterTag(tag, false, text, quotechar);
|
||||
} else {
|
||||
// an opening tag
|
||||
tagend = tagEnd(in, 1);
|
||||
tag = new String(in, 1, tagend - 1);
|
||||
byte[] text = new byte[in.length - tagend - 1];
|
||||
System.arraycopy(in, tagend, text, 0, in.length - tagend - 1);
|
||||
return filterTag(tag, true, text, quotechar);
|
||||
}
|
||||
} else {
|
||||
// a text
|
||||
return filterTag(null, true, in, quotechar);
|
||||
}
|
||||
if (in.length == 0) return in;
|
||||
// System.out.println("FILTER0: " + new String(in)); // debug
|
||||
// scan the string and parse structure
|
||||
if (in.length > 2 && in[0] == lb) {
|
||||
// a tag
|
||||
String tag;
|
||||
int tagend;
|
||||
if (in[1] == '/') {
|
||||
// a closing tag
|
||||
tagend = tagEnd(in, 2);
|
||||
tag = new String(in, 2, tagend - 2);
|
||||
byte[] text = new byte[in.length - tagend - 1];
|
||||
System.arraycopy(in, tagend, text, 0, in.length - tagend - 1);
|
||||
return filterTag(tag, false, text, quotechar);
|
||||
} else {
|
||||
// an opening tag
|
||||
tagend = tagEnd(in, 1);
|
||||
tag = new String(in, 1, tagend - 1);
|
||||
byte[] text = new byte[in.length - tagend - 1];
|
||||
System.arraycopy(in, tagend, text, 0, in.length - tagend - 1);
|
||||
return filterTag(tag, true, text, quotechar);
|
||||
}
|
||||
} else {
|
||||
// a text
|
||||
return filterTag(null, true, in, quotechar);
|
||||
}
|
||||
}
|
||||
|
||||
private static int tagEnd(byte[] tag, int start) {
|
||||
char c;
|
||||
for (int i = start; i < tag.length; i++) {
|
||||
c = (char) tag[i];
|
||||
if ((c != '!') && (c != '-') &&
|
||||
((c < '0') || (c > '9')) &&
|
||||
((c < 'a') || (c > 'z')) &&
|
||||
((c < 'A') || (c > 'Z'))
|
||||
) return i;
|
||||
}
|
||||
return tag.length - 1;
|
||||
char c;
|
||||
for (int i = start; i < tag.length; i++) {
|
||||
c = (char) tag[i];
|
||||
if (c != '!' && c != '-' &&
|
||||
(c < '0' || c > '9') &&
|
||||
(c < 'a' || c > 'z') &&
|
||||
(c < 'A' || c > 'Z')
|
||||
) return i;
|
||||
}
|
||||
return tag.length - 1;
|
||||
}
|
||||
|
||||
public void write(int b) throws IOException {
|
||||
write((byte) (b & 0xff));
|
||||
write((byte) (b & 0xff));
|
||||
}
|
||||
|
||||
private void write(byte b) throws IOException {
|
||||
//System.out.println((char) b);
|
||||
// System.out.println((char) b);
|
||||
if ((binaryUnsuspect) && (binaryHint(b))) {
|
||||
binaryUnsuspect = false;
|
||||
if (passbyIfBinarySuspect) finalize();
|
||||
}
|
||||
|
||||
if ((binaryUnsuspect) || (!(passbyIfBinarySuspect))) {
|
||||
|
||||
if (binaryUnsuspect || !passbyIfBinarySuspect) {
|
||||
byte[] filtered;
|
||||
if (inSingleQuote) {
|
||||
buffer.append(b);
|
||||
|
@ -306,7 +313,7 @@ public final class htmlFilterOutputStream extends OutputStream {
|
|||
inSingleQuote = false;
|
||||
// the tag ends here. after filtering: pass on
|
||||
filtered = filterSentence(buffer.getBytes(), singlequote);
|
||||
if (out != null) out.write(filtered);
|
||||
if (out != null) { out.write(filtered); }
|
||||
// buffer = new serverByteBuffer();
|
||||
buffer.reset();
|
||||
}
|
||||
|
@ -314,7 +321,7 @@ public final class htmlFilterOutputStream extends OutputStream {
|
|||
buffer.append(b);
|
||||
if (b == doublequote) inDoubleQuote = false;
|
||||
// check error cases
|
||||
if ((b == rb) && (buffer.byteAt(0) == lb)) {
|
||||
if (b == rb && buffer.byteAt(0) == lb) {
|
||||
inDoubleQuote = false;
|
||||
// the tag ends here. after filtering: pass on
|
||||
filtered = filterSentence(buffer.getBytes(), doublequote);
|
||||
|
@ -324,8 +331,9 @@ public final class htmlFilterOutputStream extends OutputStream {
|
|||
}
|
||||
} else if (inComment) {
|
||||
buffer.append(b);
|
||||
if ((b == rb) && (buffer.length() > 6) &&
|
||||
(buffer.byteAt(buffer.length() - 3) == dash)) {
|
||||
if (b == rb &&
|
||||
buffer.length() > 6 &&
|
||||
buffer.byteAt(buffer.length() - 3) == dash) {
|
||||
// comment is at end
|
||||
inComment = false;
|
||||
if (out != null) out.write(buffer.getBytes());
|
||||
|
@ -336,13 +344,13 @@ public final class htmlFilterOutputStream extends OutputStream {
|
|||
buffer.append(b);
|
||||
int bufferLength = buffer.length();
|
||||
if ((b == rb) && (bufferLength > 14) &&
|
||||
(buffer.byteAt(bufferLength - 8) == (byte) '/') &&
|
||||
(buffer.byteAt(bufferLength - 7) == (byte) 's') &&
|
||||
(buffer.byteAt(bufferLength - 6) == (byte) 'c') &&
|
||||
(buffer.byteAt(bufferLength - 5) == (byte) 'r') &&
|
||||
(buffer.byteAt(bufferLength - 4) == (byte) 'i') &&
|
||||
(buffer.byteAt(bufferLength - 3) == (byte) 'p') &&
|
||||
(buffer.byteAt(bufferLength - 2) == (byte) 't')) {
|
||||
(buffer.byteAt(bufferLength - 8) == (byte) '/') &&
|
||||
(buffer.byteAt(bufferLength - 7) == (byte) 's') &&
|
||||
(buffer.byteAt(bufferLength - 6) == (byte) 'c') &&
|
||||
(buffer.byteAt(bufferLength - 5) == (byte) 'r') &&
|
||||
(buffer.byteAt(bufferLength - 4) == (byte) 'i') &&
|
||||
(buffer.byteAt(bufferLength - 3) == (byte) 'p') &&
|
||||
(buffer.byteAt(bufferLength - 2) == (byte) 't')) {
|
||||
// script is at end
|
||||
inScript = false;
|
||||
if (out != null) out.write(buffer.getBytes());
|
||||
|
@ -362,17 +370,17 @@ public final class htmlFilterOutputStream extends OutputStream {
|
|||
if (b == doublequote) inDoubleQuote = true;
|
||||
// fill in tag text
|
||||
if ((buffer.length() == 3) && (buffer.byteAt(1) == excl) &&
|
||||
(buffer.byteAt(2) == dash) && (b == dash)) {
|
||||
(buffer.byteAt(2) == dash) && (b == dash)) {
|
||||
// this is the start of a comment
|
||||
inComment = true;
|
||||
buffer.append(b);
|
||||
} else if ((buffer.length() == 6) &&
|
||||
(buffer.byteAt(1) == (byte) 's') &&
|
||||
(buffer.byteAt(2) == (byte) 'c') &&
|
||||
(buffer.byteAt(3) == (byte) 'r') &&
|
||||
(buffer.byteAt(4) == (byte) 'i') &&
|
||||
(buffer.byteAt(5) == (byte) 'p') &&
|
||||
( b == (byte) 't')) {
|
||||
(buffer.byteAt(1) == (byte) 's') &&
|
||||
(buffer.byteAt(2) == (byte) 'c') &&
|
||||
(buffer.byteAt(3) == (byte) 'r') &&
|
||||
(buffer.byteAt(4) == (byte) 'i') &&
|
||||
(buffer.byteAt(5) == (byte) 'p') &&
|
||||
(b == (byte) 't')) {
|
||||
// this is the start of a comment
|
||||
inScript = true;
|
||||
buffer.append(b);
|
||||
|
@ -417,33 +425,33 @@ public final class htmlFilterOutputStream extends OutputStream {
|
|||
out.write(b);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public void write(byte b[]) throws IOException {
|
||||
this.write(b, 0, b.length);
|
||||
this.write(b, 0, b.length);
|
||||
}
|
||||
|
||||
public void write(byte b[], int off, int len) throws IOException {
|
||||
//System.out.println(new String(b, off, len));
|
||||
if ((off | len | (b.length - (len + off)) | (off + len)) < 0) throw new IndexOutOfBoundsException();
|
||||
for (int i = off ; i < (len - off) ; i++) this.write(b[i]);
|
||||
// System.out.println(new String(b, off, len));
|
||||
if ((off | len | (b.length - (len + off)) | (off + len)) < 0) throw new IndexOutOfBoundsException();
|
||||
for (int i = off ; i < (len - off) ; i++) this.write(b[i]);
|
||||
}
|
||||
|
||||
public void flush() throws IOException {
|
||||
// we cannot flush the current string buffer to prevent that
|
||||
// the filter process is messed up
|
||||
// instead, we simply flush the underlying output stream
|
||||
if (out != null) out.flush();
|
||||
// if you want to flush all, call close() at end of writing;
|
||||
// we cannot flush the current string buffer to prevent that
|
||||
// the filter process is messed up
|
||||
// instead, we simply flush the underlying output stream
|
||||
if (out != null) out.flush();
|
||||
// if you want to flush all, call close() at end of writing;
|
||||
}
|
||||
|
||||
|
||||
public void finalize() throws IOException {
|
||||
// if we are forced to close, we of course flush the buffer first,
|
||||
// then close the connection
|
||||
// if we are forced to close, we of course flush the buffer first,
|
||||
// then close the connection
|
||||
close();
|
||||
}
|
||||
|
||||
public void close() throws IOException {
|
||||
byte quotechar = (inSingleQuote) ? singlequote : doublequote;
|
||||
byte quotechar = (inSingleQuote) ? singlequote : doublequote;
|
||||
if (buffer != null) {
|
||||
if (buffer.length() > 0) {
|
||||
byte[] filtered = filterSentence(buffer.getBytes(), quotechar);
|
||||
|
@ -452,53 +460,53 @@ public final class htmlFilterOutputStream extends OutputStream {
|
|||
buffer = null;
|
||||
}
|
||||
byte[] finalized = filterFinalize(quotechar);
|
||||
if (out != null) {
|
||||
if (finalized != null) out.write(finalized);
|
||||
out.flush();
|
||||
out.close();
|
||||
}
|
||||
if (out != null) {
|
||||
if (finalized != null) out.write(finalized);
|
||||
out.flush();
|
||||
out.close();
|
||||
}
|
||||
filterTag = null;
|
||||
filterOpts = null;
|
||||
filterCont = null;
|
||||
//if (scraper != null) {scraper.close(); scraper = null;}
|
||||
//if (transformer != null) {transformer.close(); transformer = null;}
|
||||
// if (scraper != null) {scraper.close(); scraper = null;}
|
||||
// if (transformer != null) {transformer.close(); transformer = null;}
|
||||
}
|
||||
|
||||
|
||||
private static boolean binaryHint(byte b) {
|
||||
if (b < 0) return false;
|
||||
if (b < 0) return false;
|
||||
if (b > 31) return false;
|
||||
if ((b == 8) || (b == 9) || (b == 10) || (b == 13)) return false;
|
||||
//return false;
|
||||
//System.out.println("BINARY HINT: " + (int) b);
|
||||
return true;
|
||||
// return false;
|
||||
// System.out.println("BINARY HINT: " + (int) b);
|
||||
return true;
|
||||
}
|
||||
|
||||
public boolean binarySuspect() {
|
||||
return !binaryUnsuspect;
|
||||
}
|
||||
|
||||
|
||||
public static void main(String[] args) {
|
||||
// test app
|
||||
// takes one argument: a file name
|
||||
if (args.length != 1) return;
|
||||
byte[] buffer = new byte[512];
|
||||
try {
|
||||
htmlFilterContentScraper lc = new htmlFilterContentScraper(new URL("http://www.anomic.de/"));
|
||||
ArrayList v = new ArrayList();
|
||||
v.add("proxy");
|
||||
htmlFilterTransformer lt = new htmlFilterContentTransformer();
|
||||
InputStream is = new FileInputStream(args[0]);
|
||||
FileOutputStream fos = new FileOutputStream(new File(args[0] + ".out"));
|
||||
OutputStream os = new htmlFilterOutputStream(fos, lc, lt, false);
|
||||
int i;
|
||||
while ((i = is.read(buffer)) > 0) os.write(buffer, 0, i);
|
||||
os.close();
|
||||
fos.close();
|
||||
is.close();
|
||||
lc.print();
|
||||
}
|
||||
catch (MalformedURLException e) {}
|
||||
catch (IOException e) {}
|
||||
// test app
|
||||
// takes one argument: a file name
|
||||
if (args.length != 1) return;
|
||||
byte[] buffer = new byte[512];
|
||||
try {
|
||||
htmlFilterContentScraper lc = new htmlFilterContentScraper(new URL("http://www.anomic.de/"));
|
||||
ArrayList v = new ArrayList();
|
||||
v.add("proxy");
|
||||
htmlFilterTransformer lt = new htmlFilterContentTransformer();
|
||||
InputStream is = new FileInputStream(args[0]);
|
||||
FileOutputStream fos = new FileOutputStream(new File(args[0] + ".out"));
|
||||
OutputStream os = new htmlFilterOutputStream(fos, lc, lt, false);
|
||||
int i;
|
||||
while ((i = is.read(buffer)) > 0) os.write(buffer, 0, i);
|
||||
os.close();
|
||||
fos.close();
|
||||
is.close();
|
||||
lc.print();
|
||||
}
|
||||
catch (MalformedURLException e) {}
|
||||
catch (IOException e) {}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user