cleaned, Properties

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@867 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
borg-0300 2005-10-06 09:41:59 +00:00
parent 68aa215479
commit afc5ef2819
2 changed files with 311 additions and 293 deletions

View File

@ -1,9 +1,12 @@
// htmlFilterContentTransformer.java
// htmlFilterContentTransformer.java
// ---------------------------------
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004
// last major change: 18.02.2004
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
@ -48,7 +51,6 @@ import java.util.ArrayList;
import java.util.Locale;
import java.util.Properties;
import java.util.TreeSet;
import de.anomic.server.serverByteBuffer;
public class htmlFilterContentTransformer extends htmlFilterAbstractTransformer implements htmlFilterTransformer {
@ -56,88 +58,96 @@ public class htmlFilterContentTransformer extends htmlFilterAbstractTransformer
// statics: for initialisation of the HTMLFilterAbstractTransformer
private static TreeSet linkTags0;
private static TreeSet linkTags1;
// Collator used as the comparator of the tag-name sets below. It is set to
// SECONDARY strength, which (per java.text.Collator) does not treat
// upper/lower case as a difference, so "IMG" and "img" compare as equal.
private static final Collator insensitiveCollator = Collator.getInstance(Locale.US);
static {
    insensitiveCollator.setStrength(Collator.SECONDARY);
    insensitiveCollator.setDecomposition(Collator.NO_DECOMPOSITION);
}
// Tag-name sets handed to the abstract transformer by the constructor.
// Both sets are ordered with the case-insensitive collator.
static {
    // linkTags0 is passed as the first constructor argument; this class
    // inspects such tags in transformTag0 (no body text).
    linkTags0 = new TreeSet(insensitiveCollator);
    linkTags0.add("img");
    // linkTags1 is passed as the second constructor argument; this class
    // inspects such tags, together with their body text, in transformTag1.
    linkTags1 = new TreeSet(insensitiveCollator);
    linkTags1.add("a");
}
private static ArrayList bluelist = null;
/**
 * Creates a transformer that registers the static tag sets
 * (linkTags0, linkTags1) with the abstract base class.
 * The bluelist itself is loaded later by {@link #init(String)}.
 */
public htmlFilterContentTransformer() {
    super(linkTags0, linkTags1);
}
/**
 * Loads the list of bluelisted words, once, into the static bluelist.
 *
 * The file is read line by line; empty lines and lines starting with '#'
 * are skipped, every other line is stored lower-cased. If the list has
 * already been created by an earlier call, this method does nothing.
 * Loading is best effort: an unreadable or missing file, or an error while
 * reading, leaves the list empty or partially filled without reporting.
 *
 * @param initarg path of the bluelist file; null is treated like a file
 *                that cannot be read
 */
public void init(String initarg) {
    if (bluelist != null) {
        // already initialised; the list is static and shared by all instances
        return;
    }
    bluelist = new ArrayList();
    if (initarg == null) {
        return;
    }
    File f = new File(initarg);
    if (!f.canRead()) {
        // canRead() is false for a non-existent file as well
        return;
    }
    BufferedReader r = null;
    try {
        r = new BufferedReader(new FileReader(f));
        String s;
        while ((s = r.readLine()) != null) {
            if (s.length() > 0 && !s.startsWith("#")) {
                bluelist.add(s.toLowerCase());
            }
        }
    } catch (Exception e) {
        // deliberately ignored: a broken bluelist file must not stop the
        // filter; whatever was read so far stays in the list
    } finally {
        // make sure the file handle is released even if reading failed
        if (r != null) {
            try {
                r.close();
            } catch (Exception ignored) {
                // nothing sensible can be done if closing fails
            }
        }
    }
}
/**
 * Tells whether this transformer leaves all content unchanged.
 *
 * That is the case when there are no bluelisted words, either because
 * init() has not been called yet (bluelist is still null) or because the
 * loaded list is empty. The null case matches hit(), which never reports
 * a hit while the list is null.
 *
 * @return true if no bluelist entry exists
 */
public boolean isIdentityTransformer() {
    return bluelist == null || bluelist.size() == 0;
}
/**
 * Builds the replacement markup for filtered content: a blue FONT element
 * that contains a run of 'X' characters, surrounded by single spaces.
 *
 * The number of 'X' characters is length / 2; if that value exceeds 10 it
 * is set to 7.
 * NOTE(review): capping at 7 rather than 10 is taken over unchanged from
 * the original code; confirm whether that is intended.
 *
 * @param length length of the content that is being replaced
 * @return the replacement bytes
 */
private static byte[] genBlueLetters(int length) {
    serverByteBuffer bb = new serverByteBuffer(" <FONT COLOR=#0000FF>".getBytes());
    int count = length / 2;
    if (count > 10) {
        count = 7;
    }
    for (int i = 0; i < count; i++) {
        bb.append((byte) 'X');
    }
    bb.append("</FONT> ".getBytes());
    return bb.getBytes();
}
/**
 * Checks whether the given text contains a bluelisted word.
 *
 * The bytes are decoded with the platform default charset, lower-cased and
 * searched for every bluelist entry as a substring.
 *
 * @param text the bytes to examine; may be null
 * @return true if any bluelist entry occurs in the text; false if there is
 *         no match, or if text or the bluelist is null
 */
private boolean hit(byte[] text) {
    if (text == null || bluelist == null) {
        return false;
    }
    String lc = new String(text).toLowerCase();
    for (int i = 0; i < bluelist.size(); i++) {
        if (lc.indexOf((String) bluelist.get(i)) >= 0) {
            return true;
        }
    }
    return false;
}
/**
 * Transforms a piece of plain text.
 *
 * @param text the text bytes
 * @return a blue 'X' placeholder sized from text.length if the text
 *         contains a bluelisted word, otherwise the unchanged text array
 */
public byte[] transformText(byte[] text) {
    if (hit(text)) {
        // hit() is false for null, so text is non-null here
        return genBlueLetters(text.length);
    }
    return text;
}
/**
 * Transforms a tag that is handled without body text.
 *
 * The "src" and "alt" options are checked against the bluelist (a missing
 * option is treated as the empty string). On a hit the whole tag is
 * replaced by a fixed-size placeholder; otherwise the tag is regenerated
 * from its name and options.
 *
 * @param tagname   name of the tag
 * @param tagopts   the tag's options
 * @param quotechar quote character used when the tag is regenerated
 * @return the bytes to emit in place of the tag
 */
public byte[] transformTag0(String tagname, Properties tagopts, byte quotechar) {
    if (hit(tagopts.getProperty("src", "").getBytes())) {
        return genBlueLetters(5);
    }
    if (hit(tagopts.getProperty("alt", "").getBytes())) {
        return genBlueLetters(5);
    }
    return htmlFilterOutputStream.genTag0(tagname, tagopts, quotechar);
}
/**
 * Transforms a tag together with its body text.
 *
 * If either the "href" option (empty string when missing) or the body text
 * contains a bluelisted word, the complete element is replaced by a
 * placeholder sized from text.length; otherwise the element is regenerated
 * from name, options and text.
 *
 * @param tagname   name of the tag
 * @param tagopts   the tag's options
 * @param text      the collected body of the tag
 * @param quotechar quote character used when the tag is regenerated
 * @return the bytes to emit in place of the element
 */
public byte[] transformTag1(String tagname, Properties tagopts, byte[] text, byte quotechar) {
    if (hit(tagopts.getProperty("href", "").getBytes())) {
        return genBlueLetters(text.length);
    }
    if (hit(text)) {
        return genBlueLetters(text.length);
    }
    return htmlFilterOutputStream.genTag1(tagname, tagopts, text, quotechar);
}
// Releases this transformer by delegating to the superclass.
// The static bluelist is not touched here, so it stays loaded.
public void close() {
// free resources
super.close();
}
}
}

View File

@ -1,9 +1,12 @@
// htmlFilterOutputStream.java
// htmlFilterOutputStream.java
// ---------------------------
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004, 2005
// last major change: 16.02.2005
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
@ -42,9 +45,8 @@
This class implements an output stream. Any data written to that output
is automatically parsed.
After finishing with writing, the htmlFilter can be read out.
*/
*/
package de.anomic.htmlFilter;
@ -59,7 +61,6 @@ import java.net.URL;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.Properties;
import de.anomic.server.serverByteBuffer;
public final class htmlFilterOutputStream extends OutputStream {
@ -84,219 +85,225 @@ public final class htmlFilterOutputStream extends OutputStream {
private boolean inScript;
private boolean binaryUnsuspect;
private boolean passbyIfBinarySuspect;
/**
 * Creates a filtering output stream.
 *
 * @param out         stream that receives the filtered output; the rest of
 *                    this class checks it for null before writing
 * @param scraper     receives text and tags as they are parsed; may be null
 * @param transformer rewrites text and tags; may be null
 * @param passbyIfBinarySuspect stored as given; it is consulted in
 *                    write(byte) once a byte is classified by binaryHint
 */
public htmlFilterOutputStream(OutputStream out, htmlFilterScraper scraper,
                              htmlFilterTransformer transformer,
                              boolean passbyIfBinarySuspect) {
    this.out = out;
    this.scraper = scraper;
    this.transformer = transformer;
    this.buffer = new serverByteBuffer(1024);
    // no tag body is being collected initially
    this.filterTag = null;
    this.filterOpts = null;
    this.filterCont = null;
    // parser state: outside of quotes, comments and scripts
    this.inSingleQuote = false;
    this.inDoubleQuote = false;
    this.inComment = false;
    this.inScript = false;
    // no binary-looking byte has been seen yet
    this.binaryUnsuspect = true;
    this.passbyIfBinarySuspect = passbyIfBinarySuspect;
}
/**
 * Generates a single opening or closing tag from raw option bytes.
 *
 * The option bytes are appended verbatim directly after the tag name; no
 * separator is inserted, so any leading blank has to be part of tagopts.
 *
 * @param tagname name of the tag
 * @param opening true for "&lt;name...&gt;", false for "&lt;/name...&gt;"
 * @param tagopts raw bytes to place between the name and '&gt;'; may be empty
 * @return the generated tag
 */
public static byte[] genTag0raw(String tagname, boolean opening, byte[] tagopts) {
    serverByteBuffer bb = new serverByteBuffer(tagname.length() + tagopts.length + 3);
    bb.append((byte) '<');
    if (!opening) {
        bb.append((byte) '/');
    }
    bb.append(tagname.getBytes());
    if (tagopts.length > 0) {
        bb.append(tagopts);
    }
    bb.append((byte) '>');
    return bb.getBytes();
}
/**
 * Generates a complete element (opening tag, body, closing tag) from raw
 * option bytes.
 *
 * The option bytes are appended verbatim directly after the tag name; no
 * separator is inserted, so any leading blank has to be part of tagopts.
 *
 * @param tagname name of the tag
 * @param tagopts raw bytes to place between the name and '&gt;'; may be empty
 * @param text    body of the element
 * @return the generated element
 */
public static byte[] genTag1raw(String tagname, byte[] tagopts, byte[] text) {
    byte[] name = tagname.getBytes();
    serverByteBuffer bb = new serverByteBuffer(2 * tagname.length() + tagopts.length + text.length + 5);
    bb.append((byte) '<').append(name);
    if (tagopts.length > 0) {
        bb.append(tagopts);
    }
    bb.append((byte) '>');
    bb.append(text);
    bb.append((byte) '<').append((byte) '/').append(name).append((byte) '>');
    return bb.getBytes();
}
/**
 * Generates an opening tag from a tag name and its options.
 *
 * The options are rendered by genOpts and separated from the tag name by
 * one blank; if there are no options, only "&lt;name&gt;" is produced.
 *
 * @param tagname   name of the tag
 * @param tagopts   the tag's options
 * @param quotechar quote character put around every option value
 * @return the generated tag
 */
public static byte[] genTag0(String tagname, Properties tagopts, byte quotechar) {
    byte[] tagoptsx = (tagopts.size() == 0) ? null : genOpts(tagopts, quotechar);
    // initial capacity as in the original code (it counts the tag name twice,
    // which only over-allocates slightly)
    int capacity = tagname.length() + ((tagoptsx == null) ? 0 : (tagoptsx.length + 1)) + tagname.length() + 2;
    serverByteBuffer bb = new serverByteBuffer(capacity).append((byte) '<').append(tagname.getBytes());
    if (tagoptsx != null) {
        bb = bb.append((byte) 32).append(tagoptsx);
    }
    bb = bb.append((byte) '>');
    return bb.getBytes();
}
/**
 * Generates a complete element: the opening tag as produced by genTag0,
 * followed by the body text and the closing tag.
 *
 * @param tagname   name of the tag
 * @param tagopts   the tag's options
 * @param text      body of the element
 * @param quotechar quote character put around every option value
 * @return the generated element
 */
public static byte[] genTag1(String tagname, Properties tagopts, byte[] text, byte quotechar) {
    byte[] gt0 = genTag0(tagname, tagopts, quotechar);
    byte[] closing = ("</" + tagname + ">").getBytes();
    return new serverByteBuffer(gt0, gt0.length + text.length + tagname.length() + 3).append(text).append(closing).getBytes();
}
/**
 * Helper for pretty-printing the properties of an html tag.
 *
 * Every property is rendered as " key=" followed by the quoted value.
 * If anything was rendered, the result is taken from offset 1 of the
 * buffer (getBytes(1)), i.e. without the blank in front of the first key.
 *
 * @param prop      the tag options
 * @param quotechar quote character put before and after every value
 * @return the rendered options; empty if prop has no entries
 */
public static byte[] genOpts(Properties prop, byte quotechar) {
    Enumeration e = prop.propertyNames();
    serverByteBuffer bb = new serverByteBuffer(prop.size() * 40);
    while (e.hasMoreElements()) {
        String key = (String) e.nextElement();
        bb = bb.append((byte) 32).append(key.getBytes()).append((byte) '=');
        bb = bb.append(quotechar).append(prop.getProperty(key).getBytes()).append(quotechar);
    }
    if (bb.length() > 0) {
        return bb.getBytes(1);
    }
    return bb.getBytes();
}
/**
 * Processes one parsed unit (a piece of text or a single tag) and returns
 * the bytes that shall be written to the output for it.
 *
 * Two modes exist, distinguished by the field filterTag:
 * - filterTag == null: nothing is being collected. Text is scraped and
 *   transformed directly. An opening tag is either handled at once (isTag0),
 *   starts the collection of its body (isTag1), or is passed through.
 *   Closing tags are passed through.
 * - filterTag != null: the body of that tag is being collected in
 *   filterCont. Text and foreign tags are appended to the collection and
 *   nothing is emitted, until the matching closing tag arrives; then the
 *   complete element is scraped/transformed and emitted.
 *
 * @param tag       the tag name, or null if content is plain text
 * @param opening   true for an opening tag, false for a closing tag
 * @param content   the text, or the raw option bytes of the tag
 * @param quotechar quote character used when a tag is regenerated
 * @return the bytes to emit now; empty while a tag body is being collected
 */
private byte[] filterTag(String tag, boolean opening, byte[] content, byte quotechar) {
    if (filterTag == null) {
        // we are not collecting tag text
        if (tag == null) {
            // and this is not a tag opener/closer
            if (scraper != null) {
                scraper.scrapeText(content);
            }
            if (transformer != null) {
                return transformer.transformText(content);
            }
            return content;
        }

        // we have a new tag
        if (!opening) {
            // a closing tag nobody waits for: return it unchanged
            return genTag0raw(tag, false, content);
        }
        if (scraper != null && scraper.isTag0(tag)) {
            // this single tag is scraped at once here
            scraper.scrapeTag0(tag, new serverByteBuffer(content).propParser());
        }
        if (transformer != null && transformer.isTag0(tag)) {
            // this single tag is transformed at once here
            return transformer.transformTag0(tag, new serverByteBuffer(content).propParser(), quotechar);
        }
        if ((scraper != null && scraper.isTag1(tag)) ||
            (transformer != null && transformer.isTag1(tag))) {
            // ok, start collecting the body of this tag
            filterTag = tag;
            filterOpts = new serverByteBuffer(content).propParser();
            filterCont = new serverByteBuffer();
            return new byte[0];
        }
        // we ignore that thing and return it again
        return genTag0raw(tag, true, content);
    }

    // we are collecting tag text for the tag 'filterTag'
    if (tag == null) {
        // go on collecting content
        if (scraper != null) {
            scraper.scrapeText(content);
        }
        if (transformer != null) {
            filterCont.append(transformer.transformText(content));
        } else {
            filterCont.append(content);
        }
        return new byte[0];
    }

    // it's a tag! which one?
    if (opening || !tag.equals(filterTag)) {
        // this tag is not our concern. just add it
        filterCont.append(genTag0raw(tag, opening, content));
        return new byte[0];
    }

    // it's our closing tag: filterFinalize scrapes/transforms the collected
    // element, resets the collection state and returns the complete result
    return filterFinalize(quotechar);
}
/**
 * Finishes the element whose body is currently being collected, if any.
 *
 * The collected element is handed to the scraper, then transformed (or
 * simply regenerated when there is no transformer), and the collection
 * state (filterTag, filterOpts, filterCont) is reset.
 *
 * @param quotechar quote character used when the tag is regenerated
 * @return the bytes for the complete element, or an empty array if no
 *         element was being collected
 */
private byte[] filterFinalize(byte quotechar) {
    if (filterTag == null) {
        return new byte[0];
    }
    if (scraper != null) {
        scraper.scrapeTag1(filterTag, filterOpts, filterCont.getBytes());
    }
    byte[] ret;
    if (transformer != null) {
        ret = transformer.transformTag1(filterTag, filterOpts, filterCont.getBytes(), quotechar);
    } else {
        ret = genTag1(filterTag, filterOpts, filterCont.getBytes(), quotechar);
    }
    filterTag = null;
    filterOpts = null;
    filterCont = null;
    return ret;
}
/**
 * Classifies one buffered unit as tag or text and passes it to filterTag.
 *
 * Input longer than two bytes that starts with lb is treated as a tag;
 * a '/' as second byte marks a closing tag. The tag name ends at the
 * position found by tagEnd; the bytes from there up to, but not including,
 * the last byte of the input are passed on as the tag's raw options.
 * Everything else is passed on as text.
 *
 * @param in        the buffered bytes
 * @param quotechar quote character used when a tag is regenerated
 * @return the filtered bytes; the input itself if it is empty
 */
private byte[] filterSentence(byte[] in, byte quotechar) {
    if (in.length == 0) {
        return in;
    }
    if (in.length > 2 && in[0] == lb) {
        // a tag: the name starts after "<" or after "</"
        boolean opening = in[1] != '/';
        int nameStart = opening ? 1 : 2;
        int tagend = tagEnd(in, nameStart);
        String tag = new String(in, nameStart, tagend - nameStart);
        // the last byte of the input is dropped from the option bytes
        byte[] text = new byte[in.length - tagend - 1];
        System.arraycopy(in, tagend, text, 0, text.length);
        return filterTag(tag, opening, text, quotechar);
    }
    // a text
    return filterTag(null, true, in, quotechar);
}
/**
 * Finds the end of a tag name.
 *
 * Characters accepted as part of the name are ASCII letters, ASCII digits,
 * '!' and '-'.
 *
 * @param tag   the bytes of the tag
 * @param start index at which the name starts
 * @return the index of the first byte at or after start that is not part
 *         of the name; tag.length - 1 if there is no such byte
 */
private static int tagEnd(byte[] tag, int start) {
    for (int i = start; i < tag.length; i++) {
        char c = (char) tag[i];
        boolean nameChar = c == '!' || c == '-'
                || (c >= '0' && c <= '9')
                || (c >= 'a' && c <= 'z')
                || (c >= 'A' && c <= 'Z');
        if (!nameChar) {
            return i;
        }
    }
    return tag.length - 1;
}
/**
 * Writes a single byte through the filter.
 *
 * @param b the byte to write; only the low eight bits are used
 * @throws IOException if the byte-level write fails
 */
public void write(int b) throws IOException {
    // exactly one call: the byte must pass the parser once
    write((byte) (b & 0xff));
}
private void write(byte b) throws IOException {
//System.out.println((char) b);
// System.out.println((char) b);
if ((binaryUnsuspect) && (binaryHint(b))) {
binaryUnsuspect = false;
if (passbyIfBinarySuspect) finalize();
}
if ((binaryUnsuspect) || (!(passbyIfBinarySuspect))) {
if (binaryUnsuspect || !passbyIfBinarySuspect) {
byte[] filtered;
if (inSingleQuote) {
buffer.append(b);
@ -306,7 +313,7 @@ public final class htmlFilterOutputStream extends OutputStream {
inSingleQuote = false;
// the tag ends here. after filtering: pass on
filtered = filterSentence(buffer.getBytes(), singlequote);
if (out != null) out.write(filtered);
if (out != null) { out.write(filtered); }
// buffer = new serverByteBuffer();
buffer.reset();
}
@ -314,7 +321,7 @@ public final class htmlFilterOutputStream extends OutputStream {
buffer.append(b);
if (b == doublequote) inDoubleQuote = false;
// check error cases
if ((b == rb) && (buffer.byteAt(0) == lb)) {
if (b == rb && buffer.byteAt(0) == lb) {
inDoubleQuote = false;
// the tag ends here. after filtering: pass on
filtered = filterSentence(buffer.getBytes(), doublequote);
@ -324,8 +331,9 @@ public final class htmlFilterOutputStream extends OutputStream {
}
} else if (inComment) {
buffer.append(b);
if ((b == rb) && (buffer.length() > 6) &&
(buffer.byteAt(buffer.length() - 3) == dash)) {
if (b == rb &&
buffer.length() > 6 &&
buffer.byteAt(buffer.length() - 3) == dash) {
// comment is at end
inComment = false;
if (out != null) out.write(buffer.getBytes());
@ -336,13 +344,13 @@ public final class htmlFilterOutputStream extends OutputStream {
buffer.append(b);
int bufferLength = buffer.length();
if ((b == rb) && (bufferLength > 14) &&
(buffer.byteAt(bufferLength - 8) == (byte) '/') &&
(buffer.byteAt(bufferLength - 7) == (byte) 's') &&
(buffer.byteAt(bufferLength - 6) == (byte) 'c') &&
(buffer.byteAt(bufferLength - 5) == (byte) 'r') &&
(buffer.byteAt(bufferLength - 4) == (byte) 'i') &&
(buffer.byteAt(bufferLength - 3) == (byte) 'p') &&
(buffer.byteAt(bufferLength - 2) == (byte) 't')) {
(buffer.byteAt(bufferLength - 8) == (byte) '/') &&
(buffer.byteAt(bufferLength - 7) == (byte) 's') &&
(buffer.byteAt(bufferLength - 6) == (byte) 'c') &&
(buffer.byteAt(bufferLength - 5) == (byte) 'r') &&
(buffer.byteAt(bufferLength - 4) == (byte) 'i') &&
(buffer.byteAt(bufferLength - 3) == (byte) 'p') &&
(buffer.byteAt(bufferLength - 2) == (byte) 't')) {
// script is at end
inScript = false;
if (out != null) out.write(buffer.getBytes());
@ -362,17 +370,17 @@ public final class htmlFilterOutputStream extends OutputStream {
if (b == doublequote) inDoubleQuote = true;
// fill in tag text
if ((buffer.length() == 3) && (buffer.byteAt(1) == excl) &&
(buffer.byteAt(2) == dash) && (b == dash)) {
(buffer.byteAt(2) == dash) && (b == dash)) {
// this is the start of a comment
inComment = true;
buffer.append(b);
} else if ((buffer.length() == 6) &&
(buffer.byteAt(1) == (byte) 's') &&
(buffer.byteAt(2) == (byte) 'c') &&
(buffer.byteAt(3) == (byte) 'r') &&
(buffer.byteAt(4) == (byte) 'i') &&
(buffer.byteAt(5) == (byte) 'p') &&
( b == (byte) 't')) {
(buffer.byteAt(1) == (byte) 's') &&
(buffer.byteAt(2) == (byte) 'c') &&
(buffer.byteAt(3) == (byte) 'r') &&
(buffer.byteAt(4) == (byte) 'i') &&
(buffer.byteAt(5) == (byte) 'p') &&
(b == (byte) 't')) {
// this is the start of a script
inScript = true;
buffer.append(b);
@ -417,33 +425,33 @@ public final class htmlFilterOutputStream extends OutputStream {
out.write(b);
}
}
/**
 * Writes all bytes of the given array through the filter.
 *
 * @param b the bytes to write
 * @throws IOException if writing fails
 */
public void write(byte b[]) throws IOException {
    // exactly one call: the data must pass the parser once
    this.write(b, 0, b.length);
}
/**
 * Writes len bytes of the given array, starting at index off, through the
 * filter, one byte at a time.
 *
 * @param b   the data
 * @param off index of the first byte to write
 * @param len number of bytes to write
 * @throws IndexOutOfBoundsException if off or len is negative, or if
 *         off + len overflows or exceeds b.length
 * @throws IOException if the byte-level write fails
 */
public void write(byte b[], int off, int len) throws IOException {
    // any negative operand sets the sign bit of the OR-ed value
    if ((off | len | (b.length - (len + off)) | (off + len)) < 0) {
        throw new IndexOutOfBoundsException();
    }
    // the range to write is [off, off + len); the former bound (len - off)
    // was only correct for off == 0
    final int end = off + len;
    for (int i = off; i < end; i++) {
        this.write(b[i]);
    }
}
/**
 * Flushes the underlying output stream only.
 *
 * The internal parse buffer is intentionally not flushed, because emitting
 * a partially parsed unit would mess up the filter process. To get
 * everything written, call close() at the end of writing.
 *
 * @throws IOException if flushing the underlying stream fails
 */
public void flush() throws IOException {
    if (out != null) {
        out.flush();
    }
}
// Delegates to close(). Besides overriding Object.finalize(), this method is
// also called directly from write(byte) when a binary-looking byte is seen
// and passbyIfBinarySuspect is set.
public void finalize() throws IOException {
// if we are forced to close, we of course flush the buffer first,
// then close the connection
close();
}
public void close() throws IOException {
byte quotechar = (inSingleQuote) ? singlequote : doublequote;
byte quotechar = (inSingleQuote) ? singlequote : doublequote;
if (buffer != null) {
if (buffer.length() > 0) {
byte[] filtered = filterSentence(buffer.getBytes(), quotechar);
@ -452,53 +460,53 @@ public final class htmlFilterOutputStream extends OutputStream {
buffer = null;
}
byte[] finalized = filterFinalize(quotechar);
if (out != null) {
if (finalized != null) out.write(finalized);
out.flush();
out.close();
}
if (out != null) {
if (finalized != null) out.write(finalized);
out.flush();
out.close();
}
filterTag = null;
filterOpts = null;
filterCont = null;
//if (scraper != null) {scraper.close(); scraper = null;}
//if (transformer != null) {transformer.close(); transformer = null;}
// if (scraper != null) {scraper.close(); scraper = null;}
// if (transformer != null) {transformer.close(); transformer = null;}
}
/**
 * Tells whether a byte hints at binary (non-text) content.
 *
 * A hint is any byte value in the range 0..31 other than 8, 9, 10 and 13.
 * Negative byte values (128..255 when read unsigned) and values above 31
 * are not hints.
 *
 * @param b the byte to classify
 * @return true if the byte is a binary hint
 */
private static boolean binaryHint(byte b) {
    if (b < 0 || b > 31) {
        return false;
    }
    return !(b == 8 || b == 9 || b == 10 || b == 13);
}
// Returns true once write(byte) has seen a byte that binaryHint classifies
// as binary; the flag starts out as "unsuspect" in the constructor.
public boolean binarySuspect() {
return !binaryUnsuspect;
}
/**
 * Test application: filters one file and writes the result next to it.
 *
 * Takes exactly one argument, a file name. The file is piped through an
 * htmlFilterOutputStream (with a content scraper and a content transformer,
 * binary pass-by disabled) into "&lt;file name&gt;.out"; afterwards the
 * scraper's print() is called. With any other number of arguments the
 * method returns without doing anything. Errors are reported on System.err.
 *
 * @param args command line arguments
 */
public static void main(String[] args) {
    if (args.length != 1) {
        return;
    }
    byte[] buffer = new byte[512];
    InputStream is = null;
    OutputStream os = null;
    try {
        htmlFilterContentScraper lc = new htmlFilterContentScraper(new URL("http://www.anomic.de/"));
        htmlFilterTransformer lt = new htmlFilterContentTransformer();
        is = new FileInputStream(args[0]);
        FileOutputStream fos = new FileOutputStream(new File(args[0] + ".out"));
        os = new htmlFilterOutputStream(fos, lc, lt, false);
        int i;
        while ((i = is.read(buffer)) > 0) {
            os.write(buffer, 0, i);
        }
        // closing the filter stream writes the remaining buffer and closes fos
        os.close();
        os = null;
        lc.print();
    } catch (MalformedURLException e) {
        System.err.println("htmlFilterOutputStream test: bad URL: " + e.getMessage());
    } catch (IOException e) {
        System.err.println("htmlFilterOutputStream test: I/O error: " + e.getMessage());
    } finally {
        // release whatever is still open after a failure
        if (os != null) {
            try {
                os.close();
            } catch (IOException ignored) {
                // already failing; nothing more to report
            }
        }
        if (is != null) {
            try {
                is.close();
            } catch (IOException ignored) {
                // closing the input is best effort
            }
        }
    }
}
}
}