From f8b015949c5446f8be08a7bd374625b8037d5e5f Mon Sep 17 00:00:00 2001 From: orbiter Date: Sat, 24 May 2008 10:47:22 +0000 Subject: [PATCH] fix for bug in html scraper that appears if opening and closing tags are not both in the same case; see http://forum.yacy-websuche.de/viewtopic.php?f=6&t=1173&p=7836#p7836 git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4844 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- source/de/anomic/htmlFilter/htmlFilterContentScraper.java | 1 + source/de/anomic/htmlFilter/htmlFilterWriter.java | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java index b601783c6..467e9ef80 100644 --- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java +++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java @@ -72,6 +72,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen private static final HashSet linkTags0 = new HashSet(9,0.99f); private static final HashSet linkTags1 = new HashSet(7,0.99f); + // all these tags must be given in lowercase, because the tags from the files are compared in lowercase static { linkTags0.add("img"); linkTags0.add("base"); diff --git a/source/de/anomic/htmlFilter/htmlFilterWriter.java b/source/de/anomic/htmlFilter/htmlFilterWriter.java index e0e43cbf5..13ab82e68 100644 --- a/source/de/anomic/htmlFilter/htmlFilterWriter.java +++ b/source/de/anomic/htmlFilter/htmlFilterWriter.java @@ -231,7 +231,7 @@ public final class htmlFilterWriter extends Writer { } // it's a tag! which one? - if ((opening) || (!(tag.equals(filterTag)))) { + if ((opening) || (!(tag.equalsIgnoreCase(filterTag)))) { // this tag is not our concern. just add it filterCont.append(genTag0raw(tag, opening, content)); return new char[0];