mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
Fixed issue #158 : completed div CSS class ignore in crawl
This commit is contained in:
parent
fa65fb1a03
commit
eb20589e29
|
@ -373,7 +373,7 @@
|
|||
<dt>Filter div class names</dt>
|
||||
<dd>
|
||||
<table border="0">
|
||||
<tr><td width="110">set of class names</td><td><input name="ignoreclassname" id="ignoreclassname" type="text" size="55" maxlength="100000" value="#[ignoreclassname]#" onblur="if (this.value=='') this.value='';"/></td><td>comma-separated list of div class names which should be filtered out</td></tr>
|
||||
<tr><td width="110">set of CSS class names</td><td><input name="ignoreclassname" id="ignoreclassname" type="text" size="55" maxlength="100000" value="#[ignoreclassname]#" onblur="if (this.value=='') this.value='';"/></td><td>comma-separated list of <div> element class names which should be filtered out</td></tr>
|
||||
</table>
|
||||
</dd>
|
||||
</dl>
|
||||
|
|
|
@ -65,17 +65,6 @@ public abstract class AbstractScraper implements Scraper {
|
|||
return (this.tags1 != null) && (this.tags1.contains(tag.toLowerCase()));
|
||||
}
|
||||
|
||||
//the 'missing' method that shall be implemented:
|
||||
@Override
|
||||
public abstract void scrapeText(char[] text, String insideTag);
|
||||
|
||||
// the other methods must take into account to construct the return value correctly
|
||||
@Override
|
||||
public abstract void scrapeTag0(ContentScraper.Tag tag);
|
||||
|
||||
@Override
|
||||
public abstract void scrapeTag1(ContentScraper.Tag tag);
|
||||
|
||||
public static String stripAllTags(final char[] s) {
|
||||
if (s.length > 80 && !MemoryControl.request(s.length * 2, false)) return "";
|
||||
final StringBuilder r = new StringBuilder(s.length);
|
||||
|
|
|
@ -145,6 +145,10 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
public String name;
|
||||
public Properties opts;
|
||||
public CharBuffer content;
|
||||
|
||||
/** Set to true when this tag should be ignored from scraping */
|
||||
private boolean ignore = false;
|
||||
|
||||
public Tag(final String name) {
|
||||
this.name = name;
|
||||
this.opts = new Properties();
|
||||
|
@ -174,6 +178,18 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
public String toString() {
|
||||
return "<" + name + " " + opts + ">" + content + "</" + name + ">";
|
||||
}
|
||||
|
||||
/** @return true when this tag should be ignored from scraping */
|
||||
public boolean isIgnore() {
|
||||
return this.ignore;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param ignore true when this tag should be ignored from scraping
|
||||
*/
|
||||
public void setIgnore(final boolean ignore) {
|
||||
this.ignore = ignore;
|
||||
}
|
||||
}
|
||||
|
||||
// all these tags must be given in lowercase, because the tags from the files are compared in lowercase
|
||||
|
@ -216,7 +232,10 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
private final int maxAnchors;
|
||||
|
||||
private final VocabularyScraper vocabularyScraper;
|
||||
private final Set<String> ignore_class_name;
|
||||
|
||||
/** Set of CSS class names whose matching div elements content should be ignored */
|
||||
private final Set<String> ignoreDivClassNames;
|
||||
|
||||
private final int timezoneOffset;
|
||||
private int breadcrumbs;
|
||||
|
||||
|
@ -245,18 +264,19 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
* @param root the document root url
|
||||
* @param maxAnchors the maximum number of URLs to process and store in the anchors property.
|
||||
* @param maxLinks the maximum number of links (other than a, area, and canonical and stylesheet links) to store
|
||||
* @param ignoreDivClassNames an optional set of CSS class names: the content of div elements having one of these classes is ignored. May be null.
|
||||
* @param vocabularyScraper handles maps from class names to vocabulary names and from documents to a map from vocabularies to terms
|
||||
* @param timezoneOffset local time zone offset
|
||||
*/
|
||||
@SuppressWarnings("unchecked")
|
||||
public ContentScraper(final DigestURL root, final int maxAnchors, final int maxLinks, final Set<String> ignore_class_name, final VocabularyScraper vocabularyScraper, int timezoneOffset) {
|
||||
public ContentScraper(final DigestURL root, final int maxAnchors, final int maxLinks, final Set<String> ignoreDivClassNames, final VocabularyScraper vocabularyScraper, int timezoneOffset) {
|
||||
// the root value here will not be used to load the resource.
|
||||
// it is only the reference for relative links
|
||||
super(linkTags0, linkTags1);
|
||||
assert root != null;
|
||||
this.root = root;
|
||||
this.vocabularyScraper = vocabularyScraper;
|
||||
this.ignore_class_name = ignore_class_name;
|
||||
this.ignoreDivClassNames = ignoreDivClassNames;
|
||||
this.timezoneOffset = timezoneOffset;
|
||||
this.evaluationScores = new Evaluation();
|
||||
this.rss = new SizeLimitedMap<DigestURL, String>(maxLinks);
|
||||
|
@ -314,9 +334,15 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
}
|
||||
|
||||
@Override
|
||||
public void scrapeText(final char[] newtext0, final String insideTag) {
|
||||
// System.out.println("SCRAPE: " + UTF8.String(newtext));
|
||||
if (insideTag != null && (TagName.script.name().equals(insideTag) || TagName.style.name().equals(insideTag))) return;
|
||||
public void scrapeText(final char[] newtext0, final Tag insideTag) {
|
||||
if (insideTag != null) {
|
||||
if(insideTag.ignore) {
|
||||
return;
|
||||
}
|
||||
if ((TagName.script.name().equals(insideTag.name) || TagName.style.name().equals(insideTag.name))) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
int p, pl, q, s = 0;
|
||||
char[] newtext = CharacterCoding.html2unicode(new String(newtext0)).toCharArray();
|
||||
|
||||
|
@ -377,7 +403,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
}
|
||||
// find tags inside text
|
||||
String b = cleanLine(stripAllTags(newtext));
|
||||
if ((insideTag != null) && (!(insideTag.equals("a")))) {
|
||||
if ((insideTag != null) && (!(insideTag.name.equals(TagName.a.name())))) {
|
||||
// texts inside tags sometimes have no punctuation at the line end
|
||||
// this is bad for the text semantics, because it is not possible for the
|
||||
// condenser to distinguish headlines from text beginnings.
|
||||
|
@ -697,6 +723,9 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
*/
|
||||
@Override
|
||||
public void scrapeTag0(final Tag tag) {
|
||||
if(tag.ignore) {
|
||||
return;
|
||||
}
|
||||
checkOpts(tag);
|
||||
if (tag.name.equalsIgnoreCase("img")) {
|
||||
final String src = tag.opts.getProperty("src", EMPTY_STRING);
|
||||
|
@ -861,6 +890,9 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
*/
|
||||
@Override
|
||||
public void scrapeTag1(final Tag tag) {
|
||||
if(tag.ignore) {
|
||||
return;
|
||||
}
|
||||
checkOpts(tag);
|
||||
// System.out.println("ScrapeTag1: tag.tagname=" + tag.tagname + ", opts=" + tag.opts.toString() + ", text=" + UTF8.String(text));
|
||||
if (tag.name.equalsIgnoreCase("a") && tag.content.length() < 2048) {
|
||||
|
@ -882,18 +914,12 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
}
|
||||
final String h;
|
||||
if (tag.name.equalsIgnoreCase("div")) {
|
||||
final String classn = tag.opts.getProperty("class", EMPTY_STRING);
|
||||
if (classn.length() > 0 && this.ignore_class_name.contains(classn)) {
|
||||
// we remove everything inside that tag, so it can be ignored
|
||||
tag.content.clear();
|
||||
} else {
|
||||
final String id = tag.opts.getProperty("id", EMPTY_STRING);
|
||||
this.evaluationScores.match(Element.divid, id);
|
||||
final String itemtype = tag.opts.getProperty("itemtype", EMPTY_STRING);
|
||||
if (itemtype.equals("http://data-vocabulary.org/Breadcrumb")) {
|
||||
breadcrumbs++;
|
||||
}
|
||||
}
|
||||
final String id = tag.opts.getProperty("id", EMPTY_STRING);
|
||||
this.evaluationScores.match(Element.divid, id);
|
||||
final String itemtype = tag.opts.getProperty("itemtype", EMPTY_STRING);
|
||||
if (itemtype.equals("http://data-vocabulary.org/Breadcrumb")) {
|
||||
breadcrumbs++;
|
||||
}
|
||||
} else if ((tag.name.equalsIgnoreCase("h1")) && (tag.content.length() < 1024)) {
|
||||
h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
|
||||
if (h.length() > 0) this.headlines[0].add(h);
|
||||
|
@ -974,15 +1000,33 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
* {@link ContentScraper#linkTags0} and {@link ContentScraper#linkTags1}.
|
||||
*/
|
||||
@Override
|
||||
public void scrapeAnyTagOpening(final String tagName, final Properties tagAttributes) {
|
||||
if (tagAttributes != null) {
|
||||
public void scrapeAnyTagOpening(final Tag tag) {
|
||||
if (tag != null && !tag.ignore && tag.opts != null) {
|
||||
/*
|
||||
* HTML microdata can be annotated on any kind of tag, so we don't restrict this
|
||||
* scraping to the limited sets in linkTags0 and linkTags1
|
||||
*/
|
||||
this.linkedDataTypes.addAll(parseMicrodataItemType(tagAttributes));
|
||||
this.linkedDataTypes.addAll(parseMicrodataItemType(tag.opts));
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean shouldIgnoreTag(final Tag tag, final Tag parentTag) {
|
||||
boolean ignore = false;
|
||||
|
||||
/* First, inherit ignore property from eventual parent */
|
||||
if(parentTag != null) {
|
||||
ignore = parentTag.ignore;
|
||||
}
|
||||
|
||||
/* Parent is not marked as ignored : let's check the current tag */
|
||||
if (!ignore && this.ignoreDivClassNames != null && tag != null && TagName.div.name().equals(tag.name)) {
|
||||
final String classAttr = tag.opts.getProperty("class", EMPTY_STRING);
|
||||
final Set<String> classes = ContentScraper.parseSpaceSeparatedTokens(classAttr);
|
||||
ignore = !Collections.disjoint(this.ignoreDivClassNames, classes);
|
||||
}
|
||||
return ignore;
|
||||
}
|
||||
|
||||
/**
|
||||
* Add an anchor to the anchors list, and trigger any eventual listener
|
||||
|
|
|
@ -24,8 +24,6 @@
|
|||
|
||||
package net.yacy.document.parser.html;
|
||||
|
||||
import java.util.Properties;
|
||||
|
||||
public interface Scraper {
|
||||
|
||||
/**
|
||||
|
@ -50,7 +48,12 @@ public interface Scraper {
|
|||
*/
|
||||
public boolean isTag1(String tag);
|
||||
|
||||
public void scrapeText(char[] text, String insideTag);
|
||||
/**
|
||||
* Process plain text
|
||||
* @param text plain text to process
|
||||
* @param insideTag the direct parent tag, if any. May be null.
|
||||
*/
|
||||
public void scrapeText(char[] text, ContentScraper.Tag insideTag);
|
||||
|
||||
/**
|
||||
* Process a tag belonging to the first category of tags according to the Scraper implementation
|
||||
|
@ -66,10 +69,18 @@ public interface Scraper {
|
|||
|
||||
/**
|
||||
* Processing applied to any kind of tag opening.
|
||||
* @param tagName the tag name
|
||||
* @param tagAttributes the atttributes of the tag
|
||||
* @param tag a parsed tag
|
||||
*/
|
||||
public void scrapeAnyTagOpening(String tagName, Properties tagAttributes);
|
||||
public void scrapeAnyTagOpening(ContentScraper.Tag tag);
|
||||
|
||||
/**
|
||||
* @param tag
|
||||
* a parsed tag
|
||||
* @param parentTag the parent tag, if any. May be null.
|
||||
* @return true when the tag should be ignored according to the scraper
|
||||
* implementation rules
|
||||
*/
|
||||
public boolean shouldIgnoreTag(final ContentScraper.Tag tag, final ContentScraper.Tag parentTag);
|
||||
|
||||
public void scrapeComment(final char[] comment);
|
||||
|
||||
|
|
|
@ -232,15 +232,19 @@ public final class TransformerWriter extends Writer {
|
|||
if (this.tagStack.size() == 0) {
|
||||
// we are not collecting tag text -> case (1) - (3)
|
||||
// case (1): this is not a tag opener/closer
|
||||
if (this.scraper != null && content.length > 0) this.scraper.scrapeText(content, null);
|
||||
if (this.transformer != null) return this.transformer.transformText(content);
|
||||
if (this.scraper != null && content.length > 0) {
|
||||
this.scraper.scrapeText(content, null);
|
||||
}
|
||||
if (this.transformer != null) {
|
||||
return this.transformer.transformText(content);
|
||||
}
|
||||
return content;
|
||||
}
|
||||
|
||||
// we are collecting tag text for the tag 'filterTag' -> case (4) - (7)
|
||||
// case (4): getting no tag, go on collecting content
|
||||
if (this.scraper != null) {
|
||||
this.scraper.scrapeText(content, this.tagStack.lastElement().name);
|
||||
this.scraper.scrapeText(content, this.tagStack.lastElement());
|
||||
}
|
||||
if (this.transformer != null) {
|
||||
this.tagStack.lastElement().content.append(this.transformer.transformText(content));
|
||||
|
@ -293,8 +297,22 @@ public final class TransformerWriter extends Writer {
|
|||
ContentScraper.Tag tag = new ContentScraper.Tag(tagname, charBuffer.propParser());
|
||||
charBuffer.close();
|
||||
|
||||
final ContentScraper.Tag parentTag;
|
||||
if(this.tagStack.size() > 0) {
|
||||
parentTag = this.tagStack.lastElement();
|
||||
} else {
|
||||
parentTag = null;
|
||||
}
|
||||
|
||||
/* Check scraper ignoring rules */
|
||||
if (this.scraper != null && this.scraper.shouldIgnoreTag(tag, parentTag)) {
|
||||
tag.setIgnore(true);
|
||||
}
|
||||
|
||||
/* Apply processing relevant for any kind of tag opening */
|
||||
this.scraper.scrapeAnyTagOpening(tag.name, tag.opts);
|
||||
if(this.scraper != null) {
|
||||
this.scraper.scrapeAnyTagOpening(tag);
|
||||
}
|
||||
|
||||
if (this.scraper != null && this.scraper.isTag0(tagname)) {
|
||||
// this single tag is collected at once here
|
||||
|
|
|
@ -13,6 +13,7 @@ import java.nio.charset.StandardCharsets;
|
|||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Set;
|
||||
|
||||
import org.junit.Test;
|
||||
|
||||
|
@ -138,6 +139,107 @@ public class htmlParserTest extends TestCase {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Test the htmlParser.parse() method, when filtering out div elements on their CSS class.
|
||||
*
|
||||
* @throws Exception
|
||||
* when an unexpected error occurred
|
||||
*/
|
||||
@Test
|
||||
public void testParseHtmlDivClassFilter() throws Exception {
|
||||
final AnchorURL url = new AnchorURL("http://localhost/test.html");
|
||||
final String mimetype = "text/html";
|
||||
final StringBuilder testHtml = new StringBuilder("<!DOCTYPE html><head><title>Test document</title></head>");
|
||||
|
||||
testHtml.append("<div class=\"top\">Top text");
|
||||
testHtml.append("<a href=\"http://localhost/top.html\">Top link</a>");
|
||||
testHtml.append("</div>");
|
||||
|
||||
testHtml.append("<div class=\"optional\">Some optional content");
|
||||
testHtml.append("<a href=\"http://localhost/content.html\">Link from optional block</a>");
|
||||
testHtml.append("</div>");
|
||||
|
||||
testHtml.append("<p class=\"optional\">A paragraph</p>");
|
||||
|
||||
testHtml.append("<div class=\"optional-text\">Text-only optional block</div>");
|
||||
|
||||
testHtml.append("<div class=\"optional desc\">");
|
||||
testHtml.append("<div class=\"optional child\">");
|
||||
testHtml.append("<div class=\"child\">");
|
||||
testHtml.append("<p>Child text at depth 3</p>");
|
||||
testHtml.append("</div></div></div>");
|
||||
|
||||
testHtml.append("<div class=\"bottom optional media\" itemscope itemtype=\"https://schema.org/LocalBusiness\"><img itemprop=\"logo\" src=\"http://localhost/image.png\" alt=\"Our Company\"></div>");
|
||||
|
||||
final htmlParser parser = new htmlParser();
|
||||
|
||||
/* No CSS class filter */
|
||||
try (InputStream sourceStream = new ByteArrayInputStream(
|
||||
testHtml.toString().getBytes(StandardCharsets.UTF_8));) {
|
||||
final Document[] docs = parser.parse(url, mimetype, null, new VocabularyScraper(), 0, sourceStream);
|
||||
final Document doc = docs[0];
|
||||
final String parsedDext = doc.getTextString();
|
||||
|
||||
/* Check everything has been parsed */
|
||||
assertEquals(2, doc.getAnchors().size());
|
||||
assertEquals(1, doc.getImages().size());
|
||||
assertEquals(1, doc.getLinkedDataTypes().size());
|
||||
assertTrue(parsedDext.contains("Top"));
|
||||
assertTrue(parsedDext.contains("Some"));
|
||||
assertTrue(parsedDext.contains("from"));
|
||||
assertTrue(parsedDext.contains("paragraph"));
|
||||
assertTrue(parsedDext.contains("Text-only"));
|
||||
assertTrue(parsedDext.contains("depth"));
|
||||
}
|
||||
|
||||
/* Filter on CSS classes with no matching elements */
|
||||
try (InputStream sourceStream = new ByteArrayInputStream(
|
||||
testHtml.toString().getBytes(StandardCharsets.UTF_8));) {
|
||||
final Set<String> ignore = new HashSet<>();
|
||||
ignore.add("opt");
|
||||
ignore.add("head");
|
||||
ignore.add("container");
|
||||
final Document[] docs = parser.parse(url, mimetype, null, new VocabularyScraper(), 0, sourceStream);
|
||||
final Document doc = docs[0];
|
||||
final String parsedDext = doc.getTextString();
|
||||
|
||||
/* Check everything has been parsed */
|
||||
assertEquals(2, doc.getAnchors().size());
|
||||
assertEquals(1, doc.getImages().size());
|
||||
assertEquals(1, doc.getLinkedDataTypes().size());
|
||||
assertTrue(parsedDext.contains("Top"));
|
||||
assertTrue(parsedDext.contains("Some"));
|
||||
assertTrue(parsedDext.contains("from"));
|
||||
assertTrue(parsedDext.contains("paragraph"));
|
||||
assertTrue(parsedDext.contains("Text-only"));
|
||||
assertTrue(parsedDext.contains("depth"));
|
||||
}
|
||||
|
||||
/* Filter on CSS class with matching elements */
|
||||
try (InputStream sourceStream = new ByteArrayInputStream(
|
||||
testHtml.toString().getBytes(StandardCharsets.UTF_8));) {
|
||||
final Set<String> ignore = new HashSet<>();
|
||||
ignore.add("optional");
|
||||
final Document[] docs = parser.parse(url, mimetype, null, ignore, new VocabularyScraper(), 0, sourceStream);
|
||||
final Document doc = docs[0];
|
||||
final String parsedDext = doc.getTextString();
|
||||
|
||||
/* Check matching blocks have been ignored */
|
||||
assertEquals(1, doc.getAnchors().size());
|
||||
assertEquals("http://localhost/top.html", doc.getAnchors().iterator().next().toString());
|
||||
assertEquals(0, doc.getLinkedDataTypes().size());
|
||||
assertEquals(0, doc.getImages().size());
|
||||
assertFalse(parsedDext.contains("Some"));
|
||||
assertFalse(parsedDext.contains("from"));
|
||||
assertFalse(parsedDext.contains("depth"));
|
||||
|
||||
/* Check non-matching blocks have been normally parsed */
|
||||
assertTrue(parsedDext.contains("Top"));
|
||||
assertTrue(parsedDext.contains("Text-only"));
|
||||
assertTrue(parsedDext.contains("paragraph"));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Test the htmlParser.parseWithLimits() method with test content within bounds.
|
||||
|
|
Loading…
Reference in New Issue
Block a user