Added more attributes to HTML evaluation: image paths (imgpath), anchor paths (apath), and iframe paths (iframepath)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7688 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
orbiter 2011-04-29 15:36:44 +00:00
parent 3b578a28ef
commit 4e8fa03514
2 changed files with 11 additions and 4 deletions

View File

@ -274,10 +274,10 @@ public class ContentScraper extends AbstractScraper implements Scraper {
public void scrapeTag0(final String tagname, final Properties tagopts) { public void scrapeTag0(final String tagname, final Properties tagopts) {
if (tagname.equalsIgnoreCase("img")) { if (tagname.equalsIgnoreCase("img")) {
String src = tagopts.getProperty("src", "");
try { try {
final int width = Integer.parseInt(tagopts.getProperty("width", "-1")); final int width = Integer.parseInt(tagopts.getProperty("width", "-1"));
final int height = Integer.parseInt(tagopts.getProperty("height", "-1")); final int height = Integer.parseInt(tagopts.getProperty("height", "-1"));
String src = tagopts.getProperty("src", "");
if (src.length() > 0) { if (src.length() > 0) {
final MultiProtocolURI url = absolutePath(src); final MultiProtocolURI url = absolutePath(src);
if (url != null) { if (url != null) {
@ -286,6 +286,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
} }
} }
} catch (final NumberFormatException e) {} } catch (final NumberFormatException e) {}
Evaluation.match(Element.imgpath, src, this.evaluationScores);
} else if(tagname.equalsIgnoreCase("base")) { } else if(tagname.equalsIgnoreCase("base")) {
try { try {
root = new MultiProtocolURI(tagopts.getProperty("href", "")); root = new MultiProtocolURI(tagopts.getProperty("href", ""));
@ -293,9 +294,6 @@ public class ContentScraper extends AbstractScraper implements Scraper {
} else if (tagname.equalsIgnoreCase("frame")) { } else if (tagname.equalsIgnoreCase("frame")) {
anchors.put(absolutePath(tagopts.getProperty("src", "")), tagopts /* with property "name" */); anchors.put(absolutePath(tagopts.getProperty("src", "")), tagopts /* with property "name" */);
frames.add(absolutePath(tagopts.getProperty("src", ""))); frames.add(absolutePath(tagopts.getProperty("src", "")));
} else if (tagname.equalsIgnoreCase("iframe")) {
anchors.put(absolutePath(tagopts.getProperty("src", "")), tagopts /* with property "name" */);
iframes.add(absolutePath(tagopts.getProperty("src", "")));
} else if (tagname.equalsIgnoreCase("body")) { } else if (tagname.equalsIgnoreCase("body")) {
String c = tagopts.getProperty("class", ""); String c = tagopts.getProperty("class", "");
Evaluation.match(Element.bodyclass, c, this.evaluationScores); Evaluation.match(Element.bodyclass, c, this.evaluationScores);
@ -376,6 +374,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
anchors.put(url, tagopts); anchors.put(url, tagopts);
} }
} }
Evaluation.match(Element.apath, href, this.evaluationScores);
} }
final String h; final String h;
if ((tagname.equalsIgnoreCase("h1")) && (text.length < 1024)) { if ((tagname.equalsIgnoreCase("h1")) && (text.length < 1024)) {
@ -410,6 +409,11 @@ public class ContentScraper extends AbstractScraper implements Scraper {
} else if ((tagname.equalsIgnoreCase("li")) && (text.length < 1024)) { } else if ((tagname.equalsIgnoreCase("li")) && (text.length < 1024)) {
h = recursiveParse(text); h = recursiveParse(text);
if (h.length() > 0) li.add(h); if (h.length() > 0) li.add(h);
} else if (tagname.equalsIgnoreCase("iframe")) {
String src = tagopts.getProperty("src", "");
anchors.put(absolutePath(src), tagopts /* with property "name" */);
iframes.add(absolutePath(src));
Evaluation.match(Element.iframepath, src, this.evaluationScores);
} else if (tagname.equalsIgnoreCase("script")) { } else if (tagname.equalsIgnoreCase("script")) {
String src = tagopts.getProperty("src", ""); String src = tagopts.getProperty("src", "");
if (src.length() > 0) { if (src.length() > 0) {

View File

@ -68,6 +68,9 @@ public class Evaluation {
url, url,
scriptpath, scriptpath,
scriptcode, scriptcode,
iframepath,
imgpath,
apath,
comment; comment;
} }