mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-21 00:00:13 +02:00
added more attributes to html evaluation
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7688 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
parent
3b578a28ef
commit
4e8fa03514
|
@@ -274,10 +274,10 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
|
||||
public void scrapeTag0(final String tagname, final Properties tagopts) {
|
||||
if (tagname.equalsIgnoreCase("img")) {
|
||||
String src = tagopts.getProperty("src", "");
|
||||
try {
|
||||
final int width = Integer.parseInt(tagopts.getProperty("width", "-1"));
|
||||
final int height = Integer.parseInt(tagopts.getProperty("height", "-1"));
|
||||
String src = tagopts.getProperty("src", "");
|
||||
if (src.length() > 0) {
|
||||
final MultiProtocolURI url = absolutePath(src);
|
||||
if (url != null) {
|
||||
|
@@ -286,6 +286,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
}
|
||||
}
|
||||
} catch (final NumberFormatException e) {}
|
||||
Evaluation.match(Element.imgpath, src, this.evaluationScores);
|
||||
} else if(tagname.equalsIgnoreCase("base")) {
|
||||
try {
|
||||
root = new MultiProtocolURI(tagopts.getProperty("href", ""));
|
||||
|
@@ -293,9 +294,6 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
} else if (tagname.equalsIgnoreCase("frame")) {
|
||||
anchors.put(absolutePath(tagopts.getProperty("src", "")), tagopts /* with property "name" */);
|
||||
frames.add(absolutePath(tagopts.getProperty("src", "")));
|
||||
} else if (tagname.equalsIgnoreCase("iframe")) {
|
||||
anchors.put(absolutePath(tagopts.getProperty("src", "")), tagopts /* with property "name" */);
|
||||
iframes.add(absolutePath(tagopts.getProperty("src", "")));
|
||||
} else if (tagname.equalsIgnoreCase("body")) {
|
||||
String c = tagopts.getProperty("class", "");
|
||||
Evaluation.match(Element.bodyclass, c, this.evaluationScores);
|
||||
|
@@ -376,6 +374,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
anchors.put(url, tagopts);
|
||||
}
|
||||
}
|
||||
Evaluation.match(Element.apath, href, this.evaluationScores);
|
||||
}
|
||||
final String h;
|
||||
if ((tagname.equalsIgnoreCase("h1")) && (text.length < 1024)) {
|
||||
|
@@ -410,6 +409,11 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
} else if ((tagname.equalsIgnoreCase("li")) && (text.length < 1024)) {
|
||||
h = recursiveParse(text);
|
||||
if (h.length() > 0) li.add(h);
|
||||
} else if (tagname.equalsIgnoreCase("iframe")) {
|
||||
String src = tagopts.getProperty("src", "");
|
||||
anchors.put(absolutePath(src), tagopts /* with property "name" */);
|
||||
iframes.add(absolutePath(src));
|
||||
Evaluation.match(Element.iframepath, src, this.evaluationScores);
|
||||
} else if (tagname.equalsIgnoreCase("script")) {
|
||||
String src = tagopts.getProperty("src", "");
|
||||
if (src.length() > 0) {
|
||||
|
|
|
@@ -68,6 +68,9 @@ public class Evaluation {
|
|||
url,
|
||||
scriptpath,
|
||||
scriptcode,
|
||||
iframepath,
|
||||
imgpath,
|
||||
apath,
|
||||
comment;
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user