- solved problems with backpath normalization

- redesigned in/outbound link handover
- removed iframe links from inbound/outbound in solr scheme
This commit is contained in:
Michael Peter Christen 2012-04-27 16:48:51 +02:00
parent 5f5ed33ed8
commit 453010bd68
5 changed files with 157 additions and 123 deletions

View File

@ -106,7 +106,7 @@ public class webstructure {
prop.put("references_count", 1);
prop.put("references_documents", 1);
prop.put("references_documents_0_hash", urlhash);
prop.put("references_documents_0_count", scraper.inboundLinkCount() + scraper.outboundLinkCount());
prop.put("references_documents_0_count", scraper.inboundLinks().size() + scraper.outboundLinks().size());
prop.put("references_documents_0_date", GenericFormatter.SHORT_DAY_FORMATTER.format(new Date()));
prop.put("references_documents_0_urle", url == null ? 0 : 1);
if (url != null) prop.putXML("references_documents_0_urle_url", url.toNormalform(true, false));

View File

@ -84,31 +84,31 @@ public class SolrScheme extends ConfigurationSet {
*/
}
/**
 * Overloads that write a single field into the given Solr document.
 * Each one sets the field named key.name() to the given value, but only when
 * this configuration is empty or contains that field name; otherwise the call
 * leaves the document untouched.
 */
private void addSolr(final SolrInputDocument solrdoc, final Field key, final String value) {
protected void addSolr(final SolrInputDocument solrdoc, final Field key, final String value) {
if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value);
}
private void addSolr(final SolrInputDocument solrdoc, final Field key, final Date value) {
protected void addSolr(final SolrInputDocument solrdoc, final Field key, final Date value) {
if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value);
}
private void addSolr(final SolrInputDocument solrdoc, final Field key, final int value) {
protected void addSolr(final SolrInputDocument solrdoc, final Field key, final int value) {
if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value);
}
private void addSolr(final SolrInputDocument solrdoc, final Field key, final String[] value) {
protected void addSolr(final SolrInputDocument solrdoc, final Field key, final String[] value) {
if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value);
}
private void addSolr(final SolrInputDocument solrdoc, final Field key, final float value) {
protected void addSolr(final SolrInputDocument solrdoc, final Field key, final float value) {
if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value);
}
private void addSolr(final SolrInputDocument solrdoc, final Field key, final boolean value) {
protected void addSolr(final SolrInputDocument solrdoc, final Field key, final boolean value) {
if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value);
}
// same guard as the overloads above; additionally hands the boost value on to setField
private void addSolr(final SolrInputDocument solrdoc, final Field key, final String value, final float boost) {
protected void addSolr(final SolrInputDocument solrdoc, final Field key, final String value, final float boost) {
if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value, boost);
}
@ -308,92 +308,11 @@ public class SolrScheme extends ConfigurationSet {
if (paths.length > 0) addSolr(solrdoc, Field.paths_txt, paths);
}
// list all links
final Map<MultiProtocolURI, Properties> alllinks = yacydoc.getAnchors();
// get list of all links; they will be shrunk by urls that appear in other fields of the solr scheme
Set<MultiProtocolURI> inboundLinks = yacydoc.inboundLinks();
Set<MultiProtocolURI> ouboundLinks = yacydoc.outboundLinks();
int c = 0;
if (isEmpty() || contains(Field.inboundlinkscount_i.name())) addSolr(solrdoc, Field.inboundlinkscount_i, yacydoc.inboundLinkCount());
if (isEmpty() || contains(Field.inboundlinksnofollowcount_i.name())) addSolr(solrdoc, Field.inboundlinksnofollowcount_i, yacydoc.inboundLinkNofollowCount());
final String[] inboundlinksTag = new String[yacydoc.inboundLinkCount()];
final String[] inboundlinksURLProtocol = new String[yacydoc.inboundLinkCount()];
final String[] inboundlinksURLStub = new String[yacydoc.inboundLinkCount()];
final String[] inboundlinksName = new String[yacydoc.inboundLinkCount()];
final String[] inboundlinksRel = new String[yacydoc.inboundLinkCount()];
final String[] inboundlinksText = new String[yacydoc.inboundLinkCount()];
for (final MultiProtocolURI url: yacydoc.inboundLinks()) {
final Properties p = alllinks.get(url);
final String name = p.getProperty("name", ""); // the name attribute
final String rel = p.getProperty("rel", ""); // the rel-attribute
final String text = p.getProperty("text", ""); // the text between the <a></a> tag
final String urls = url.toNormalform(false, false);
final int pr = urls.indexOf("://",0);
inboundlinksURLProtocol[c] = urls.substring(0, pr);
inboundlinksURLStub[c] = urls.substring(pr + 3);
inboundlinksName[c] = name.length() > 0 ? name : "";
inboundlinksRel[c] = rel.length() > 0 ? rel : "";
inboundlinksText[c] = text.length() > 0 ? text : "";
inboundlinksTag[c] =
"<a href=\"" + url.toNormalform(false, false) + "\"" +
(rel.length() > 0 ? " rel=\"" + rel + "\"" : "") +
(name.length() > 0 ? " name=\"" + name + "\"" : "") +
">" +
((text.length() > 0) ? text : "") + "</a>";
c++;
}
if (isEmpty() || contains(Field.inboundlinks_tag_txt.name())) addSolr(solrdoc, Field.inboundlinks_tag_txt, inboundlinksTag);
if (isEmpty() || contains(Field.inboundlinks_protocol_txt.name())) addSolr(solrdoc, Field.inboundlinks_protocol_txt, protocolList2indexedList(inboundlinksURLProtocol));
if (isEmpty() || contains(Field.inboundlinks_urlstub_txt.name())) addSolr(solrdoc, Field.inboundlinks_urlstub_txt, inboundlinksURLStub);
if (isEmpty() || contains(Field.inboundlinks_name_txt.name())) addSolr(solrdoc, Field.inboundlinks_name_txt, inboundlinksName);
if (isEmpty() || contains(Field.inboundlinks_rel_txt.name())) addSolr(solrdoc, Field.inboundlinks_rel_txt, inboundlinksRel);
if (isEmpty() || contains(Field.inboundlinks_relflags_txt.name())) addSolr(solrdoc, Field.inboundlinks_relflags_txt, relEval(inboundlinksRel));
if (isEmpty() || contains(Field.inboundlinks_text_txt.name())) addSolr(solrdoc, Field.inboundlinks_text_txt, inboundlinksText);
c = 0;
if (isEmpty() || contains(Field.outboundlinkscount_i.name())) addSolr(solrdoc, Field.outboundlinkscount_i, yacydoc.outboundLinkCount());
if (isEmpty() || contains(Field.outboundlinksnofollowcount_i.name())) addSolr(solrdoc, Field.outboundlinksnofollowcount_i, yacydoc.outboundLinkNofollowCount());
final String[] outboundlinksTag = new String[yacydoc.outboundLinkCount()];
final String[] outboundlinksURLProtocol = new String[yacydoc.outboundLinkCount()];
final String[] outboundlinksURLStub = new String[yacydoc.outboundLinkCount()];
final String[] outboundlinksName = new String[yacydoc.outboundLinkCount()];
final String[] outboundlinksRel = new String[yacydoc.outboundLinkCount()];
final String[] outboundlinksText = new String[yacydoc.outboundLinkCount()];
for (final MultiProtocolURI url: yacydoc.outboundLinks()) {
final Properties p = alllinks.get(url);
final String name = p.getProperty("name", ""); // the name attribute
final String rel = p.getProperty("rel", ""); // the rel-attribute
final String text = p.getProperty("text", ""); // the text between the <a></a> tag
final String urls = url.toNormalform(false, false);
final int pr = urls.indexOf("://",0);
outboundlinksURLProtocol[c] = urls.substring(0, pr);
outboundlinksURLStub[c] = urls.substring(pr + 3);
outboundlinksName[c] = name.length() > 0 ? name : "";
outboundlinksRel[c] = rel.length() > 0 ? rel : "";
outboundlinksText[c] = text.length() > 0 ? text : "";
outboundlinksTag[c] =
"<a href=\"" + url.toNormalform(false, false) + "\"" +
(rel.length() > 0 ? " rel=\"" + rel + "\"" : "") +
(name.length() > 0 ? " name=\"" + name + "\"" : "") +
">" +
((text.length() > 0) ? text : "") + "</a>";
c++;
}
if (isEmpty() || contains(Field.outboundlinks_tag_txt.name())) addSolr(solrdoc, Field.outboundlinks_tag_txt, outboundlinksTag);
if (isEmpty() || contains(Field.outboundlinks_protocol_txt.name())) addSolr(solrdoc, Field.outboundlinks_protocol_txt, protocolList2indexedList(outboundlinksURLProtocol));
if (isEmpty() || contains(Field.outboundlinks_urlstub_txt.name())) addSolr(solrdoc, Field.outboundlinks_urlstub_txt, outboundlinksURLStub);
if (isEmpty() || contains(Field.outboundlinks_name_txt.name())) addSolr(solrdoc, Field.outboundlinks_name_txt, outboundlinksName);
if (isEmpty() || contains(Field.outboundlinks_rel_txt.name())) addSolr(solrdoc, Field.outboundlinks_rel_txt, outboundlinksRel);
// derive the rel flags from the outbound rel values so they line up with the other outboundlinks_* arrays
if (isEmpty() || contains(Field.outboundlinks_relflags_txt.name())) addSolr(solrdoc, Field.outboundlinks_relflags_txt, relEval(outboundlinksRel));
if (isEmpty() || contains(Field.outboundlinks_text_txt.name())) addSolr(solrdoc, Field.outboundlinks_text_txt, outboundlinksText);
// charset
addSolr(solrdoc, Field.charset_s, yacydoc.getCharset());
// coordinates
if (yacydoc.lat() != 0.0f && yacydoc.lon() != 0.0f) {
addSolr(solrdoc, Field.lon_coordinate, yacydoc.lon());
addSolr(solrdoc, Field.lat_coordinate, yacydoc.lat());
}
addSolr(solrdoc, Field.httpstatus_i, 200);
final Object parser = yacydoc.getParserObject();
if (parser instanceof ContentScraper) {
final ContentScraper html = (ContentScraper) parser;
@ -483,6 +402,8 @@ public class SolrScheme extends ConfigurationSet {
c = 0;
for (final ImageEntry ie: imagesc) {
final MultiProtocolURI uri = ie.url();
inboundLinks.remove(uri);
ouboundLinks.remove(uri);
imgtags[c] = ie.toString();
imgprots[c] = uri.getProtocol();
imgstubs[c] = uri.toString().substring(imgprots[c].length() + 3);
@ -503,6 +424,8 @@ public class SolrScheme extends ConfigurationSet {
c = 0;
for (final Map.Entry<MultiProtocolURI, String> entry: csss.entrySet()) {
final String url = entry.getKey().toNormalform(false, false, false, false);
// the link sets hold MultiProtocolURI objects, so remove by the URI key itself;
// removing the normalized String form would never match an element
inboundLinks.remove(entry.getKey());
ouboundLinks.remove(entry.getKey());
css_tag[c] =
"<link rel=\"stylesheet\" type=\"text/css\" media=\"" + entry.getValue() + "\"" +
" href=\""+ url + "\" />";
@ -520,6 +443,8 @@ public class SolrScheme extends ConfigurationSet {
final String[] scripts = new String[scriptss.size()];
c = 0;
for (final MultiProtocolURI url: scriptss) {
inboundLinks.remove(url);
ouboundLinks.remove(url);
scripts[c++] = url.toNormalform(false, false, false, false);
}
addSolr(solrdoc, Field.scriptscount_i, scripts.length);
@ -531,21 +456,24 @@ public class SolrScheme extends ConfigurationSet {
final Set<MultiProtocolURI> framess = html.getFrames();
final String[] frames = new String[framess.size()];
c = 0;
for (final MultiProtocolURI entry: framess) {
frames[c++] = entry.toNormalform(false, false, false, false);
for (final MultiProtocolURI url: framess) {
inboundLinks.remove(url);
ouboundLinks.remove(url);
frames[c++] = url.toNormalform(false, false, false, false);
}
addSolr(solrdoc, Field.framesscount_i, frames.length);
if (frames.length > 0) addSolr(solrdoc, Field.frames_txt, frames);
}
// IFrames
if (isEmpty() || contains(Field.iframes_txt.name()
)) {
if (isEmpty() || contains(Field.iframes_txt.name())) {
final Set<MultiProtocolURI> iframess = html.getIFrames();
final String[] iframes = new String[iframess.size()];
c = 0;
for (final MultiProtocolURI entry: iframess) {
iframes[c++] = entry.toNormalform(false, false, false, false);
for (final MultiProtocolURI url: iframess) {
inboundLinks.remove(url);
ouboundLinks.remove(url);
iframes[c++] = url.toNormalform(false, false, false, false);
}
addSolr(solrdoc, Field.iframesscount_i, iframes.length);
if (iframes.length > 0) addSolr(solrdoc, Field.iframes_txt, iframes);
@ -568,6 +496,94 @@ public class SolrScheme extends ConfigurationSet {
// response time
addSolr(solrdoc, Field.responsetime_i, header.get(HeaderFramework.RESPONSE_TIME_MILLIS, "0"));
}
// list all links
final Map<MultiProtocolURI, Properties> alllinks = yacydoc.getAnchors();
c = 0;
if (isEmpty() || contains(Field.inboundlinkscount_i.name())) addSolr(solrdoc, Field.inboundlinkscount_i, inboundLinks.size());
if (isEmpty() || contains(Field.inboundlinksnofollowcount_i.name())) addSolr(solrdoc, Field.inboundlinksnofollowcount_i, yacydoc.inboundLinkNofollowCount());
final String[] inboundlinksTag = new String[inboundLinks.size()];
final String[] inboundlinksURLProtocol = new String[inboundLinks.size()];
final String[] inboundlinksURLStub = new String[inboundLinks.size()];
final String[] inboundlinksName = new String[inboundLinks.size()];
final String[] inboundlinksRel = new String[inboundLinks.size()];
final String[] inboundlinksText = new String[inboundLinks.size()];
for (final MultiProtocolURI url: inboundLinks) {
final Properties p = alllinks.get(url);
final String name = p.getProperty("name", ""); // the name attribute
final String rel = p.getProperty("rel", ""); // the rel-attribute
final String text = p.getProperty("text", ""); // the text between the <a></a> tag
final String urls = url.toNormalform(false, false);
final int pr = urls.indexOf("://",0);
inboundlinksURLProtocol[c] = urls.substring(0, pr);
inboundlinksURLStub[c] = urls.substring(pr + 3);
inboundlinksName[c] = name.length() > 0 ? name : "";
inboundlinksRel[c] = rel.length() > 0 ? rel : "";
inboundlinksText[c] = text.length() > 0 ? text : "";
inboundlinksTag[c] =
"<a href=\"" + url.toNormalform(false, false) + "\"" +
(rel.length() > 0 ? " rel=\"" + rel + "\"" : "") +
(name.length() > 0 ? " name=\"" + name + "\"" : "") +
">" +
((text.length() > 0) ? text : "") + "</a>";
c++;
}
if (isEmpty() || contains(Field.inboundlinks_tag_txt.name())) addSolr(solrdoc, Field.inboundlinks_tag_txt, inboundlinksTag);
if (isEmpty() || contains(Field.inboundlinks_protocol_txt.name())) addSolr(solrdoc, Field.inboundlinks_protocol_txt, protocolList2indexedList(inboundlinksURLProtocol));
if (isEmpty() || contains(Field.inboundlinks_urlstub_txt.name())) addSolr(solrdoc, Field.inboundlinks_urlstub_txt, inboundlinksURLStub);
if (isEmpty() || contains(Field.inboundlinks_name_txt.name())) addSolr(solrdoc, Field.inboundlinks_name_txt, inboundlinksName);
if (isEmpty() || contains(Field.inboundlinks_rel_txt.name())) addSolr(solrdoc, Field.inboundlinks_rel_txt, inboundlinksRel);
if (isEmpty() || contains(Field.inboundlinks_relflags_txt.name())) addSolr(solrdoc, Field.inboundlinks_relflags_txt, relEval(inboundlinksRel));
if (isEmpty() || contains(Field.inboundlinks_text_txt.name())) addSolr(solrdoc, Field.inboundlinks_text_txt, inboundlinksText);
c = 0;
if (isEmpty() || contains(Field.outboundlinkscount_i.name())) addSolr(solrdoc, Field.outboundlinkscount_i, ouboundLinks.size());
if (isEmpty() || contains(Field.outboundlinksnofollowcount_i.name())) addSolr(solrdoc, Field.outboundlinksnofollowcount_i, yacydoc.outboundLinkNofollowCount());
final String[] outboundlinksTag = new String[ouboundLinks.size()];
final String[] outboundlinksURLProtocol = new String[ouboundLinks.size()];
final String[] outboundlinksURLStub = new String[ouboundLinks.size()];
final String[] outboundlinksName = new String[ouboundLinks.size()];
final String[] outboundlinksRel = new String[ouboundLinks.size()];
final String[] outboundlinksText = new String[ouboundLinks.size()];
for (final MultiProtocolURI url: ouboundLinks) {
final Properties p = alllinks.get(url);
final String name = p.getProperty("name", ""); // the name attribute
final String rel = p.getProperty("rel", ""); // the rel-attribute
final String text = p.getProperty("text", ""); // the text between the <a></a> tag
final String urls = url.toNormalform(false, false);
final int pr = urls.indexOf("://",0);
outboundlinksURLProtocol[c] = urls.substring(0, pr);
outboundlinksURLStub[c] = urls.substring(pr + 3);
outboundlinksName[c] = name.length() > 0 ? name : "";
outboundlinksRel[c] = rel.length() > 0 ? rel : "";
outboundlinksText[c] = text.length() > 0 ? text : "";
outboundlinksTag[c] =
"<a href=\"" + url.toNormalform(false, false) + "\"" +
(rel.length() > 0 ? " rel=\"" + rel + "\"" : "") +
(name.length() > 0 ? " name=\"" + name + "\"" : "") +
">" +
((text.length() > 0) ? text : "") + "</a>";
c++;
}
if (isEmpty() || contains(Field.outboundlinks_tag_txt.name())) addSolr(solrdoc, Field.outboundlinks_tag_txt, outboundlinksTag);
if (isEmpty() || contains(Field.outboundlinks_protocol_txt.name())) addSolr(solrdoc, Field.outboundlinks_protocol_txt, protocolList2indexedList(outboundlinksURLProtocol));
if (isEmpty() || contains(Field.outboundlinks_urlstub_txt.name())) addSolr(solrdoc, Field.outboundlinks_urlstub_txt, outboundlinksURLStub);
if (isEmpty() || contains(Field.outboundlinks_name_txt.name())) addSolr(solrdoc, Field.outboundlinks_name_txt, outboundlinksName);
if (isEmpty() || contains(Field.outboundlinks_rel_txt.name())) addSolr(solrdoc, Field.outboundlinks_rel_txt, outboundlinksRel);
// derive the rel flags from the outbound rel values so they line up with the other outboundlinks_* arrays
if (isEmpty() || contains(Field.outboundlinks_relflags_txt.name())) addSolr(solrdoc, Field.outboundlinks_relflags_txt, relEval(outboundlinksRel));
if (isEmpty() || contains(Field.outboundlinks_text_txt.name())) addSolr(solrdoc, Field.outboundlinks_text_txt, outboundlinksText);
// charset
addSolr(solrdoc, Field.charset_s, yacydoc.getCharset());
// coordinates
if (yacydoc.lat() != 0.0f && yacydoc.lon() != 0.0f) {
addSolr(solrdoc, Field.lon_coordinate, yacydoc.lon());
addSolr(solrdoc, Field.lat_coordinate, yacydoc.lat());
}
addSolr(solrdoc, Field.httpstatus_i, 200);
return solrdoc;
}

View File

@ -626,16 +626,6 @@ dc_rights
this.favicon = faviconURL;
}
/**
 * Counts the inbound links. Calls resortLinks() first when the inbound
 * link collection is still null.
 * @return the size of the inbound link collection, or 0 if it is still null afterwards
 */
public int inboundLinkCount() {
if (this.inboundlinks == null) {
resortLinks();
}
if (this.inboundlinks == null) {
return 0;
}
return this.inboundlinks.size();
}
/**
 * Counts the outbound links. Calls resortLinks() first when the outbound
 * link collection is still null.
 * @return the size of the outbound link collection, or 0 if it is still null afterwards
 */
public int outboundLinkCount() {
if (this.outboundlinks == null) {
resortLinks();
}
if (this.outboundlinks == null) {
return 0;
}
return this.outboundlinks.size();
}
public int inboundLinkNofollowCount() {
if (this.inboundlinks == null) resortLinks();
if (this.inboundlinks == null) return 0;

View File

@ -65,8 +65,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
private final char[] minuteCharsHTML = "&#039;".toCharArray();
// statics: for initialization of the HTMLFilterAbstractScraper
private static final Set<String> linkTags0 = new HashSet<String>(9,0.99f);
private static final Set<String> linkTags1 = new HashSet<String>(7,0.99f);
private static final Set<String> linkTags0 = new HashSet<String>(12,0.99f);
private static final Set<String> linkTags1 = new HashSet<String>(15,0.99f);
public enum TagType {
singleton, pair;
@ -119,6 +119,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
private final Map<MultiProtocolURI, Properties> anchors;
private final Map<MultiProtocolURI, String> rss, css;
private final Set<MultiProtocolURI> script, frames, iframes;
private final Map<MultiProtocolURI, EmbedEntry> embeds; // urlhash/embed relation
private final Map<MultiProtocolURI, ImageEntry> images; // urlhash/image relation
private final Map<String, String> metas;
private String title;
@ -159,6 +160,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.css = new HashMap<MultiProtocolURI, String>();
this.anchors = new HashMap<MultiProtocolURI, Properties>();
this.images = new HashMap<MultiProtocolURI, ImageEntry>();
this.embeds = new HashMap<MultiProtocolURI, EmbedEntry>();
this.frames = new HashSet<MultiProtocolURI>();
this.iframes = new HashSet<MultiProtocolURI>();
this.metas = new HashMap<String, String>();
@ -317,11 +319,11 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if (tagname.equalsIgnoreCase("img")) {
final String src = tagopts.getProperty("src", EMPTY_STRING);
try {
final int width = Integer.parseInt(tagopts.getProperty("width", "-1"));
final int height = Integer.parseInt(tagopts.getProperty("height", "-1"));
if (src.length() > 0) {
final MultiProtocolURI url = absolutePath(src);
if (url != null) {
final int width = Integer.parseInt(tagopts.getProperty("width", "-1"));
final int height = Integer.parseInt(tagopts.getProperty("height", "-1"));
final ImageEntry ie = new ImageEntry(url, tagopts.getProperty("alt", EMPTY_STRING), width, height, -1);
addImage(this.images, ie);
}
@ -334,6 +336,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
} catch (final MalformedURLException e) {}
} else if (tagname.equalsIgnoreCase("frame")) {
final MultiProtocolURI src = absolutePath(tagopts.getProperty("src", EMPTY_STRING));
tagopts.put("src", src.toNormalform(true, false));
mergeAnchors(src, tagopts /* with property "name" */);
this.frames.add(src);
this.evaluationScores.match(Element.framepath, src.toNormalform(true, false));
@ -361,13 +364,18 @@ public class ContentScraper extends AbstractScraper implements Scraper {
final String areatitle = cleanLine(tagopts.getProperty("title",EMPTY_STRING));
//String alt = tagopts.getProperty("alt",EMPTY_STRING);
final String href = tagopts.getProperty("href", EMPTY_STRING);
tagopts.put("nme", areatitle);
if (href.length() > 0) mergeAnchors(absolutePath(href), tagopts);
if (href.length() > 0) {
tagopts.put("nme", areatitle);
final MultiProtocolURI url = absolutePath(href);
// absolutePath can yield null (the img, link and embed branches check for it);
// skip an unresolvable href instead of failing on the dereference
if (url != null) {
// store the resolved, normalized form so the anchor properties carry an absolute href
tagopts.put("href", url.toNormalform(true, false));
mergeAnchors(url, tagopts);
}
}
} else if (tagname.equalsIgnoreCase("link")) {
final String href = tagopts.getProperty("href", EMPTY_STRING);
final MultiProtocolURI newLink = absolutePath(href);
if (newLink != null) {
tagopts.put("href", newLink.toNormalform(true, false));
final String rel = tagopts.getProperty("rel", EMPTY_STRING);
final String linktitle = tagopts.getProperty("title", EMPTY_STRING);
final String type = tagopts.getProperty("type", EMPTY_STRING);
@ -391,11 +399,26 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
}
} else if(tagname.equalsIgnoreCase("embed")) {
mergeAnchors(absolutePath(tagopts.getProperty("src", EMPTY_STRING)), tagopts /* with property "name" */);
final String src = tagopts.getProperty("src", EMPTY_STRING);
try {
if (src.length() > 0) {
final MultiProtocolURI url = absolutePath(src);
if (url != null) {
final int width = Integer.parseInt(tagopts.getProperty("width", "-1"));
final int height = Integer.parseInt(tagopts.getProperty("height", "-1"));
tagopts.put("src", url.toNormalform(true, false));
final EmbedEntry ie = new EmbedEntry(url, width, height, tagopts.getProperty("type", EMPTY_STRING), tagopts.getProperty("pluginspage", EMPTY_STRING));
this.embeds.put(url, ie);
mergeAnchors(url, tagopts);
}
}
} catch (final NumberFormatException e) {}
} else if(tagname.equalsIgnoreCase("param")) {
final String name = tagopts.getProperty("name", EMPTY_STRING);
if (name.equalsIgnoreCase("movie")) {
mergeAnchors(absolutePath(tagopts.getProperty("value", EMPTY_STRING)), tagopts /* with property "name" */);
final MultiProtocolURI url = absolutePath(tagopts.getProperty("value", EMPTY_STRING));
// absolutePath can yield null (the img, link and embed branches check for it);
// skip an unresolvable movie value instead of failing on the dereference
if (url != null) {
tagopts.put("value", url.toNormalform(true, false));
mergeAnchors(url, tagopts /* with property "name" */);
}
}
}
@ -419,6 +442,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
addImage(this.images, ie);
} else {
tagopts.put("text", recursiveParse(text));
tagopts.put("href", url.toNormalform(true, false)); // we must assign this because the url may have resolved backpaths and may not be absolute
mergeAnchors(url, tagopts);
}
}
@ -460,6 +484,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if (h.length() > 0) this.li.add(h);
} else if (tagname.equalsIgnoreCase("iframe")) {
final MultiProtocolURI src = absolutePath(tagopts.getProperty("src", EMPTY_STRING));
tagopts.put("src", src.toNormalform(true, false));
mergeAnchors(src, tagopts /* with property "name" */);
this.iframes.add(src);
this.evaluationScores.match(Element.iframepath, src.toNormalform(true, false));
@ -654,10 +679,13 @@ public class ContentScraper extends AbstractScraper implements Scraper {
* @return a map of <urlhash, ImageEntry>
*/
public Map<MultiProtocolURI, ImageEntry> getImages() {
// hands out the scraper's own image map (image URL -> ImageEntry), not a copy
return this.images;
}
/**
 * @return the scraper's own map of embed URL to EmbedEntry (not a copy)
 */
public Map<MultiProtocolURI, EmbedEntry> getEmbeds() {
return this.embeds;
}
/**
 * @return the scraper's own metas map (String keys to String values), not a copy
 */
public Map<String, String> getMetas() {
return this.metas;
}

View File

@ -385,11 +385,11 @@ public class Segment {
new byte[0], // md5
(int) sourcesize, // size
condenser.RESULT_NUMB_WORDS, // word count
Response.docType(document.dc_format()), // doctype
Response.docType(document.dc_format()), // doctype
condenser.RESULT_FLAGS, // flags
UTF8.getBytes(language), // language
document.inboundLinkCount(), // inbound links
document.outboundLinkCount(), // outbound links
document.inboundLinks().size(), // inbound links
document.outboundLinks().size(), // outbound links
document.getAudiolinks().size(), // laudio
document.getImages().size(), // limage
document.getVideolinks().size(), // lvideo
@ -409,8 +409,8 @@ public class Segment {
condenser, // document condenser
language, // document language
Response.docType(document.dc_format()), // document type
document.inboundLinkCount(), // inbound links
document.outboundLinkCount(), // outbound links
document.inboundLinks().size(), // inbound links
document.outboundLinks().size(), // outbound links
searchEvent, // a search event that can have results directly
sourceName // the name of the source where the index was created
);