mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
- solved problems with backpath normalization
- redesigned in/outbound link handover - removed iframe links from inbound/outbound in solr scheme
This commit is contained in:
parent
5f5ed33ed8
commit
453010bd68
|
@ -106,7 +106,7 @@ public class webstructure {
|
|||
prop.put("references_count", 1);
|
||||
prop.put("references_documents", 1);
|
||||
prop.put("references_documents_0_hash", urlhash);
|
||||
prop.put("references_documents_0_count", scraper.inboundLinkCount() + scraper.outboundLinkCount());
|
||||
prop.put("references_documents_0_count", scraper.inboundLinks().size() + scraper.outboundLinks().size());
|
||||
prop.put("references_documents_0_date", GenericFormatter.SHORT_DAY_FORMATTER.format(new Date()));
|
||||
prop.put("references_documents_0_urle", url == null ? 0 : 1);
|
||||
if (url != null) prop.putXML("references_documents_0_urle_url", url.toNormalform(true, false));
|
||||
|
|
|
@ -84,31 +84,31 @@ public class SolrScheme extends ConfigurationSet {
|
|||
*/
|
||||
}
|
||||
|
||||
private void addSolr(final SolrInputDocument solrdoc, final Field key, final String value) {
|
||||
protected void addSolr(final SolrInputDocument solrdoc, final Field key, final String value) {
|
||||
if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value);
|
||||
}
|
||||
|
||||
private void addSolr(final SolrInputDocument solrdoc, final Field key, final Date value) {
|
||||
protected void addSolr(final SolrInputDocument solrdoc, final Field key, final Date value) {
|
||||
if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value);
|
||||
}
|
||||
|
||||
private void addSolr(final SolrInputDocument solrdoc, final Field key, final int value) {
|
||||
protected void addSolr(final SolrInputDocument solrdoc, final Field key, final int value) {
|
||||
if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value);
|
||||
}
|
||||
|
||||
private void addSolr(final SolrInputDocument solrdoc, final Field key, final String[] value) {
|
||||
protected void addSolr(final SolrInputDocument solrdoc, final Field key, final String[] value) {
|
||||
if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value);
|
||||
}
|
||||
|
||||
private void addSolr(final SolrInputDocument solrdoc, final Field key, final float value) {
|
||||
protected void addSolr(final SolrInputDocument solrdoc, final Field key, final float value) {
|
||||
if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value);
|
||||
}
|
||||
|
||||
private void addSolr(final SolrInputDocument solrdoc, final Field key, final boolean value) {
|
||||
protected void addSolr(final SolrInputDocument solrdoc, final Field key, final boolean value) {
|
||||
if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value);
|
||||
}
|
||||
|
||||
private void addSolr(final SolrInputDocument solrdoc, final Field key, final String value, final float boost) {
|
||||
protected void addSolr(final SolrInputDocument solrdoc, final Field key, final String value, final float boost) {
|
||||
if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value, boost);
|
||||
}
|
||||
|
||||
|
@ -308,92 +308,11 @@ public class SolrScheme extends ConfigurationSet {
|
|||
if (paths.length > 0) addSolr(solrdoc, Field.paths_txt, paths);
|
||||
}
|
||||
|
||||
// list all links
|
||||
final Map<MultiProtocolURI, Properties> alllinks = yacydoc.getAnchors();
|
||||
// get list of all links; they will be shrunk by urls that appear in other fields of the solr scheme
|
||||
Set<MultiProtocolURI> inboundLinks = yacydoc.inboundLinks();
|
||||
Set<MultiProtocolURI> ouboundLinks = yacydoc.outboundLinks();
|
||||
|
||||
int c = 0;
|
||||
if (isEmpty() || contains(Field.inboundlinkscount_i.name())) addSolr(solrdoc, Field.inboundlinkscount_i, yacydoc.inboundLinkCount());
|
||||
if (isEmpty() || contains(Field.inboundlinksnofollowcount_i.name())) addSolr(solrdoc, Field.inboundlinksnofollowcount_i, yacydoc.inboundLinkNofollowCount());
|
||||
final String[] inboundlinksTag = new String[yacydoc.inboundLinkCount()];
|
||||
final String[] inboundlinksURLProtocol = new String[yacydoc.inboundLinkCount()];
|
||||
final String[] inboundlinksURLStub = new String[yacydoc.inboundLinkCount()];
|
||||
final String[] inboundlinksName = new String[yacydoc.inboundLinkCount()];
|
||||
final String[] inboundlinksRel = new String[yacydoc.inboundLinkCount()];
|
||||
final String[] inboundlinksText = new String[yacydoc.inboundLinkCount()];
|
||||
for (final MultiProtocolURI url: yacydoc.inboundLinks()) {
|
||||
final Properties p = alllinks.get(url);
|
||||
final String name = p.getProperty("name", ""); // the name attribute
|
||||
final String rel = p.getProperty("rel", ""); // the rel-attribute
|
||||
final String text = p.getProperty("text", ""); // the text between the <a></a> tag
|
||||
final String urls = url.toNormalform(false, false);
|
||||
final int pr = urls.indexOf("://",0);
|
||||
inboundlinksURLProtocol[c] = urls.substring(0, pr);
|
||||
inboundlinksURLStub[c] = urls.substring(pr + 3);
|
||||
inboundlinksName[c] = name.length() > 0 ? name : "";
|
||||
inboundlinksRel[c] = rel.length() > 0 ? rel : "";
|
||||
inboundlinksText[c] = text.length() > 0 ? text : "";
|
||||
inboundlinksTag[c] =
|
||||
"<a href=\"" + url.toNormalform(false, false) + "\"" +
|
||||
(rel.length() > 0 ? " rel=\"" + rel + "\"" : "") +
|
||||
(name.length() > 0 ? " name=\"" + name + "\"" : "") +
|
||||
">" +
|
||||
((text.length() > 0) ? text : "") + "</a>";
|
||||
c++;
|
||||
}
|
||||
if (isEmpty() || contains(Field.inboundlinks_tag_txt.name())) addSolr(solrdoc, Field.inboundlinks_tag_txt, inboundlinksTag);
|
||||
if (isEmpty() || contains(Field.inboundlinks_protocol_txt.name())) addSolr(solrdoc, Field.inboundlinks_protocol_txt, protocolList2indexedList(inboundlinksURLProtocol));
|
||||
if (isEmpty() || contains(Field.inboundlinks_urlstub_txt.name())) addSolr(solrdoc, Field.inboundlinks_urlstub_txt, inboundlinksURLStub);
|
||||
if (isEmpty() || contains(Field.inboundlinks_name_txt.name())) addSolr(solrdoc, Field.inboundlinks_name_txt, inboundlinksName);
|
||||
if (isEmpty() || contains(Field.inboundlinks_rel_txt.name())) addSolr(solrdoc, Field.inboundlinks_rel_txt, inboundlinksRel);
|
||||
if (isEmpty() || contains(Field.inboundlinks_relflags_txt.name())) addSolr(solrdoc, Field.inboundlinks_relflags_txt, relEval(inboundlinksRel));
|
||||
if (isEmpty() || contains(Field.inboundlinks_text_txt.name())) addSolr(solrdoc, Field.inboundlinks_text_txt, inboundlinksText);
|
||||
|
||||
c = 0;
|
||||
if (isEmpty() || contains(Field.outboundlinkscount_i.name())) addSolr(solrdoc, Field.outboundlinkscount_i, yacydoc.outboundLinkCount());
|
||||
if (isEmpty() || contains(Field.outboundlinksnofollowcount_i.name())) addSolr(solrdoc, Field.outboundlinksnofollowcount_i, yacydoc.outboundLinkNofollowCount());
|
||||
final String[] outboundlinksTag = new String[yacydoc.outboundLinkCount()];
|
||||
final String[] outboundlinksURLProtocol = new String[yacydoc.outboundLinkCount()];
|
||||
final String[] outboundlinksURLStub = new String[yacydoc.outboundLinkCount()];
|
||||
final String[] outboundlinksName = new String[yacydoc.outboundLinkCount()];
|
||||
final String[] outboundlinksRel = new String[yacydoc.outboundLinkCount()];
|
||||
final String[] outboundlinksText = new String[yacydoc.outboundLinkCount()];
|
||||
for (final MultiProtocolURI url: yacydoc.outboundLinks()) {
|
||||
final Properties p = alllinks.get(url);
|
||||
final String name = p.getProperty("name", ""); // the name attribute
|
||||
final String rel = p.getProperty("rel", ""); // the rel-attribute
|
||||
final String text = p.getProperty("text", ""); // the text between the <a></a> tag
|
||||
final String urls = url.toNormalform(false, false);
|
||||
final int pr = urls.indexOf("://",0);
|
||||
outboundlinksURLProtocol[c] = urls.substring(0, pr);
|
||||
outboundlinksURLStub[c] = urls.substring(pr + 3);
|
||||
outboundlinksName[c] = name.length() > 0 ? name : "";
|
||||
outboundlinksRel[c] = rel.length() > 0 ? rel : "";
|
||||
outboundlinksText[c] = text.length() > 0 ? text : "";
|
||||
outboundlinksTag[c] =
|
||||
"<a href=\"" + url.toNormalform(false, false) + "\"" +
|
||||
(rel.length() > 0 ? " rel=\"" + rel + "\"" : "") +
|
||||
(name.length() > 0 ? " name=\"" + name + "\"" : "") +
|
||||
">" +
|
||||
((text.length() > 0) ? text : "") + "</a>";
|
||||
c++;
|
||||
}
|
||||
if (isEmpty() || contains(Field.outboundlinks_tag_txt.name())) addSolr(solrdoc, Field.outboundlinks_tag_txt, outboundlinksTag);
|
||||
if (isEmpty() || contains(Field.outboundlinks_protocol_txt.name())) addSolr(solrdoc, Field.outboundlinks_protocol_txt, protocolList2indexedList(outboundlinksURLProtocol));
|
||||
if (isEmpty() || contains(Field.outboundlinks_urlstub_txt.name())) addSolr(solrdoc, Field.outboundlinks_urlstub_txt, outboundlinksURLStub);
|
||||
if (isEmpty() || contains(Field.outboundlinks_name_txt.name())) addSolr(solrdoc, Field.outboundlinks_name_txt, outboundlinksName);
|
||||
if (isEmpty() || contains(Field.outboundlinks_rel_txt.name())) addSolr(solrdoc, Field.outboundlinks_rel_txt, outboundlinksRel);
|
||||
if (isEmpty() || contains(Field.outboundlinks_relflags_txt.name())) addSolr(solrdoc, Field.outboundlinks_relflags_txt, relEval(inboundlinksRel));
|
||||
if (isEmpty() || contains(Field.outboundlinks_text_txt.name())) addSolr(solrdoc, Field.outboundlinks_text_txt, outboundlinksText);
|
||||
|
||||
|
||||
// charset
|
||||
addSolr(solrdoc, Field.charset_s, yacydoc.getCharset());
|
||||
|
||||
// coordinates
|
||||
if (yacydoc.lat() != 0.0f && yacydoc.lon() != 0.0f) {
|
||||
addSolr(solrdoc, Field.lon_coordinate, yacydoc.lon());
|
||||
addSolr(solrdoc, Field.lat_coordinate, yacydoc.lat());
|
||||
}
|
||||
addSolr(solrdoc, Field.httpstatus_i, 200);
|
||||
final Object parser = yacydoc.getParserObject();
|
||||
if (parser instanceof ContentScraper) {
|
||||
final ContentScraper html = (ContentScraper) parser;
|
||||
|
@ -483,6 +402,8 @@ public class SolrScheme extends ConfigurationSet {
|
|||
c = 0;
|
||||
for (final ImageEntry ie: imagesc) {
|
||||
final MultiProtocolURI uri = ie.url();
|
||||
inboundLinks.remove(uri);
|
||||
ouboundLinks.remove(uri);
|
||||
imgtags[c] = ie.toString();
|
||||
imgprots[c] = uri.getProtocol();
|
||||
imgstubs[c] = uri.toString().substring(imgprots[c].length() + 3);
|
||||
|
@ -503,6 +424,8 @@ public class SolrScheme extends ConfigurationSet {
|
|||
c = 0;
|
||||
for (final Map.Entry<MultiProtocolURI, String> entry: csss.entrySet()) {
|
||||
final String url = entry.getKey().toNormalform(false, false, false, false);
|
||||
inboundLinks.remove(entry.getKey()); // remove by URI key: the sets hold MultiProtocolURI, so the normalized String never matches
|
||||
ouboundLinks.remove(entry.getKey()); // same here; 'url' (String) is only used for building the css tag below
|
||||
css_tag[c] =
|
||||
"<link rel=\"stylesheet\" type=\"text/css\" media=\"" + entry.getValue() + "\"" +
|
||||
" href=\""+ url + "\" />";
|
||||
|
@ -520,6 +443,8 @@ public class SolrScheme extends ConfigurationSet {
|
|||
final String[] scripts = new String[scriptss.size()];
|
||||
c = 0;
|
||||
for (final MultiProtocolURI url: scriptss) {
|
||||
inboundLinks.remove(url);
|
||||
ouboundLinks.remove(url);
|
||||
scripts[c++] = url.toNormalform(false, false, false, false);
|
||||
}
|
||||
addSolr(solrdoc, Field.scriptscount_i, scripts.length);
|
||||
|
@ -531,21 +456,24 @@ public class SolrScheme extends ConfigurationSet {
|
|||
final Set<MultiProtocolURI> framess = html.getFrames();
|
||||
final String[] frames = new String[framess.size()];
|
||||
c = 0;
|
||||
for (final MultiProtocolURI entry: framess) {
|
||||
frames[c++] = entry.toNormalform(false, false, false, false);
|
||||
for (final MultiProtocolURI url: framess) {
|
||||
inboundLinks.remove(url);
|
||||
ouboundLinks.remove(url);
|
||||
frames[c++] = url.toNormalform(false, false, false, false);
|
||||
}
|
||||
addSolr(solrdoc, Field.framesscount_i, frames.length);
|
||||
if (frames.length > 0) addSolr(solrdoc, Field.frames_txt, frames);
|
||||
}
|
||||
|
||||
// IFrames
|
||||
if (isEmpty() || contains(Field.iframes_txt.name()
|
||||
)) {
|
||||
if (isEmpty() || contains(Field.iframes_txt.name())) {
|
||||
final Set<MultiProtocolURI> iframess = html.getIFrames();
|
||||
final String[] iframes = new String[iframess.size()];
|
||||
c = 0;
|
||||
for (final MultiProtocolURI entry: iframess) {
|
||||
iframes[c++] = entry.toNormalform(false, false, false, false);
|
||||
for (final MultiProtocolURI url: iframess) {
|
||||
inboundLinks.remove(url);
|
||||
ouboundLinks.remove(url);
|
||||
iframes[c++] = url.toNormalform(false, false, false, false);
|
||||
}
|
||||
addSolr(solrdoc, Field.iframesscount_i, iframes.length);
|
||||
if (iframes.length > 0) addSolr(solrdoc, Field.iframes_txt, iframes);
|
||||
|
@ -568,6 +496,94 @@ public class SolrScheme extends ConfigurationSet {
|
|||
// response time
|
||||
addSolr(solrdoc, Field.responsetime_i, header.get(HeaderFramework.RESPONSE_TIME_MILLIS, "0"));
|
||||
}
|
||||
|
||||
// list all links
|
||||
final Map<MultiProtocolURI, Properties> alllinks = yacydoc.getAnchors();
|
||||
c = 0;
|
||||
if (isEmpty() || contains(Field.inboundlinkscount_i.name())) addSolr(solrdoc, Field.inboundlinkscount_i, inboundLinks.size());
|
||||
if (isEmpty() || contains(Field.inboundlinksnofollowcount_i.name())) addSolr(solrdoc, Field.inboundlinksnofollowcount_i, yacydoc.inboundLinkNofollowCount());
|
||||
final String[] inboundlinksTag = new String[inboundLinks.size()];
|
||||
final String[] inboundlinksURLProtocol = new String[inboundLinks.size()];
|
||||
final String[] inboundlinksURLStub = new String[inboundLinks.size()];
|
||||
final String[] inboundlinksName = new String[inboundLinks.size()];
|
||||
final String[] inboundlinksRel = new String[inboundLinks.size()];
|
||||
final String[] inboundlinksText = new String[inboundLinks.size()];
|
||||
for (final MultiProtocolURI url: inboundLinks) {
|
||||
final Properties p = alllinks.get(url);
|
||||
final String name = p.getProperty("name", ""); // the name attribute
|
||||
final String rel = p.getProperty("rel", ""); // the rel-attribute
|
||||
final String text = p.getProperty("text", ""); // the text between the <a></a> tag
|
||||
final String urls = url.toNormalform(false, false);
|
||||
final int pr = urls.indexOf("://",0);
|
||||
inboundlinksURLProtocol[c] = urls.substring(0, pr);
|
||||
inboundlinksURLStub[c] = urls.substring(pr + 3);
|
||||
inboundlinksName[c] = name.length() > 0 ? name : "";
|
||||
inboundlinksRel[c] = rel.length() > 0 ? rel : "";
|
||||
inboundlinksText[c] = text.length() > 0 ? text : "";
|
||||
inboundlinksTag[c] =
|
||||
"<a href=\"" + url.toNormalform(false, false) + "\"" +
|
||||
(rel.length() > 0 ? " rel=\"" + rel + "\"" : "") +
|
||||
(name.length() > 0 ? " name=\"" + name + "\"" : "") +
|
||||
">" +
|
||||
((text.length() > 0) ? text : "") + "</a>";
|
||||
c++;
|
||||
}
|
||||
if (isEmpty() || contains(Field.inboundlinks_tag_txt.name())) addSolr(solrdoc, Field.inboundlinks_tag_txt, inboundlinksTag);
|
||||
if (isEmpty() || contains(Field.inboundlinks_protocol_txt.name())) addSolr(solrdoc, Field.inboundlinks_protocol_txt, protocolList2indexedList(inboundlinksURLProtocol));
|
||||
if (isEmpty() || contains(Field.inboundlinks_urlstub_txt.name())) addSolr(solrdoc, Field.inboundlinks_urlstub_txt, inboundlinksURLStub);
|
||||
if (isEmpty() || contains(Field.inboundlinks_name_txt.name())) addSolr(solrdoc, Field.inboundlinks_name_txt, inboundlinksName);
|
||||
if (isEmpty() || contains(Field.inboundlinks_rel_txt.name())) addSolr(solrdoc, Field.inboundlinks_rel_txt, inboundlinksRel);
|
||||
if (isEmpty() || contains(Field.inboundlinks_relflags_txt.name())) addSolr(solrdoc, Field.inboundlinks_relflags_txt, relEval(inboundlinksRel));
|
||||
if (isEmpty() || contains(Field.inboundlinks_text_txt.name())) addSolr(solrdoc, Field.inboundlinks_text_txt, inboundlinksText);
|
||||
|
||||
c = 0;
|
||||
if (isEmpty() || contains(Field.outboundlinkscount_i.name())) addSolr(solrdoc, Field.outboundlinkscount_i, ouboundLinks.size());
|
||||
if (isEmpty() || contains(Field.outboundlinksnofollowcount_i.name())) addSolr(solrdoc, Field.outboundlinksnofollowcount_i, yacydoc.outboundLinkNofollowCount());
|
||||
final String[] outboundlinksTag = new String[ouboundLinks.size()];
|
||||
final String[] outboundlinksURLProtocol = new String[ouboundLinks.size()];
|
||||
final String[] outboundlinksURLStub = new String[ouboundLinks.size()];
|
||||
final String[] outboundlinksName = new String[ouboundLinks.size()];
|
||||
final String[] outboundlinksRel = new String[ouboundLinks.size()];
|
||||
final String[] outboundlinksText = new String[ouboundLinks.size()];
|
||||
for (final MultiProtocolURI url: ouboundLinks) {
|
||||
final Properties p = alllinks.get(url);
|
||||
final String name = p.getProperty("name", ""); // the name attribute
|
||||
final String rel = p.getProperty("rel", ""); // the rel-attribute
|
||||
final String text = p.getProperty("text", ""); // the text between the <a></a> tag
|
||||
final String urls = url.toNormalform(false, false);
|
||||
final int pr = urls.indexOf("://",0);
|
||||
outboundlinksURLProtocol[c] = urls.substring(0, pr);
|
||||
outboundlinksURLStub[c] = urls.substring(pr + 3);
|
||||
outboundlinksName[c] = name.length() > 0 ? name : "";
|
||||
outboundlinksRel[c] = rel.length() > 0 ? rel : "";
|
||||
outboundlinksText[c] = text.length() > 0 ? text : "";
|
||||
outboundlinksTag[c] =
|
||||
"<a href=\"" + url.toNormalform(false, false) + "\"" +
|
||||
(rel.length() > 0 ? " rel=\"" + rel + "\"" : "") +
|
||||
(name.length() > 0 ? " name=\"" + name + "\"" : "") +
|
||||
">" +
|
||||
((text.length() > 0) ? text : "") + "</a>";
|
||||
c++;
|
||||
}
|
||||
if (isEmpty() || contains(Field.outboundlinks_tag_txt.name())) addSolr(solrdoc, Field.outboundlinks_tag_txt, outboundlinksTag);
|
||||
if (isEmpty() || contains(Field.outboundlinks_protocol_txt.name())) addSolr(solrdoc, Field.outboundlinks_protocol_txt, protocolList2indexedList(outboundlinksURLProtocol));
|
||||
if (isEmpty() || contains(Field.outboundlinks_urlstub_txt.name())) addSolr(solrdoc, Field.outboundlinks_urlstub_txt, outboundlinksURLStub);
|
||||
if (isEmpty() || contains(Field.outboundlinks_name_txt.name())) addSolr(solrdoc, Field.outboundlinks_name_txt, outboundlinksName);
|
||||
if (isEmpty() || contains(Field.outboundlinks_rel_txt.name())) addSolr(solrdoc, Field.outboundlinks_rel_txt, outboundlinksRel);
|
||||
if (isEmpty() || contains(Field.outboundlinks_relflags_txt.name())) addSolr(solrdoc, Field.outboundlinks_relflags_txt, relEval(outboundlinksRel)); // rel flags must be evaluated on the outbound rel array, not the inbound one
|
||||
if (isEmpty() || contains(Field.outboundlinks_text_txt.name())) addSolr(solrdoc, Field.outboundlinks_text_txt, outboundlinksText);
|
||||
|
||||
|
||||
// charset
|
||||
addSolr(solrdoc, Field.charset_s, yacydoc.getCharset());
|
||||
|
||||
// coordinates
|
||||
if (yacydoc.lat() != 0.0f && yacydoc.lon() != 0.0f) {
|
||||
addSolr(solrdoc, Field.lon_coordinate, yacydoc.lon());
|
||||
addSolr(solrdoc, Field.lat_coordinate, yacydoc.lat());
|
||||
}
|
||||
addSolr(solrdoc, Field.httpstatus_i, 200);
|
||||
|
||||
return solrdoc;
|
||||
}
|
||||
|
||||
|
|
|
@ -626,16 +626,6 @@ dc_rights
|
|||
this.favicon = faviconURL;
|
||||
}
|
||||
|
||||
public int inboundLinkCount() {
|
||||
if (this.inboundlinks == null) resortLinks();
|
||||
return (this.inboundlinks == null) ? 0 : this.inboundlinks.size();
|
||||
}
|
||||
|
||||
public int outboundLinkCount() {
|
||||
if (this.outboundlinks == null) resortLinks();
|
||||
return (this.outboundlinks == null) ? 0 : this.outboundlinks.size();
|
||||
}
|
||||
|
||||
public int inboundLinkNofollowCount() {
|
||||
if (this.inboundlinks == null) resortLinks();
|
||||
if (this.inboundlinks == null) return 0;
|
||||
|
|
|
@ -65,8 +65,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
private final char[] minuteCharsHTML = "'".toCharArray();
|
||||
|
||||
// statics: for initialization of the HTMLFilterAbstractScraper
|
||||
private static final Set<String> linkTags0 = new HashSet<String>(9,0.99f);
|
||||
private static final Set<String> linkTags1 = new HashSet<String>(7,0.99f);
|
||||
private static final Set<String> linkTags0 = new HashSet<String>(12,0.99f);
|
||||
private static final Set<String> linkTags1 = new HashSet<String>(15,0.99f);
|
||||
|
||||
public enum TagType {
|
||||
singleton, pair;
|
||||
|
@ -119,6 +119,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
private final Map<MultiProtocolURI, Properties> anchors;
|
||||
private final Map<MultiProtocolURI, String> rss, css;
|
||||
private final Set<MultiProtocolURI> script, frames, iframes;
|
||||
private final Map<MultiProtocolURI, EmbedEntry> embeds; // urlhash/embed relation
|
||||
private final Map<MultiProtocolURI, ImageEntry> images; // urlhash/image relation
|
||||
private final Map<String, String> metas;
|
||||
private String title;
|
||||
|
@ -159,6 +160,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
this.css = new HashMap<MultiProtocolURI, String>();
|
||||
this.anchors = new HashMap<MultiProtocolURI, Properties>();
|
||||
this.images = new HashMap<MultiProtocolURI, ImageEntry>();
|
||||
this.embeds = new HashMap<MultiProtocolURI, EmbedEntry>();
|
||||
this.frames = new HashSet<MultiProtocolURI>();
|
||||
this.iframes = new HashSet<MultiProtocolURI>();
|
||||
this.metas = new HashMap<String, String>();
|
||||
|
@ -317,11 +319,11 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
if (tagname.equalsIgnoreCase("img")) {
|
||||
final String src = tagopts.getProperty("src", EMPTY_STRING);
|
||||
try {
|
||||
final int width = Integer.parseInt(tagopts.getProperty("width", "-1"));
|
||||
final int height = Integer.parseInt(tagopts.getProperty("height", "-1"));
|
||||
if (src.length() > 0) {
|
||||
final MultiProtocolURI url = absolutePath(src);
|
||||
if (url != null) {
|
||||
final int width = Integer.parseInt(tagopts.getProperty("width", "-1"));
|
||||
final int height = Integer.parseInt(tagopts.getProperty("height", "-1"));
|
||||
final ImageEntry ie = new ImageEntry(url, tagopts.getProperty("alt", EMPTY_STRING), width, height, -1);
|
||||
addImage(this.images, ie);
|
||||
}
|
||||
|
@ -334,6 +336,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
} catch (final MalformedURLException e) {}
|
||||
} else if (tagname.equalsIgnoreCase("frame")) {
|
||||
final MultiProtocolURI src = absolutePath(tagopts.getProperty("src", EMPTY_STRING));
|
||||
tagopts.put("src", src.toNormalform(true, false));
|
||||
mergeAnchors(src, tagopts /* with property "name" */);
|
||||
this.frames.add(src);
|
||||
this.evaluationScores.match(Element.framepath, src.toNormalform(true, false));
|
||||
|
@ -361,13 +364,18 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
final String areatitle = cleanLine(tagopts.getProperty("title",EMPTY_STRING));
|
||||
//String alt = tagopts.getProperty("alt",EMPTY_STRING);
|
||||
final String href = tagopts.getProperty("href", EMPTY_STRING);
|
||||
tagopts.put("nme", areatitle);
|
||||
if (href.length() > 0) mergeAnchors(absolutePath(href), tagopts);
|
||||
if (href.length() > 0) {
|
||||
tagopts.put("nme", areatitle);
|
||||
MultiProtocolURI url = absolutePath(href);
|
||||
if (url != null) tagopts.put("href", url.toNormalform(true, false)); // absolutePath may return null (see the null checks in the img/link/embed branches)
|
||||
if (url != null) mergeAnchors(url, tagopts); // skip unresolvable hrefs instead of registering a null anchor
|
||||
}
|
||||
} else if (tagname.equalsIgnoreCase("link")) {
|
||||
final String href = tagopts.getProperty("href", EMPTY_STRING);
|
||||
final MultiProtocolURI newLink = absolutePath(href);
|
||||
|
||||
if (newLink != null) {
|
||||
tagopts.put("href", newLink.toNormalform(true, false));
|
||||
final String rel = tagopts.getProperty("rel", EMPTY_STRING);
|
||||
final String linktitle = tagopts.getProperty("title", EMPTY_STRING);
|
||||
final String type = tagopts.getProperty("type", EMPTY_STRING);
|
||||
|
@ -391,11 +399,26 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
}
|
||||
}
|
||||
} else if(tagname.equalsIgnoreCase("embed")) {
|
||||
mergeAnchors(absolutePath(tagopts.getProperty("src", EMPTY_STRING)), tagopts /* with property "name" */);
|
||||
final String src = tagopts.getProperty("src", EMPTY_STRING);
|
||||
try {
|
||||
if (src.length() > 0) {
|
||||
final MultiProtocolURI url = absolutePath(src);
|
||||
if (url != null) {
|
||||
final int width = Integer.parseInt(tagopts.getProperty("width", "-1"));
|
||||
final int height = Integer.parseInt(tagopts.getProperty("height", "-1"));
|
||||
tagopts.put("src", url.toNormalform(true, false));
|
||||
final EmbedEntry ie = new EmbedEntry(url, width, height, tagopts.getProperty("type", EMPTY_STRING), tagopts.getProperty("pluginspage", EMPTY_STRING));
|
||||
this.embeds.put(url, ie);
|
||||
mergeAnchors(url, tagopts);
|
||||
}
|
||||
}
|
||||
} catch (final NumberFormatException e) {}
|
||||
} else if(tagname.equalsIgnoreCase("param")) {
|
||||
final String name = tagopts.getProperty("name", EMPTY_STRING);
|
||||
if (name.equalsIgnoreCase("movie")) {
|
||||
mergeAnchors(absolutePath(tagopts.getProperty("value", EMPTY_STRING)), tagopts /* with property "name" */);
|
||||
MultiProtocolURI url = absolutePath(tagopts.getProperty("value", EMPTY_STRING));
|
||||
if (url != null) tagopts.put("value", url.toNormalform(true, false)); // absolutePath may return null for an empty or malformed value
|
||||
if (url != null) mergeAnchors(url, tagopts /* with property "name" */); // only register anchors for resolvable movie urls
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -419,6 +442,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
addImage(this.images, ie);
|
||||
} else {
|
||||
tagopts.put("text", recursiveParse(text));
|
||||
tagopts.put("href", url.toNormalform(true, false)); // we must assign this because the url may have resolved backpaths and may not be absolute
|
||||
mergeAnchors(url, tagopts);
|
||||
}
|
||||
}
|
||||
|
@ -460,6 +484,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
if (h.length() > 0) this.li.add(h);
|
||||
} else if (tagname.equalsIgnoreCase("iframe")) {
|
||||
final MultiProtocolURI src = absolutePath(tagopts.getProperty("src", EMPTY_STRING));
|
||||
tagopts.put("src", src.toNormalform(true, false));
|
||||
mergeAnchors(src, tagopts /* with property "name" */);
|
||||
this.iframes.add(src);
|
||||
this.evaluationScores.match(Element.iframepath, src.toNormalform(true, false));
|
||||
|
@ -654,10 +679,13 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
* @return a map of <urlhash, ImageEntry>
|
||||
*/
|
||||
public Map<MultiProtocolURI, ImageEntry> getImages() {
|
||||
// this returns a String(absolute url)/htmlFilterImageEntry - relation
|
||||
return this.images;
|
||||
}
|
||||
|
||||
public Map<MultiProtocolURI, EmbedEntry> getEmbeds() {
|
||||
return this.embeds;
|
||||
}
|
||||
|
||||
public Map<String, String> getMetas() {
|
||||
return this.metas;
|
||||
}
|
||||
|
|
|
@ -385,11 +385,11 @@ public class Segment {
|
|||
new byte[0], // md5
|
||||
(int) sourcesize, // size
|
||||
condenser.RESULT_NUMB_WORDS, // word count
|
||||
Response.docType(document.dc_format()), // doctype
|
||||
Response.docType(document.dc_format()), // doctype
|
||||
condenser.RESULT_FLAGS, // flags
|
||||
UTF8.getBytes(language), // language
|
||||
document.inboundLinkCount(), // inbound links
|
||||
document.outboundLinkCount(), // outbound links
|
||||
document.inboundLinks().size(), // inbound links
|
||||
document.outboundLinks().size(), // outbound links
|
||||
document.getAudiolinks().size(), // laudio
|
||||
document.getImages().size(), // limage
|
||||
document.getVideolinks().size(), // lvideo
|
||||
|
@ -409,8 +409,8 @@ public class Segment {
|
|||
condenser, // document condenser
|
||||
language, // document language
|
||||
Response.docType(document.dc_format()), // document type
|
||||
document.inboundLinkCount(), // inbound links
|
||||
document.outboundLinkCount(), // outbound links
|
||||
document.inboundLinks().size(), // inbound links
|
||||
document.outboundLinks().size(), // outbound links
|
||||
searchEvent, // a search event that can have results directly
|
||||
sourceName // the name of the source where the index was created
|
||||
);
|
||||
|
|
Loading…
Reference in New Issue
Block a user