added the anchor text within web pages to the searcheable entities of a

web page. This can be of benefit for the ranking if these fields are
used for boosts.
This commit is contained in:
orbiter 2013-10-08 18:41:07 +02:00
parent 705b3338ee
commit 5f5a97bafc
4 changed files with 16 additions and 4 deletions

View File

@ -242,12 +242,18 @@ inboundlinks_protocol_sxt
## internal links, the url only without the protocol
inboundlinks_urlstub_sxt
## internal links, the visible anchor text
inboundlinks_anchortext_txt
## external links, only the protocol
outboundlinks_protocol_sxt
## external links, the url only without the protocol
outboundlinks_urlstub_sxt
## external links, the visible anchor text
outboundlinks_anchortext_txt
## all text/words appearing in image alt texts or the tokenized url
images_text_t

View File

@ -824,8 +824,10 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
doc.webgraphDocuments.addAll(subgraph.edges);
if (allAttr || contains(CollectionSchema.inboundlinks_protocol_sxt)) add(doc, CollectionSchema.inboundlinks_protocol_sxt, protocolList2indexedList(subgraph.urlProtocols[0]));
if (allAttr || contains(CollectionSchema.inboundlinks_urlstub_sxt)) add(doc, CollectionSchema.inboundlinks_urlstub_sxt, subgraph.urlStubs[0]);
if (allAttr || contains(CollectionSchema.inboundlinks_anchortext_txt)) add(doc, CollectionSchema.inboundlinks_anchortext_txt, subgraph.urlAnchorTexts[0]);
if (allAttr || contains(CollectionSchema.outboundlinks_protocol_sxt)) add(doc, CollectionSchema.outboundlinks_protocol_sxt, protocolList2indexedList(subgraph.urlProtocols[1]));
if (allAttr || contains(CollectionSchema.outboundlinks_urlstub_sxt)) add(doc, CollectionSchema.outboundlinks_urlstub_sxt, subgraph.urlStubs[1]);
if (allAttr || contains(CollectionSchema.outboundlinks_anchortext_txt)) add(doc, CollectionSchema.outboundlinks_anchortext_txt, subgraph.urlAnchorTexts[1]);
// charset
if (allAttr || contains(CollectionSchema.charset_s)) add(doc, CollectionSchema.charset_s, document.getCharset());

View File

@ -119,9 +119,11 @@ public enum CollectionSchema implements SchemaDeclaration {
robots_i(SolrType.num_integer, true, true, false, false, false, "content of <meta name=\"robots\" content=#content#> tag and the \"X-Robots-Tag\" HTTP property"),
metagenerator_t(SolrType.text_general, true, true, false, false, false, "content of <meta name=\"generator\" content=#content#> tag"),
inboundlinks_protocol_sxt(SolrType.string, true, true, true, false, false, "internal links, only the protocol"),
inboundlinks_urlstub_sxt(SolrType.string, true, true, true, false, false, "internal links, the url only without the protocol"),
inboundlinks_urlstub_sxt(SolrType.string, true, true, true, false, true, "internal links, the url only without the protocol"),
inboundlinks_anchortext_txt(SolrType.text_general, true, true, true, false, true, "internal links, the visible anchor text"),
outboundlinks_protocol_sxt(SolrType.string, true, true, true, false, false, "external links, only the protocol"),
outboundlinks_urlstub_sxt(SolrType.string, true, true, true, false, false, "external links, the url only without the protocol"),
outboundlinks_urlstub_sxt(SolrType.string, true, true, true, false, true, "external links, the url only without the protocol"),
outboundlinks_anchortext_txt(SolrType.text_general, true, true, true, false, true, "external links, the visible anchor text"),
images_text_t(SolrType.text_general, true, true, false, false, true, "all text/words appearing in image alt texts or the tokenized url"),
images_urlstub_sxt(SolrType.string, true, true, true, false, true, "all image links without the protocol and '://'"),

View File

@ -101,12 +101,13 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
}
public static class Subgraph {
public final ArrayList<String>[] urlProtocols, urlStubs;
public final ArrayList<String>[] urlProtocols, urlStubs, urlAnchorTexts;
public final ArrayList<SolrInputDocument> edges;
@SuppressWarnings("unchecked")
public Subgraph(int inboundSize, int outboundSize) {
this.urlProtocols = new ArrayList[]{new ArrayList<String>(inboundSize), new ArrayList<String>(outboundSize)};
this.urlStubs = new ArrayList[]{new ArrayList<String>(inboundSize), new ArrayList<String>(outboundSize)};
this.urlAnchorTexts = new ArrayList[]{new ArrayList<String>(inboundSize), new ArrayList<String>(outboundSize)};
this.edges = new ArrayList<SolrInputDocument>(inboundSize + outboundSize);
}
}
@ -226,8 +227,9 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
final String target_url_string = target_url.toNormalform(false);
int pr_target = target_url_string.indexOf("://",0);
subgraph.urlProtocols[ioidx].add(target_url_string.substring(0, pr_target));
if (allAttr || contains(WebgraphSchema.target_protocol_s)) add(edge, WebgraphSchema.target_protocol_s, target_url_string.substring(0, pr_target));
subgraph.urlStubs[ioidx].add(target_url_string.substring(pr_target + 3));
subgraph.urlAnchorTexts[ioidx].add(text);
if (allAttr || contains(WebgraphSchema.target_protocol_s)) add(edge, WebgraphSchema.target_protocol_s, target_url_string.substring(0, pr_target));
if (allAttr || contains(WebgraphSchema.target_urlstub_s)) add(edge, WebgraphSchema.target_urlstub_s, target_url_string.substring(pr_target + 3));
Map<String, String> target_searchpart = target_url.getSearchpartMap();
if (target_searchpart == null) {