mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
Added the anchor text within web pages to the searchable entities of a
web page. This can be of benefit for the ranking if these fields are used for boosts.
This commit is contained in:
parent
705b3338ee
commit
5f5a97bafc
|
@ -242,12 +242,18 @@ inboundlinks_protocol_sxt
|
|||
## internal links, the url only without the protocol
|
||||
inboundlinks_urlstub_sxt
|
||||
|
||||
## internal links, the visible anchor text
|
||||
inboundlinks_anchortext_txt
|
||||
|
||||
## external links, only the protocol
|
||||
outboundlinks_protocol_sxt
|
||||
|
||||
## external links, the url only without the protocol
|
||||
outboundlinks_urlstub_sxt
|
||||
|
||||
## external links, the visible anchor text
|
||||
outboundlinks_anchortext_txt
|
||||
|
||||
## all text/words appearing in image alt texts or the tokenized url
|
||||
images_text_t
|
||||
|
||||
|
|
|
@ -824,8 +824,10 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
|||
doc.webgraphDocuments.addAll(subgraph.edges);
|
||||
if (allAttr || contains(CollectionSchema.inboundlinks_protocol_sxt)) add(doc, CollectionSchema.inboundlinks_protocol_sxt, protocolList2indexedList(subgraph.urlProtocols[0]));
|
||||
if (allAttr || contains(CollectionSchema.inboundlinks_urlstub_sxt)) add(doc, CollectionSchema.inboundlinks_urlstub_sxt, subgraph.urlStubs[0]);
|
||||
if (allAttr || contains(CollectionSchema.inboundlinks_anchortext_txt)) add(doc, CollectionSchema.inboundlinks_anchortext_txt, subgraph.urlAnchorTexts[0]);
|
||||
if (allAttr || contains(CollectionSchema.outboundlinks_protocol_sxt)) add(doc, CollectionSchema.outboundlinks_protocol_sxt, protocolList2indexedList(subgraph.urlProtocols[1]));
|
||||
if (allAttr || contains(CollectionSchema.outboundlinks_urlstub_sxt)) add(doc, CollectionSchema.outboundlinks_urlstub_sxt, subgraph.urlStubs[1]);
|
||||
if (allAttr || contains(CollectionSchema.outboundlinks_anchortext_txt)) add(doc, CollectionSchema.outboundlinks_anchortext_txt, subgraph.urlAnchorTexts[1]);
|
||||
|
||||
// charset
|
||||
if (allAttr || contains(CollectionSchema.charset_s)) add(doc, CollectionSchema.charset_s, document.getCharset());
|
||||
|
|
|
@ -119,9 +119,11 @@ public enum CollectionSchema implements SchemaDeclaration {
|
|||
robots_i(SolrType.num_integer, true, true, false, false, false, "content of <meta name=\"robots\" content=#content#> tag and the \"X-Robots-Tag\" HTTP property"),
|
||||
metagenerator_t(SolrType.text_general, true, true, false, false, false, "content of <meta name=\"generator\" content=#content#> tag"),
|
||||
inboundlinks_protocol_sxt(SolrType.string, true, true, true, false, false, "internal links, only the protocol"),
|
||||
inboundlinks_urlstub_sxt(SolrType.string, true, true, true, false, false, "internal links, the url only without the protocol"),
|
||||
inboundlinks_urlstub_sxt(SolrType.string, true, true, true, false, true, "internal links, the url only without the protocol"),
|
||||
inboundlinks_anchortext_txt(SolrType.text_general, true, true, true, false, true, "internal links, the visible anchor text"),
|
||||
outboundlinks_protocol_sxt(SolrType.string, true, true, true, false, false, "external links, only the protocol"),
|
||||
outboundlinks_urlstub_sxt(SolrType.string, true, true, true, false, false, "external links, the url only without the protocol"),
|
||||
outboundlinks_urlstub_sxt(SolrType.string, true, true, true, false, true, "external links, the url only without the protocol"),
|
||||
outboundlinks_anchortext_txt(SolrType.text_general, true, true, true, false, true, "external links, the visible anchor text"),
|
||||
|
||||
images_text_t(SolrType.text_general, true, true, false, false, true, "all text/words appearing in image alt texts or the tokenized url"),
|
||||
images_urlstub_sxt(SolrType.string, true, true, true, false, true, "all image links without the protocol and '://'"),
|
||||
|
|
|
@ -101,12 +101,13 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
|
|||
}
|
||||
|
||||
/**
 * Container for per-link data, kept in parallel two-slot arrays plus a list of edge documents.
 * In every array, slot 0 is presized with {@code inboundSize} and slot 1 with {@code outboundSize}.
 * NOTE(review): this block is shown as a diff, so both the old and the new field declaration appear below.
 */
public static class Subgraph {
|
||||
// declaration before this commit (without urlAnchorTexts)
public final ArrayList<String>[] urlProtocols, urlStubs;
|
||||
// declaration after this commit: urlAnchorTexts is added alongside urlProtocols and urlStubs
public final ArrayList<String>[] urlProtocols, urlStubs, urlAnchorTexts;
|
||||
// edge documents; presized for inbound plus outbound entries
public final ArrayList<SolrInputDocument> edges;
|
||||
// generic array creation needs a raw ArrayList[] which is assigned to ArrayList<String>[]
@SuppressWarnings("unchecked")
|
||||
/**
 * Creates empty lists for all fields.
 * @param inboundSize initial capacity of the lists in slot 0
 * @param outboundSize initial capacity of the lists in slot 1
 */
public Subgraph(int inboundSize, int outboundSize) {
|
||||
this.urlProtocols = new ArrayList[]{new ArrayList<String>(inboundSize), new ArrayList<String>(outboundSize)};
|
||||
this.urlStubs = new ArrayList[]{new ArrayList<String>(inboundSize), new ArrayList<String>(outboundSize)};
|
||||
// same two-slot layout as urlProtocols and urlStubs
this.urlAnchorTexts = new ArrayList[]{new ArrayList<String>(inboundSize), new ArrayList<String>(outboundSize)};
|
||||
this.edges = new ArrayList<SolrInputDocument>(inboundSize + outboundSize);
|
||||
}
|
||||
}
|
||||
|
@ -226,8 +227,9 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
|
|||
final String target_url_string = target_url.toNormalform(false);
|
||||
int pr_target = target_url_string.indexOf("://",0);
|
||||
subgraph.urlProtocols[ioidx].add(target_url_string.substring(0, pr_target));
|
||||
if (allAttr || contains(WebgraphSchema.target_protocol_s)) add(edge, WebgraphSchema.target_protocol_s, target_url_string.substring(0, pr_target));
|
||||
subgraph.urlStubs[ioidx].add(target_url_string.substring(pr_target + 3));
|
||||
subgraph.urlAnchorTexts[ioidx].add(text);
|
||||
if (allAttr || contains(WebgraphSchema.target_protocol_s)) add(edge, WebgraphSchema.target_protocol_s, target_url_string.substring(0, pr_target));
|
||||
if (allAttr || contains(WebgraphSchema.target_urlstub_s)) add(edge, WebgraphSchema.target_urlstub_s, target_url_string.substring(pr_target + 3));
|
||||
Map<String, String> target_searchpart = target_url.getSearchpartMap();
|
||||
if (target_searchpart == null) {
|
||||
|
|
Loading…
Reference in New Issue
Block a user