From 6b13dd0d3dedd467765098e6382806cf6ece13e0 Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 14 Mar 2013 01:35:38 +0100 Subject: [PATCH] added clickdepth field writing for webgraph core (unfinished) --- defaults/solr.webgraph.schema | 5 +- .../schema/CollectionConfiguration.java | 13 ++--- .../search/schema/WebgraphConfiguration.java | 47 +++++++++++++++---- .../yacy/search/schema/WebgraphSchema.java | 1 + 4 files changed, 47 insertions(+), 19 deletions(-) diff --git a/defaults/solr.webgraph.schema b/defaults/solr.webgraph.schema index ed9a22d9d..f7fb37f76 100644 --- a/defaults/solr.webgraph.schema +++ b/defaults/solr.webgraph.schema @@ -19,11 +19,14 @@ id last_modified ## time when resource was loaded -load_date_dt +#load_date_dt ## tags that are attached to crawls/index generation to separate the search result into user-defined subsets collection_sxt +## needed (post-)processing steps on this metadata set, used i.e. for clickdepth-computation. +#process_sxt + ## ## url construction information about the source diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index 993b21e9f..7ab4528d9 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -351,15 +351,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri clickdepth = 0; this.lazy = lc; } else { - // search the citations for references - //try { - clickdepth = 999; //getClickDepth(citations, digestURI); - //} catch (IOException e) { - // add(doc, CollectionSchema.clickdepth_i, 999); - //} - if (clickdepth < 0 || clickdepth > 1) { - processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut - } + clickdepth = 999; + processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut } add(doc, CollectionSchema.clickdepth_i, clickdepth); } @@ -717,7 +710,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri if (allAttr || contains(CollectionSchema.outboundlinksnofollowcount_i)) add(doc, CollectionSchema.outboundlinksnofollowcount_i, document.outboundLinkNofollowCount()); // list all links - WebgraphConfiguration.Subgraph subgraph = webgraph.edges(digestURI, responseHeader, profile.collections(), clickdepth, document.getAnchors(), images, inboundLinks, outboundLinks); + WebgraphConfiguration.Subgraph subgraph = webgraph.edges(digestURI, responseHeader, profile.collections(), clickdepth, document.getAnchors(), images, inboundLinks, outboundLinks, citations); doc.webgraphDocuments.addAll(subgraph.edges); if (allAttr || contains(CollectionSchema.inboundlinks_tag_txt)) add(doc, CollectionSchema.inboundlinks_tag_txt, subgraph.tags[0]); // if inboundlinks_tag_txt can be removed, remove also subgraph.tags if (allAttr || contains(CollectionSchema.inboundlinks_protocol_sxt)) add(doc, CollectionSchema.inboundlinks_protocol_sxt, protocolList2indexedList(subgraph.urlProtocols[0])); diff --git a/source/net/yacy/search/schema/WebgraphConfiguration.java b/source/net/yacy/search/schema/WebgraphConfiguration.java index 68420e970..c7fb52110 100644 --- a/source/net/yacy/search/schema/WebgraphConfiguration.java +++ b/source/net/yacy/search/schema/WebgraphConfiguration.java @@ -30,6 +30,8 @@ import java.io.Serializable; import java.util.ArrayList; import java.util.Date; import java.util.Iterator; +import java.util.LinkedHashSet; +import java.util.List; import java.util.Map; import java.util.Properties; import java.util.Set; @@ -38,14 +40,17 @@ import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrInputDocument; import net.yacy.cora.document.ASCII; +import net.yacy.cora.federate.solr.ProcessType; import net.yacy.cora.federate.solr.SchemaConfiguration; import net.yacy.cora.federate.solr.SchemaDeclaration; import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.ResponseHeader; import net.yacy.cora.util.CommonPattern; import net.yacy.document.parser.html.ImageEntry; +import net.yacy.kelondro.data.citation.CitationReference; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; +import net.yacy.kelondro.rwi.IndexCell; public class WebgraphConfiguration extends SchemaConfiguration implements Serializable { @@ -103,25 +108,34 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial } public Subgraph edges( - final DigestURI source, final ResponseHeader responseHeader, String[] collections, int clickdepth, + final DigestURI source, final ResponseHeader responseHeader, String[] collections, int clickdepth_source, final Map alllinks, final Map images, final Set inboundLinks, - final Set outboundLinks + final Set outboundLinks, + IndexCell citations ) { boolean allAttr = this.isEmpty(); Subgraph subgraph = new Subgraph(inboundLinks.size(), outboundLinks.size()); - addEdges(subgraph, source, responseHeader, collections, clickdepth, allAttr, alllinks, images, true, inboundLinks); - addEdges(subgraph, source, responseHeader, collections, clickdepth, allAttr, alllinks, images, false, outboundLinks); + addEdges( + subgraph, source, responseHeader, collections, clickdepth_source, + allAttr, alllinks, images, true, inboundLinks, citations); + addEdges( + subgraph, source, responseHeader, collections, clickdepth_source, + allAttr, alllinks, images, false, outboundLinks, citations); return subgraph; } private void addEdges( final Subgraph subgraph, - final DigestURI source, final ResponseHeader responseHeader, String[] collections, int clickdepth, + final DigestURI source, final ResponseHeader responseHeader, String[] collections, int clickdepth_source, final boolean allAttr, final Map alllinks, final Map images, - final boolean inbound, final Set links) { + final boolean inbound, final Set links, + final IndexCell citations) { for (final DigestURI target_url: links) { + + Set processTypes = new LinkedHashSet(); + final Properties p = alllinks.get(target_url); if (p == null) continue; final String name = p.getProperty("name", ""); // the name attribute @@ -183,7 +197,8 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial add(edge, WebgraphSchema.source_path_folders_count_i, paths.length); add(edge, WebgraphSchema.source_path_folders_sxt, paths); } - add(edge, WebgraphSchema.source_clickdepth_i, clickdepth); + add(edge, WebgraphSchema.source_clickdepth_i, clickdepth_source); + if (clickdepth_source < 0 || clickdepth_source > 1) processTypes.add(ProcessType.CLICKDEPTH); // add the source attributes about the target if (allAttr || contains(WebgraphSchema.target_inbound_b)) add(edge, WebgraphSchema.target_inbound_b, inbound); @@ -239,7 +254,23 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial add(edge, WebgraphSchema.target_path_folders_count_i, paths.length); add(edge, WebgraphSchema.target_path_folders_sxt, paths); } - add(edge, WebgraphSchema.target_clickdepth_i, clickdepth); + + if ((allAttr || contains(WebgraphSchema.target_clickdepth_i)) && citations != null) { + if (target_url.probablyRootURL()) { + boolean lc = this.lazy; this.lazy = false; + add(edge, WebgraphSchema.target_clickdepth_i, 0); + this.lazy = lc; + } else { + add(edge, WebgraphSchema.target_clickdepth_i, 999); + processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut + } + } + + if (allAttr || contains(WebgraphSchema.process_sxt)) { + List pr = new ArrayList(); + for (ProcessType t: processTypes) pr.add(t.name()); + add(edge, WebgraphSchema.process_sxt, pr); + } // add the edge to the subgraph subgraph.edges.add(edge); diff --git a/source/net/yacy/search/schema/WebgraphSchema.java b/source/net/yacy/search/schema/WebgraphSchema.java index 90d618744..4ff02e6a1 100644 --- a/source/net/yacy/search/schema/WebgraphSchema.java +++ b/source/net/yacy/search/schema/WebgraphSchema.java @@ -35,6 +35,7 @@ public enum WebgraphSchema implements SchemaDeclaration { last_modified(SolrType.date, true, true, false, false, false, "last-modified from http header"), load_date_dt(SolrType.date, true, true, false, false, false, "time when resource was loaded"), collection_sxt(SolrType.string, true, true, true, false, false, "tags that are attached to crawls/index generation to separate the search result into user-defined subsets"), + process_sxt(SolrType.string, true, true, true, false, false, "needed (post-)processing steps on this metadata set, used i.e. for clickdepth-computation."), // source information source_id_s(SolrType.string, true, true, false, false, false, "primary key of document, the URL hash (source)"),