added clickdepth field writing for webgraph core (unfinished)

This commit is contained in:
orbiter 2013-03-14 01:35:38 +01:00
parent 47114910d5
commit 6b13dd0d3d
4 changed files with 47 additions and 19 deletions

View File

@ -19,11 +19,14 @@ id
last_modified
## time when resource was loaded
load_date_dt
#load_date_dt
## tags that are attached to crawls/index generation to separate the search result into user-defined subsets
collection_sxt
## needed (post-)processing steps on this metadata set, used e.g. for clickdepth-computation.
#process_sxt
##
## url construction information about the source

View File

@ -351,15 +351,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
clickdepth = 0;
this.lazy = lc;
} else {
// search the citations for references
//try {
clickdepth = 999; //getClickDepth(citations, digestURI);
//} catch (IOException e) {
// add(doc, CollectionSchema.clickdepth_i, 999);
//}
if (clickdepth < 0 || clickdepth > 1) {
processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut
}
clickdepth = 999;
processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut
}
add(doc, CollectionSchema.clickdepth_i, clickdepth);
}
@ -717,7 +710,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
if (allAttr || contains(CollectionSchema.outboundlinksnofollowcount_i)) add(doc, CollectionSchema.outboundlinksnofollowcount_i, document.outboundLinkNofollowCount());
// list all links
WebgraphConfiguration.Subgraph subgraph = webgraph.edges(digestURI, responseHeader, profile.collections(), clickdepth, document.getAnchors(), images, inboundLinks, outboundLinks);
WebgraphConfiguration.Subgraph subgraph = webgraph.edges(digestURI, responseHeader, profile.collections(), clickdepth, document.getAnchors(), images, inboundLinks, outboundLinks, citations);
doc.webgraphDocuments.addAll(subgraph.edges);
if (allAttr || contains(CollectionSchema.inboundlinks_tag_txt)) add(doc, CollectionSchema.inboundlinks_tag_txt, subgraph.tags[0]); // if inboundlinks_tag_txt can be removed, remove also subgraph.tags
if (allAttr || contains(CollectionSchema.inboundlinks_protocol_sxt)) add(doc, CollectionSchema.inboundlinks_protocol_sxt, protocolList2indexedList(subgraph.urlProtocols[0]));

View File

@ -30,6 +30,8 @@ import java.io.Serializable;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
@ -38,14 +40,17 @@ import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrInputDocument;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.federate.solr.ProcessType;
import net.yacy.cora.federate.solr.SchemaConfiguration;
import net.yacy.cora.federate.solr.SchemaDeclaration;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.util.CommonPattern;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.data.citation.CitationReference;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.rwi.IndexCell;
public class WebgraphConfiguration extends SchemaConfiguration implements Serializable {
@ -103,25 +108,34 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
}
/**
 * Builds the webgraph subgraph for one source document.
 *
 * A new Subgraph is created, sized by the number of inbound and outbound links,
 * and addEdges is called once for the inbound link set and once for the outbound
 * link set. The filled subgraph is returned.
 *
 * NOTE(review): this span is shown as a diff hunk, so superseded and replacement
 * lines (parameter lines and addEdges calls) appear side by side.
 */
public Subgraph edges(
final DigestURI source, final ResponseHeader responseHeader, String[] collections, int clickdepth,
final DigestURI source, final ResponseHeader responseHeader, String[] collections, int clickdepth_source,
final Map<DigestURI, Properties> alllinks,
final Map<DigestURI, ImageEntry> images,
final Set<DigestURI> inboundLinks,
final Set<DigestURI> outboundLinks
final Set<DigestURI> outboundLinks,
IndexCell<CitationReference> citations
) {
// an empty configuration is treated as "write all attributes" (allAttr is OR-ed with contains(...) checks in addEdges)
boolean allAttr = this.isEmpty();
Subgraph subgraph = new Subgraph(inboundLinks.size(), outboundLinks.size());
addEdges(subgraph, source, responseHeader, collections, clickdepth, allAttr, alllinks, images, true, inboundLinks);
addEdges(subgraph, source, responseHeader, collections, clickdepth, allAttr, alllinks, images, false, outboundLinks);
// first pass: inbound links (inbound flag = true)
addEdges(
subgraph, source, responseHeader, collections, clickdepth_source,
allAttr, alllinks, images, true, inboundLinks, citations);
// second pass: outbound links (inbound flag = false)
addEdges(
subgraph, source, responseHeader, collections, clickdepth_source,
allAttr, alllinks, images, false, outboundLinks, citations);
return subgraph;
}
private void addEdges(
final Subgraph subgraph,
final DigestURI source, final ResponseHeader responseHeader, String[] collections, int clickdepth,
final DigestURI source, final ResponseHeader responseHeader, String[] collections, int clickdepth_source,
final boolean allAttr, final Map<DigestURI, Properties> alllinks, final Map<DigestURI, ImageEntry> images,
final boolean inbound, final Set<DigestURI> links) {
final boolean inbound, final Set<DigestURI> links,
final IndexCell<CitationReference> citations) {
for (final DigestURI target_url: links) {
Set<ProcessType> processTypes = new LinkedHashSet<ProcessType>();
final Properties p = alllinks.get(target_url);
if (p == null) continue;
final String name = p.getProperty("name", ""); // the name attribute
@ -183,7 +197,8 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
add(edge, WebgraphSchema.source_path_folders_count_i, paths.length);
add(edge, WebgraphSchema.source_path_folders_sxt, paths);
}
add(edge, WebgraphSchema.source_clickdepth_i, clickdepth);
add(edge, WebgraphSchema.source_clickdepth_i, clickdepth_source);
if (clickdepth_source < 0 || clickdepth_source > 1) processTypes.add(ProcessType.CLICKDEPTH);
// add the source attributes about the target
if (allAttr || contains(WebgraphSchema.target_inbound_b)) add(edge, WebgraphSchema.target_inbound_b, inbound);
@ -239,7 +254,23 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
add(edge, WebgraphSchema.target_path_folders_count_i, paths.length);
add(edge, WebgraphSchema.target_path_folders_sxt, paths);
}
add(edge, WebgraphSchema.target_clickdepth_i, clickdepth);
if ((allAttr || contains(WebgraphSchema.target_clickdepth_i)) && citations != null) {
if (target_url.probablyRootURL()) {
boolean lc = this.lazy; this.lazy = false;
add(edge, WebgraphSchema.target_clickdepth_i, 0);
this.lazy = lc;
} else {
add(edge, WebgraphSchema.target_clickdepth_i, 999);
processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut
}
}
if (allAttr || contains(WebgraphSchema.process_sxt)) {
List<String> pr = new ArrayList<String>();
for (ProcessType t: processTypes) pr.add(t.name());
add(edge, WebgraphSchema.process_sxt, pr);
}
// add the edge to the subgraph
subgraph.edges.add(edge);

View File

@ -35,6 +35,7 @@ public enum WebgraphSchema implements SchemaDeclaration {
last_modified(SolrType.date, true, true, false, false, false, "last-modified from http header"),
load_date_dt(SolrType.date, true, true, false, false, false, "time when resource was loaded"),
collection_sxt(SolrType.string, true, true, true, false, false, "tags that are attached to crawls/index generation to separate the search result into user-defined subsets"),
process_sxt(SolrType.string, true, true, true, false, false, "needed (post-)processing steps on this metadata set, used i.e. for clickdepth-computation."),
// source information
source_id_s(SolrType.string, true, true, false, false, false, "primary key of document, the URL hash (source)"),