mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
added clickdepth field writing for webgraph core (unfinished)
This commit is contained in:
parent
47114910d5
commit
6b13dd0d3d
|
@ -19,11 +19,14 @@ id
|
|||
last_modified
|
||||
|
||||
## time when resource was loaded
|
||||
load_date_dt
|
||||
#load_date_dt
|
||||
|
||||
## tags that are attached to crawls/index generation to separate the search result into user-defined subsets
|
||||
collection_sxt
|
||||
|
||||
## needed (post-)processing steps on this metadata set, used e.g. for clickdepth computation.
|
||||
#process_sxt
|
||||
|
||||
|
||||
##
|
||||
## url construction information about the source
|
||||
|
|
|
@ -351,15 +351,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
|||
clickdepth = 0;
|
||||
this.lazy = lc;
|
||||
} else {
|
||||
// search the citations for references
|
||||
//try {
|
||||
clickdepth = 999; //getClickDepth(citations, digestURI);
|
||||
//} catch (IOException e) {
|
||||
// add(doc, CollectionSchema.clickdepth_i, 999);
|
||||
//}
|
||||
if (clickdepth < 0 || clickdepth > 1) {
|
||||
processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut
|
||||
}
|
||||
clickdepth = 999;
|
||||
processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut
|
||||
}
|
||||
add(doc, CollectionSchema.clickdepth_i, clickdepth);
|
||||
}
|
||||
|
@ -717,7 +710,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
|||
if (allAttr || contains(CollectionSchema.outboundlinksnofollowcount_i)) add(doc, CollectionSchema.outboundlinksnofollowcount_i, document.outboundLinkNofollowCount());
|
||||
|
||||
// list all links
|
||||
WebgraphConfiguration.Subgraph subgraph = webgraph.edges(digestURI, responseHeader, profile.collections(), clickdepth, document.getAnchors(), images, inboundLinks, outboundLinks);
|
||||
WebgraphConfiguration.Subgraph subgraph = webgraph.edges(digestURI, responseHeader, profile.collections(), clickdepth, document.getAnchors(), images, inboundLinks, outboundLinks, citations);
|
||||
doc.webgraphDocuments.addAll(subgraph.edges);
|
||||
if (allAttr || contains(CollectionSchema.inboundlinks_tag_txt)) add(doc, CollectionSchema.inboundlinks_tag_txt, subgraph.tags[0]); // if inboundlinks_tag_txt can be removed, remove also subgraph.tags
|
||||
if (allAttr || contains(CollectionSchema.inboundlinks_protocol_sxt)) add(doc, CollectionSchema.inboundlinks_protocol_sxt, protocolList2indexedList(subgraph.urlProtocols[0]));
|
||||
|
|
|
@ -30,6 +30,8 @@ import java.io.Serializable;
|
|||
import java.util.ArrayList;
|
||||
import java.util.Date;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Properties;
|
||||
import java.util.Set;
|
||||
|
@ -38,14 +40,17 @@ import org.apache.solr.common.SolrDocument;
|
|||
import org.apache.solr.common.SolrInputDocument;
|
||||
|
||||
import net.yacy.cora.document.ASCII;
|
||||
import net.yacy.cora.federate.solr.ProcessType;
|
||||
import net.yacy.cora.federate.solr.SchemaConfiguration;
|
||||
import net.yacy.cora.federate.solr.SchemaDeclaration;
|
||||
import net.yacy.cora.protocol.Domains;
|
||||
import net.yacy.cora.protocol.ResponseHeader;
|
||||
import net.yacy.cora.util.CommonPattern;
|
||||
import net.yacy.document.parser.html.ImageEntry;
|
||||
import net.yacy.kelondro.data.citation.CitationReference;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.kelondro.logging.Log;
|
||||
import net.yacy.kelondro.rwi.IndexCell;
|
||||
|
||||
public class WebgraphConfiguration extends SchemaConfiguration implements Serializable {
|
||||
|
||||
|
@ -103,25 +108,34 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
|
|||
}
|
||||
|
||||
public Subgraph edges(
|
||||
final DigestURI source, final ResponseHeader responseHeader, String[] collections, int clickdepth,
|
||||
final DigestURI source, final ResponseHeader responseHeader, String[] collections, int clickdepth_source,
|
||||
final Map<DigestURI, Properties> alllinks,
|
||||
final Map<DigestURI, ImageEntry> images,
|
||||
final Set<DigestURI> inboundLinks,
|
||||
final Set<DigestURI> outboundLinks
|
||||
final Set<DigestURI> outboundLinks,
|
||||
IndexCell<CitationReference> citations
|
||||
) {
|
||||
boolean allAttr = this.isEmpty();
|
||||
Subgraph subgraph = new Subgraph(inboundLinks.size(), outboundLinks.size());
|
||||
addEdges(subgraph, source, responseHeader, collections, clickdepth, allAttr, alllinks, images, true, inboundLinks);
|
||||
addEdges(subgraph, source, responseHeader, collections, clickdepth, allAttr, alllinks, images, false, outboundLinks);
|
||||
addEdges(
|
||||
subgraph, source, responseHeader, collections, clickdepth_source,
|
||||
allAttr, alllinks, images, true, inboundLinks, citations);
|
||||
addEdges(
|
||||
subgraph, source, responseHeader, collections, clickdepth_source,
|
||||
allAttr, alllinks, images, false, outboundLinks, citations);
|
||||
return subgraph;
|
||||
}
|
||||
|
||||
private void addEdges(
|
||||
final Subgraph subgraph,
|
||||
final DigestURI source, final ResponseHeader responseHeader, String[] collections, int clickdepth,
|
||||
final DigestURI source, final ResponseHeader responseHeader, String[] collections, int clickdepth_source,
|
||||
final boolean allAttr, final Map<DigestURI, Properties> alllinks, final Map<DigestURI, ImageEntry> images,
|
||||
final boolean inbound, final Set<DigestURI> links) {
|
||||
final boolean inbound, final Set<DigestURI> links,
|
||||
final IndexCell<CitationReference> citations) {
|
||||
for (final DigestURI target_url: links) {
|
||||
|
||||
Set<ProcessType> processTypes = new LinkedHashSet<ProcessType>();
|
||||
|
||||
final Properties p = alllinks.get(target_url);
|
||||
if (p == null) continue;
|
||||
final String name = p.getProperty("name", ""); // the name attribute
|
||||
|
@ -183,7 +197,8 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
|
|||
add(edge, WebgraphSchema.source_path_folders_count_i, paths.length);
|
||||
add(edge, WebgraphSchema.source_path_folders_sxt, paths);
|
||||
}
|
||||
add(edge, WebgraphSchema.source_clickdepth_i, clickdepth);
|
||||
add(edge, WebgraphSchema.source_clickdepth_i, clickdepth_source);
|
||||
if (clickdepth_source < 0 || clickdepth_source > 1) processTypes.add(ProcessType.CLICKDEPTH);
|
||||
|
||||
// add the source attributes about the target
|
||||
if (allAttr || contains(WebgraphSchema.target_inbound_b)) add(edge, WebgraphSchema.target_inbound_b, inbound);
|
||||
|
@ -239,7 +254,23 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
|
|||
add(edge, WebgraphSchema.target_path_folders_count_i, paths.length);
|
||||
add(edge, WebgraphSchema.target_path_folders_sxt, paths);
|
||||
}
|
||||
add(edge, WebgraphSchema.target_clickdepth_i, clickdepth);
|
||||
|
||||
if ((allAttr || contains(WebgraphSchema.target_clickdepth_i)) && citations != null) {
|
||||
if (target_url.probablyRootURL()) {
|
||||
boolean lc = this.lazy; this.lazy = false;
|
||||
add(edge, WebgraphSchema.target_clickdepth_i, 0);
|
||||
this.lazy = lc;
|
||||
} else {
|
||||
add(edge, WebgraphSchema.target_clickdepth_i, 999);
|
||||
processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut
|
||||
}
|
||||
}
|
||||
|
||||
if (allAttr || contains(WebgraphSchema.process_sxt)) {
|
||||
List<String> pr = new ArrayList<String>();
|
||||
for (ProcessType t: processTypes) pr.add(t.name());
|
||||
add(edge, WebgraphSchema.process_sxt, pr);
|
||||
}
|
||||
|
||||
// add the edge to the subgraph
|
||||
subgraph.edges.add(edge);
|
||||
|
|
|
@ -35,6 +35,7 @@ public enum WebgraphSchema implements SchemaDeclaration {
|
|||
last_modified(SolrType.date, true, true, false, false, false, "last-modified from http header"),
|
||||
load_date_dt(SolrType.date, true, true, false, false, false, "time when resource was loaded"),
|
||||
collection_sxt(SolrType.string, true, true, true, false, false, "tags that are attached to crawls/index generation to separate the search result into user-defined subsets"),
|
||||
process_sxt(SolrType.string, true, true, true, false, false, "needed (post-)processing steps on this metadata set, used i.e. for clickdepth-computation."),
|
||||
|
||||
// source information
|
||||
source_id_s(SolrType.string, true, true, false, false, false, "primary key of document, the URL hash (source)"),
|
||||
|
|
Loading…
Reference in New Issue
Block a user