removed clickdepth_i field and related postprocessing. This information is now available in the crawldepth_i field, which is identical to clickdepth_i because of a specific crawler strategy.
Michael Peter Christen 2014-04-16 22:16:20 +02:00
parent da86f150ab
commit 9a5ab4e2c1
14 changed files with 33 additions and 243 deletions
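
The equivalence claimed above holds when a crawl is started at the host's root document and proceeds breadth-first: every page is then first reached along a shortest link path, so the number of crawler steps equals the click distance from the main page. A minimal, self-contained sketch of that argument (illustrative only, not YaCy code; the page graph and class name are made up):

import java.util.*;

public class DepthEquivalenceSketch {
    // hypothetical host graph: page -> outgoing links
    static final Map<String, List<String>> LINKS = Map.of(
            "/", List.of("/about", "/products"),
            "/about", List.of("/team"),
            "/products", List.of("/products/a", "/team"),
            "/team", List.of(),
            "/products/a", List.of());

    public static void main(String[] args) {
        Map<String, Integer> crawldepth = new HashMap<>();
        Deque<String> frontier = new ArrayDeque<>();
        crawldepth.put("/", 0);          // crawl starts at the root document
        frontier.add("/");
        while (!frontier.isEmpty()) {    // breadth-first: depth grows by one per hop
            String page = frontier.poll();
            for (String next : LINKS.getOrDefault(page, List.of())) {
                if (!crawldepth.containsKey(next)) {
                    crawldepth.put(next, crawldepth.get(page) + 1);
                    frontier.add(next);
                }
            }
        }
        // crawldepth now holds the same values a click-depth computation would produce
        crawldepth.forEach((p, d) -> System.out.println(p + " -> depth " + d));
    }
}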

View File

@ -84,10 +84,7 @@ references_external_i
## number of external hosts which provide http references
references_exthosts_i
## depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url
clickdepth_i
## crawl depth of web page according to the number of steps that the crawler did to get to this document; if the crawl was started at a root document, then this is the maximum of clickdepth_i
## crawl depth of web page according to the number of steps that the crawler did to get to this document; if the crawl was started at a root document, then this is equal to the clickdepth
crawldepth_i
## needed (post-)processing steps on this metadata set

View File

@ -24,7 +24,7 @@ last_modified
## tags that are attached to crawls/index generation to separate the search result into user-defined subsets
collection_sxt
## needed (post-)processing steps on this metadata set, used i.e. for clickdepth-computation.
## needed (post-)processing steps on this metadata set
#process_sxt
## key from a harvest process (i.e. the crawl profile hash key) which is needed for near-realtime postprocessing. This shall be deleted as soon as postprocessing has been terminated.
@ -72,7 +72,7 @@ source_id_s
#source_parameter_value_sxt
## depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (source)
#source_clickdepth_i
#source_crawldepth_i
## copy of the citation rank norm value from the source link
#source_cr_host_norm_i
@ -173,7 +173,7 @@ target_path_folders_sxt
#target_parameter_value_sxt
## depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (target)
#target_clickdepth_i
#target_crawldepth_i
## copy of the citation rank norm value from the target link; this is only filled if the target host is identical to the source host
#target_cr_host_norm_i

View File

@ -1218,7 +1218,3 @@ browser.load4everyone = false
# with some heuristics like: loading linked documents and adding a twitter search.
# When the learning mode is finished, the user may switch on individual heuristics by himself.
greedylearning.active = true
# postprocessing parametrization
postprocessing.clickdepth.maxtime = 100
postprocessing.clickdepth.maxdepth = 6

View File

@ -297,7 +297,6 @@ public class HostBrowser {
CollectionSchema.inboundlinks_urlstub_sxt.getSolrFieldName(),
CollectionSchema.outboundlinks_protocol_sxt.getSolrFieldName(),
CollectionSchema.outboundlinks_urlstub_sxt.getSolrFieldName(),
CollectionSchema.clickdepth_i.getSolrFieldName(),
CollectionSchema.crawldepth_i.getSolrFieldName(),
CollectionSchema.references_i.getSolrFieldName(),
CollectionSchema.references_internal_i.getSolrFieldName(),
@ -564,18 +563,16 @@ public class HostBrowser {
public static final class InfoCacheEntry {
public Integer cr_n;
public Double cr_c;
public int clickdepth, crawldepth, references, references_internal, references_external, references_exthosts;
public int crawldepth, references, references_internal, references_external, references_exthosts;
public List<String> references_internal_urls, references_external_urls;
public InfoCacheEntry(final Fulltext fulltext, final ReferenceReportCache rrCache, final SolrDocument doc, final String urlhash, boolean fetchReferences) {
this.cr_c = (Double) doc.getFieldValue(CollectionSchema.cr_host_chance_d.getSolrFieldName());
this.cr_n = (Integer) doc.getFieldValue(CollectionSchema.cr_host_norm_i.getSolrFieldName());
Integer cd = (Integer) doc.getFieldValue(CollectionSchema.clickdepth_i.getSolrFieldName());
Integer cr = (Integer) doc.getFieldValue(CollectionSchema.crawldepth_i.getSolrFieldName());
Integer rc = (Integer) doc.getFieldValue(CollectionSchema.references_i.getSolrFieldName());
Integer rc_internal = (Integer) doc.getFieldValue(CollectionSchema.references_internal_i.getSolrFieldName());
Integer rc_external = (Integer) doc.getFieldValue(CollectionSchema.references_external_i.getSolrFieldName());
Integer rc_exthosts = (Integer) doc.getFieldValue(CollectionSchema.references_exthosts_i.getSolrFieldName());
this.clickdepth = (cd == null || cd.intValue() < 0) ? 999 : cd.intValue();
this.crawldepth = (cr == null || cr.intValue() < 0) ? 999 : cr.intValue();
this.references = (rc == null || rc.intValue() <= 0) ? 0 : rc.intValue();
this.references_internal = (rc_internal == null || rc_internal.intValue() <= 0) ? 0 : rc_internal.intValue();
@ -628,7 +625,6 @@ public class HostBrowser {
}
if (sbe.length() > 0) sbe.insert(0, "<br/>external referrer:</br>");
return
(this.clickdepth >= 0 ? "clickdepth: " + this.clickdepth : "") +
(this.crawldepth >= 0 ? ", crawldepth: " + this.crawldepth : "") +
(this.cr_c != null ? ", cr=" + (Math.round(this.cr_c * 1000.0d) / 1000.0d) : "") +
(this.cr_n != null ? ", crn=" + this.cr_n : "") +

View File

@ -81,7 +81,7 @@ public class RankingSolr_p {
}
}
if (post != null && post.containsKey("ResetBQ")) {
String bq = "clickdepth_i:0^0.8 clickdepth_i:1^0.4";
String bq = "crawldepth_i:0^0.8 crawldepth_i:1^0.4";
if (bq != null) {
sb.setConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_COLLECTION_BOOSTQUERY_ + profileNr, bq);
sb.index.fulltext().getDefaultConfiguration().getRanking(profileNr).setBoostQuery(bq);
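
For context, the new default boost query is an ordinary Solr edismax parameter and can be tried against any collection that carries crawldepth_i. A hedged SolrJ sketch (not part of this commit; the Solr URL, core name and query term are placeholders, and the client API is from a recent SolrJ release):

import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.response.QueryResponse;

public class CrawldepthBoostExample {
    public static void main(String[] args) throws Exception {
        HttpSolrClient solr = new HttpSolrClient.Builder("http://localhost:8983/solr/collection1").build();
        SolrQuery q = new SolrQuery("yacy");
        q.set("defType", "edismax");
        // same boost query as the new default: rank documents near the crawl start higher
        q.set("bq", "crawldepth_i:0^0.8 crawldepth_i:1^0.4");
        QueryResponse rsp = solr.query(q);
        System.out.println("hits: " + rsp.getResults().getNumFound());
        solr.close();
    }
}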

View File

@ -291,10 +291,6 @@ public class DigestURL extends MultiProtocolURL implements Serializable {
public final static Pattern rootPattern = Pattern.compile("/|/\\?|/index.htm(l?)|/index.php|/home.htm(l?)|/home.php|/default.htm(l?)|/default.php");
public final boolean probablyRootURL() {
return this.path.length() <= 1 || rootPattern.matcher(this.path).matches();
}
private static final String hosthash5(final String protocol, final String host, final int port) {
if (host == null) {
return Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(protocol)).substring(0, 5);
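
The rootPattern kept here is what decides whether a document counts as a host's main page. A small standalone check that reuses the pattern and the probablyRootURL() logic verbatim (illustrative only; the sample paths are made up):

import java.util.regex.Pattern;

public class RootPatternCheck {
    static final Pattern rootPattern = Pattern.compile("/|/\\?|/index.htm(l?)|/index.php|/home.htm(l?)|/home.php|/default.htm(l?)|/default.php");

    public static void main(String[] args) {
        for (String path : new String[] {"/", "/index.html", "/home.php", "/products/a.html"}) {
            boolean root = path.length() <= 1 || rootPattern.matcher(path).matches();
            System.out.println(path + " -> probablyRootURL: " + root);  // the last path prints false
        }
    }
}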

View File

@ -26,6 +26,6 @@ package net.yacy.cora.federate.solr;
*/
public enum ProcessType {
CLICKDEPTH, CITATION, UNIQUE;
CITATION, UNIQUE;
}

View File

@ -41,7 +41,6 @@ import net.yacy.cora.storage.Configuration;
import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.search.index.Segment;
import net.yacy.search.index.Segment.ClickdepthCache;
import net.yacy.search.index.Segment.ReferenceReport;
import net.yacy.search.index.Segment.ReferenceReportCache;
import net.yacy.search.schema.CollectionSchema;
@ -178,21 +177,6 @@ public class SchemaConfiguration extends Configuration implements Serializable {
return changed;
}
public boolean postprocessing_clickdepth(final ClickdepthCache clickdepthCache, final SolrInputDocument sid, final DigestURL url, final SchemaDeclaration clickdepthfield) {
// get new click depth and compare with old
Integer oldclickdepth = (Integer) sid.getFieldValue(clickdepthfield.getSolrFieldName());
if (oldclickdepth != null && oldclickdepth.intValue() != 999) return false; // we do not want to compute that again
try {
int clickdepth = clickdepthCache.getClickdepth(url);
if (oldclickdepth == null || oldclickdepth.intValue() != clickdepth) {
sid.setField(clickdepthfield.getSolrFieldName(), clickdepth);
return true;
}
} catch (final IOException e) {
}
return false;
}
public boolean postprocessing_references(final ReferenceReportCache rrCache, final SolrInputDocument sid, final DigestURL url, final Map<String, Long> hostExtentCount) {
if (!(this.contains(CollectionSchema.references_i) ||
this.contains(CollectionSchema.references_internal_i) ||

View File

@ -190,7 +190,6 @@ import net.yacy.repository.FilterEngine;
import net.yacy.repository.LoaderDispatcher;
import net.yacy.search.index.Fulltext;
import net.yacy.search.index.Segment;
import net.yacy.search.index.Segment.ClickdepthCache;
import net.yacy.search.index.Segment.ReferenceReportCache;
import net.yacy.search.query.AccessTracker;
import net.yacy.search.query.SearchEvent;
@ -484,9 +483,9 @@ public final class Switchboard extends serverSwitch {
String bq = this.getConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_COLLECTION_BOOSTQUERY_ + i, "");
String bf = this.getConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_COLLECTION_BOOSTFUNCTION_ + i, "");
// apply some hard-coded patches for earlier experiments we do not want any more
if (bf.equals("product(recip(rord(last_modified),1,1000,1000),div(product(log(product(references_external_i,references_exthosts_i)),div(references_internal_i,host_extent_i)),add(clickdepth_i,1)))") ||
if (bf.equals("product(recip(rord(last_modified),1,1000,1000),div(product(log(product(references_external_i,references_exthosts_i)),div(references_internal_i,host_extent_i)),add(crawldepth_i,1)))") ||
bf.equals("scale(cr_host_norm_i,1,20)")) bf = "";
if (i == 0 && bq.equals("fuzzy_signature_unique_b:true^100000.0")) bq = "clickdepth_i:0^0.8 clickdepth_i:1^0.4";
if (i == 0 && bq.equals("fuzzy_signature_unique_b:true^100000.0")) bq = "crawldepth_i:0^0.8 crawldepth_i:1^0.4";
if (boosts.equals("url_paths_sxt^1000.0,synonyms_sxt^1.0,title^10000.0,text_t^2.0,h1_txt^1000.0,h2_txt^100.0,host_organization_s^100000.0")) boosts = "url_paths_sxt^3.0,synonyms_sxt^0.5,title^5.0,text_t^1.0,host_s^6.0,h1_txt^5.0,url_file_name_tokens_t^4.0,h2_txt^2.0";
r.setName(name);
r.updateBoosts(boosts);
@ -2307,9 +2306,6 @@ public final class Switchboard extends serverSwitch {
// we optimize first because that is useful for postprocessing
ReferenceReportCache rrCache = index.getReferenceReportCache();
int clickdepth_maxtime = this.getConfigInt("postprocessing.clickdepth.maxtime", 100);
int clickdepth_maxdepth = this.getConfigInt("postprocessing.clickdepth.maxdepth", 6);
ClickdepthCache clickdepthCache = index.getClickdepthCache(rrCache, clickdepth_maxtime, clickdepth_maxdepth);
Set<String> deletionCandidates = collection1Configuration.contains(CollectionSchema.harvestkey_s.getSolrFieldName()) ?
this.crawler.getFinishesProfiles(this.crawlQueues) : new HashSet<String>();
int cleanupByHarvestkey = deletionCandidates.size();
@ -2320,7 +2316,7 @@ public final class Switchboard extends serverSwitch {
postprocessingRunning = true;
postprocessingStartTime[0] = System.currentTimeMillis();
try {postprocessingCount[0] = (int) fulltext.getDefaultConnector().getCountByQuery(CollectionSchema.process_sxt.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM);} catch (IOException e) {}
for (String profileHash: deletionCandidates) proccount += collection1Configuration.postprocessing(index, rrCache, clickdepthCache, profileHash);
for (String profileHash: deletionCandidates) proccount += collection1Configuration.postprocessing(index, rrCache, profileHash);
postprocessingStartTime[0] = 0;
try {postprocessingCount[0] = (int) fulltext.getDefaultConnector().getCountByQuery(CollectionSchema.process_sxt.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM);} catch (IOException e) {} // should be zero but you never know
@ -2331,7 +2327,7 @@ public final class Switchboard extends serverSwitch {
postprocessingRunning = true;
postprocessingStartTime[0] = System.currentTimeMillis();
try {postprocessingCount[0] = (int) fulltext.getDefaultConnector().getCountByQuery(CollectionSchema.process_sxt.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM);} catch (IOException e) {}
proccount += collection1Configuration.postprocessing(index, rrCache, clickdepthCache, null);
proccount += collection1Configuration.postprocessing(index, rrCache, null);
postprocessingStartTime[0] = 0;
try {postprocessingCount[0] = (int) fulltext.getDefaultConnector().getCountByQuery(CollectionSchema.process_sxt.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM);} catch (IOException e) {} // should be zero but you never know

View File

@ -30,8 +30,6 @@ import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
@ -82,7 +80,6 @@ import net.yacy.repository.LoaderDispatcher;
import net.yacy.search.query.SearchEvent;
import net.yacy.search.schema.CollectionConfiguration;
import net.yacy.search.schema.CollectionSchema;
import net.yacy.search.schema.HyperlinkGraph;
import net.yacy.search.schema.WebgraphConfiguration;
import net.yacy.search.schema.WebgraphSchema;
@ -205,76 +202,6 @@ public class Segment {
return this.urlCitationIndex;
}
/**
* compute the click level using the citation reference database
* @param citations the citation database
* @param searchhash the hash of the url to be checked
* @return the clickdepth level or 999 if the root url cannot be found or a recursion limit is reached
* @throws IOException
*/
private int getClickDepth(final ReferenceReportCache rrc, final DigestURL url, final int maxtime, final int maxdepth) throws IOException {
final byte[] searchhash = url.hash();
RowHandleSet rootCandidates = getPossibleRootHashes(url);
if (rootCandidates.has(searchhash)) return 0; // the url is a root candidate itself
Set<String> ignore = new HashSet<String>(); // a set of urlhashes to be ignored. This is generated from all hashes that are seen during recursion to prevent endless loops
Set<String> levelhashes = new HashSet<String>(); // all hashes of a clickdepth. The first call contains the target hash only and therefore just one entry
levelhashes.add(ASCII.String(searchhash));
final byte[] hosthash = new byte[6]; // the host of the url to be checked
System.arraycopy(searchhash, 6, hosthash, 0, 6);
long timeout = System.currentTimeMillis() + maxtime;
mainloop: for (int leveldepth = 0; leveldepth < maxdepth && System.currentTimeMillis() < timeout; leveldepth++) {
Set<String> checknext = new HashSet<String>();
// loop over all hashes at this clickdepth; the first call to this loop should contain only one hash and a leveldepth = 0
checkloop: for (String urlhashs: levelhashes) {
// get all the citations for this url and iterate
ReferenceReport rr = rrc.getReferenceReport(urlhashs, false);
//ReferenceContainer<CitationReference> references = this.urlCitationIndex.get(urlhash, null);
if (rr == null || rr.getInternalCount() == 0) continue checkloop; // don't know
Iterator<byte[]> i = rr.getInternallIDs().iterator();
nextloop: while (i.hasNext()) {
byte[] u = i.next();
if (u == null) continue nextloop;
// check if this is from the same host
assert (ByteBuffer.equals(u, 6, hosthash, 0, 6));
String us = ASCII.String(u);
// check ignore
if (ignore.contains(us)) continue nextloop;
// check if the url is a root url
if (rootCandidates.has(u)) {
return leveldepth + 1;
}
checknext.add(us);
ignore.add(us);
}
if (System.currentTimeMillis() > timeout) break mainloop;
}
levelhashes = checknext;
}
return 999;
}
private static RowHandleSet getPossibleRootHashes(final DigestURL url) {
RowHandleSet rootCandidates = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 10);
String rootStub = url.getProtocol() + "://" + url.getHost() + (url.getProtocol().equals("http") && url.getPort() != 80 ? (":" + url.getPort()) : "");
try {
rootCandidates.put(new DigestURL(rootStub).hash());
for (String rootfn: HyperlinkGraph.ROOTFNS) rootCandidates.put(new DigestURL(rootStub + rootfn).hash());
rootCandidates.optimize();
} catch (final Throwable e) {}
rootCandidates.optimize();
return rootCandidates;
}
public ReferenceReportCache getReferenceReportCache() {
return new ReferenceReportCache();
}
@ -299,54 +226,6 @@ public class Segment {
}
}
public ClickdepthCache getClickdepthCache(ReferenceReportCache rrc, final int maxtime, final int maxdepth) {
return new ClickdepthCache(rrc, maxtime, maxdepth);
}
public class ClickdepthCache {
private final ReferenceReportCache rrc;
private final Map<String, HyperlinkGraph> hyperlinkGraphCache; // map from host name to a HyperlinkGraph for that host name
private final Map<String, Integer> cache;
public final int maxdepth; // maximum clickdepth
public final int maxtime; // maximum time to compute clickdepth
public ClickdepthCache(final ReferenceReportCache rrc, final int maxtime, final int maxdepth) {
this.rrc = rrc;
this.hyperlinkGraphCache = new HashMap<String, HyperlinkGraph>();
this.cache = new ConcurrentHashMap<String, Integer>();
this.maxdepth = maxdepth;
this.maxtime = maxtime;
}
public int getClickdepth(final DigestURL url) throws IOException {
// first try: get the clickdepth from the cache
Integer clickdepth = cache.get(ASCII.String(url.hash()));
if (MemoryControl.shortStatus()) cache.clear();
if (clickdepth != null) {
//ConcurrentLog.info("Segment", "get clickdepth of url " + url.toNormalform(true) + ": " + clickdepth + " CACHE HIT");
return clickdepth.intValue();
}
// second try: get the clickdepth from a hyperlinGraphCache (forward clickdepth)
HyperlinkGraph hlg = hyperlinkGraphCache.get(url.getHost());
if (hlg == null) {
hlg = new HyperlinkGraph();
hlg.fill(fulltext.getDefaultConnector(), url.getHost(), null, 300000, 10000000);
hlg.findLinkDepth();
hyperlinkGraphCache.put(url.getHost(), hlg);
}
clickdepth = hlg.getDepth(url);
if (clickdepth != null) {
return clickdepth.intValue();
}
// third try: get the clickdepth from a reverse link graph
clickdepth = Segment.this.getClickDepth(this.rrc, url, this.maxtime, this.maxdepth);
//ConcurrentLog.info("Segment", "get clickdepth of url " + url.toNormalform(true) + ": " + clickdepth);
this.cache.put(ASCII.String(url.hash()), clickdepth);
return clickdepth.intValue();
}
}
/**
* A ReferenceReport object is a container for all referenced to a specific url.
* The class stores the number of links from domain-internal and domain-external backlinks,
@ -654,7 +533,7 @@ public class Segment {
char docType = Response.docType(document.dc_format());
// CREATE SOLR DOCUMENT
final CollectionConfiguration.SolrVector vector = this.fulltext.getDefaultConfiguration().yacy2solr(collections, responseHeader, document, condenser, referrerURL, language, this.fulltext.getWebgraphConfiguration(), sourceName);
final CollectionConfiguration.SolrVector vector = this.fulltext.getDefaultConfiguration().yacy2solr(collections, responseHeader, document, condenser, referrerURL, language, this.fulltext().useWebgraph() ? this.fulltext.getWebgraphConfiguration() : null, sourceName);
// ENRICH DOCUMENT WITH RANKING INFORMATION
this.fulltext.getDefaultConfiguration().postprocessing_references(this.getReferenceReportCache(), vector, url, null);

View File

@ -82,7 +82,6 @@ import net.yacy.kelondro.rwi.ReferenceContainer;
import net.yacy.kelondro.util.Bitfield;
import net.yacy.kelondro.util.MemoryControl;
import net.yacy.search.index.Segment;
import net.yacy.search.index.Segment.ClickdepthCache;
import net.yacy.search.index.Segment.ReferenceReport;
import net.yacy.search.index.Segment.ReferenceReportCache;
import net.yacy.search.query.QueryParams;
@ -368,21 +367,9 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
String us = digestURL.toNormalform(true);
int clickdepth = 999;
if ((allAttr || contains(CollectionSchema.clickdepth_i))) {
if (digestURL.probablyRootURL()) {
clickdepth = 0;
} else {
clickdepth = 999;
}
if (document.getDepth() < 2) clickdepth = Math.min(clickdepth, document.getDepth()); // thats not true if the start url was not a root URL. We need a test for that.
if (clickdepth > 2) processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut
CollectionSchema.clickdepth_i.add(doc, clickdepth); // no lazy value checking to get a '0' into the index
}
int crawldepth = document.getDepth();
if ((allAttr || contains(CollectionSchema.crawldepth_i))) {
int depth = document.getDepth();
CollectionSchema.crawldepth_i.add(doc, depth);
CollectionSchema.crawldepth_i.add(doc, crawldepth);
}
if (allAttr || (contains(CollectionSchema.cr_host_chance_d) && contains(CollectionSchema.cr_host_count_i) && contains(CollectionSchema.cr_host_norm_i))) {
@ -670,7 +657,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
add(doc, CollectionSchema.framesscount_i, frames.length);
if (frames.length > 0) {
add(doc, CollectionSchema.frames_sxt, frames);
//webgraph.addEdges(subgraph, digestURI, responseHeader, collections, clickdepth, alllinks, images, true, framess, citations); // add here because links have been removed from remaining inbound/outbound
//webgraph.addEdges(subgraph, digestURI, responseHeader, collections, crawldepth, alllinks, images, true, framess, citations); // add here because links have been removed from remaining inbound/outbound
}
}
@ -687,7 +674,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
add(doc, CollectionSchema.iframesscount_i, iframes.length);
if (iframes.length > 0) {
add(doc, CollectionSchema.iframes_sxt, iframes);
//webgraph.addEdges(subgraph, digestURI, responseHeader, collections, clickdepth, alllinks, images, true, iframess, citations); // add here because links have been removed from remaining inbound/outbound
//webgraph.addEdges(subgraph, digestURI, responseHeader, collections, crawldepth, alllinks, images, true, iframess, citations); // add here because links have been removed from remaining inbound/outbound
}
}
@ -856,9 +843,9 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
if (allAttr || contains(CollectionSchema.outboundlinksnofollowcount_i)) add(doc, CollectionSchema.outboundlinksnofollowcount_i, document.outboundLinkNofollowCount());
// create a subgraph
if (!containsCanonical) {
if (!containsCanonical && webgraph != null) {
// a document with canonical tag should not get a webgraph relation, because that belongs to the canonical document
webgraph.addEdges(subgraph, digestURL, responseHeader, collections, clickdepth, images, document.getAnchors(), sourceName);
webgraph.addEdges(subgraph, digestURL, responseHeader, collections, crawldepth, images, document.getAnchors(), sourceName);
}
// list all links
@ -923,7 +910,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
* @param urlCitation
* @return
*/
public int postprocessing(final Segment segment, final ReferenceReportCache rrCache, final ClickdepthCache clickdepthCache, final String harvestkey) {
public int postprocessing(final Segment segment, final ReferenceReportCache rrCache, final String harvestkey) {
if (!this.contains(CollectionSchema.process_sxt)) return 0;
if (!segment.connectedCitation() && !segment.fulltext().useWebgraph()) return 0;
final SolrConnector collectionConnector = segment.fulltext().getDefaultConnector();
@ -1054,7 +1041,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
@Override
public void run() {
Thread.currentThread().setName(name);
SolrDocument doc; String protocol, urlstub, id; DigestURL url;
SolrDocument doc; String id;
try {
while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
SolrInputDocument sid = webgraph.toSolrInputDocument(doc, omitFields);
@ -1081,30 +1068,6 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
}
}
// set clickdepth
if (process.contains(ProcessType.CLICKDEPTH)) {
if (webgraph.contains(WebgraphSchema.source_clickdepth_i) && webgraph.contains(WebgraphSchema.source_protocol_s) && webgraph.contains(WebgraphSchema.source_urlstub_s) && webgraph.contains(WebgraphSchema.source_id_s)) {
protocol = (String) doc.getFieldValue(WebgraphSchema.source_protocol_s.getSolrFieldName());
urlstub = (String) doc.getFieldValue(WebgraphSchema.source_urlstub_s.getSolrFieldName());
id = (String) doc.getFieldValue(WebgraphSchema.source_id_s.getSolrFieldName());
try {
url = new DigestURL(protocol + "://" + urlstub, ASCII.getBytes(id));
postprocessing_clickdepth(clickdepthCache, sid, url, WebgraphSchema.source_clickdepth_i);
} catch (MalformedURLException e) {
}
}
if (webgraph.contains(WebgraphSchema.target_clickdepth_i) && webgraph.contains(WebgraphSchema.target_protocol_s) && webgraph.contains(WebgraphSchema.target_urlstub_s) && webgraph.contains(WebgraphSchema.target_id_s)) {
protocol = (String) doc.getFieldValue(WebgraphSchema.target_protocol_s.getSolrFieldName());
urlstub = (String) doc.getFieldValue(WebgraphSchema.target_urlstub_s.getSolrFieldName());
id = (String) doc.getFieldValue(WebgraphSchema.target_id_s.getSolrFieldName());
try {
url = new DigestURL(protocol + "://" + urlstub, ASCII.getBytes(id));
postprocessing_clickdepth(clickdepthCache, sid, url, WebgraphSchema.target_clickdepth_i);
} catch (MalformedURLException e) {
}
}
}
// write document back to index
try {
sid.removeField(WebgraphSchema.process_sxt.getSolrFieldName());
@ -1148,7 +1111,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
Set<String> omitFields = new HashSet<String>();
omitFields.add(CollectionSchema.process_sxt.getSolrFieldName());
omitFields.add(CollectionSchema.harvestkey_s.getSolrFieldName());
int proccount = 0, proccount_clickdepthchange = 0, proccount_referencechange = 0, proccount_citationchange = 0, proccount_uniquechange = 0;
int proccount = 0, proccount_referencechange = 0, proccount_citationchange = 0, proccount_uniquechange = 0;
long count = collectionConnector.getCountByQuery(query);
long start = System.currentTimeMillis();
ConcurrentLog.info("CollectionConfiguration", "collecting " + count + " documents from the collection for harvestkey " + harvestkey);
@ -1170,9 +1133,6 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
// switch over tag types
ProcessType tagtype = ProcessType.valueOf((String) tag);
if (tagtype == ProcessType.CLICKDEPTH && collection.contains(CollectionSchema.clickdepth_i)) {
if (postprocessing_clickdepth(clickdepthCache, sid, url, CollectionSchema.clickdepth_i)) proccount_clickdepthchange++;
}
if (tagtype == ProcessType.CITATION &&
collection.contains(CollectionSchema.cr_host_count_i) &&
@ -1228,7 +1188,6 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
}
if (count != countcheck) ConcurrentLog.warn("CollectionConfiguration", "ambiguous collection document count for harvestkey " + harvestkey + ": expected=" + count + ", counted=" + countcheck); // big gap for harvestkey = null
ConcurrentLog.info("CollectionConfiguration", "cleanup_processing: re-calculated " + proccount+ " new documents, " +
proccount_clickdepthchange + " clickdepth changes, " +
proccount_referencechange + " reference-count changes, " +
proccount_uniquechange + " unique field changes, " +
proccount_citationchange + " citation ranking changes.");
@ -1534,12 +1493,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
configuration.add(doc, CollectionSchema.collection_sxt, cs);
}
// clickdepth, cr and postprocessing
// cr and postprocessing
Set<ProcessType> processTypes = new LinkedHashSet<ProcessType>();
if ((allAttr || configuration.contains(CollectionSchema.clickdepth_i))) {
processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut
CollectionSchema.clickdepth_i.add(doc, digestURL.probablyRootURL() ? 0 : 999); // no lazy value checking to get a '0' into the index
}
if (allAttr || (configuration.contains(CollectionSchema.cr_host_chance_d) && configuration.contains(CollectionSchema.cr_host_count_i) && configuration.contains(CollectionSchema.cr_host_norm_i))) {
processTypes.add(ProcessType.CITATION); // postprocessing needed
}

View File

@ -57,8 +57,7 @@ public enum CollectionSchema implements SchemaDeclaration {
references_internal_i(SolrType.num_integer, true, true, false, false, false, "number of unique http references from same host to referenced url"),
references_external_i(SolrType.num_integer, true, true, false, false, false, "number of unique http references from external hosts"),
references_exthosts_i(SolrType.num_integer, true, true, false, false, false, "number of external hosts which provide http references"),
clickdepth_i(SolrType.num_integer, true, true, false, false, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url"),
crawldepth_i(SolrType.num_integer, true, true, false, false, false, "crawl depth of web page according to the number of steps that the crawler did to get to this document; if the crawl was started at a root document, then this is the maximum of clickdepth_i"),
crawldepth_i(SolrType.num_integer, true, true, false, false, false, "crawl depth of web page according to the number of steps that the crawler did to get to this document; if the crawl was started at a root document, then this is equal to the clickdepth"),
process_sxt(SolrType.string, true, true, true, false, false, "needed (post-)processing steps on this metadata set"),
harvestkey_s(SolrType.string, true, true, false, false, false, "key from a harvest process (i.e. the crawl profile hash key) which is needed for near-realtime postprocessing. This shall be deleted as soon as postprocessing has been terminated."),

View File

@ -112,7 +112,7 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
public void addEdges(
final Subgraph subgraph,
final DigestURL source, final ResponseHeader responseHeader, Map<String, Pattern> collections, int clickdepth_source,
final DigestURL source, final ResponseHeader responseHeader, Map<String, Pattern> collections, int crawldepth_source,
final List<ImageEntry> images, final Collection<AnchorURL> links,
final String sourceName) {
boolean allAttr = this.isEmpty();
@ -120,7 +120,7 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
int target_order = 0;
for (final AnchorURL target_url: links) {
SolrInputDocument edge = getEdge(
subgraph, source, responseHeader, collections, clickdepth_source, images,
subgraph, source, responseHeader, collections, crawldepth_source, images,
sourceName, allAttr, generalNofollow, target_order, target_url);
target_order++;
// add the edge to the subgraph
@ -130,7 +130,7 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
public SolrInputDocument getEdge(
final Subgraph subgraph,
final DigestURL source_url, final ResponseHeader responseHeader, Map<String, Pattern> collections, int clickdepth_source,
final DigestURL source_url, final ResponseHeader responseHeader, Map<String, Pattern> collections, int crawldepth_source,
final List<ImageEntry> images,
final String sourceName, boolean allAttr, boolean generalNofollow, int target_order, AnchorURL target_url) {
@ -217,9 +217,8 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
add(edge, WebgraphSchema.source_path_folders_count_i, paths.length);
add(edge, WebgraphSchema.source_path_folders_sxt, paths);
}
if ((allAttr || contains(WebgraphSchema.source_clickdepth_i)) && this.contains(WebgraphSchema.source_protocol_s) && this.contains(WebgraphSchema.source_urlstub_s) && this.contains(WebgraphSchema.source_id_s)) {
add(edge, WebgraphSchema.source_clickdepth_i, clickdepth_source);
processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut
if ((allAttr || contains(WebgraphSchema.source_crawldepth_i)) && this.contains(WebgraphSchema.source_protocol_s) && this.contains(WebgraphSchema.source_urlstub_s) && this.contains(WebgraphSchema.source_id_s)) {
add(edge, WebgraphSchema.source_crawldepth_i, crawldepth_source);
}
// parse text to find images and clear text
@ -289,15 +288,8 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
add(edge, WebgraphSchema.target_path_folders_sxt, paths);
}
if ((allAttr || contains(WebgraphSchema.target_clickdepth_i)) && this.contains(WebgraphSchema.target_protocol_s) && this.contains(WebgraphSchema.target_urlstub_s) && this.contains(WebgraphSchema.target_id_s)) {
if (target_url.probablyRootURL()) {
boolean lc = this.lazy; this.lazy = false;
add(edge, WebgraphSchema.target_clickdepth_i, 0);
this.lazy = lc;
} else {
add(edge, WebgraphSchema.target_clickdepth_i, 999);
processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut
}
if ((allAttr || contains(WebgraphSchema.target_crawldepth_i)) && this.contains(WebgraphSchema.target_protocol_s) && this.contains(WebgraphSchema.target_urlstub_s) && this.contains(WebgraphSchema.target_id_s)) {
add(edge, WebgraphSchema.target_crawldepth_i, 999);
}
if (allAttr || contains(WebgraphSchema.process_sxt)) {

View File

@ -35,7 +35,7 @@ public enum WebgraphSchema implements SchemaDeclaration {
last_modified(SolrType.date, true, true, false, false, false, "last-modified from http header"),
load_date_dt(SolrType.date, true, true, false, false, false, "time when resource was loaded"),
collection_sxt(SolrType.string, true, true, true, false, false, "tags that are attached to crawls/index generation to separate the search result into user-defined subsets"),
process_sxt(SolrType.string, true, true, true, false, false, "needed (post-)processing steps on this metadata set, used i.e. for clickdepth-computation."),
process_sxt(SolrType.string, true, true, true, false, false, "needed (post-)processing steps on this metadata set."),
harvestkey_s(SolrType.string, true, true, false, false, false, "key from a harvest process (i.e. the crawl profile hash key) which is needed for near-realtime postprocessing. This shall be deleted as soon as postprocessing has been terminated."),
// source information
@ -51,7 +51,7 @@ public enum WebgraphSchema implements SchemaDeclaration {
source_parameter_count_i(SolrType.num_integer, true, true, false, false, false, "number of key-value pairs in search part of the url (source)"),
source_parameter_key_sxt(SolrType.string, true, true, true, false, false, "the keys from key-value pairs in the search part of the url (source)"),
source_parameter_value_sxt(SolrType.string, true, true, true, false, false, "the values from key-value pairs in the search part of the url (source)"),
source_clickdepth_i(SolrType.num_integer, true, true, false, false, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (source)"),
source_crawldepth_i(SolrType.num_integer, true, true, false, false, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (source)"),
source_cr_host_norm_i(SolrType.num_integer, true, true, false, false, false, "copy of the citation rank norm value from the source link"),
source_host_s(SolrType.string, true, true, false, false, false, "host of the url (source)"),
@ -86,7 +86,7 @@ public enum WebgraphSchema implements SchemaDeclaration {
target_parameter_count_i(SolrType.num_integer, true, true, false, false, false, "number of key-value pairs in search part of the url (target)"),
target_parameter_key_sxt(SolrType.string, true, true, true, false, false, "the keys from key-value pairs in the search part of the url (target)"),
target_parameter_value_sxt(SolrType.string, true, true, true, false, true, "the values from key-value pairs in the search part of the url (target)"),
target_clickdepth_i(SolrType.num_integer, true, true, false, false, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (target)"),
target_crawldepth_i(SolrType.num_integer, true, true, false, false, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (target)"),
target_cr_host_norm_i(SolrType.num_integer, true, true, false, false, false, "copy of the citation rank norm value from the target link; this is only filled if the target host is identical to the source host"),
target_host_s(SolrType.string, true, true, false, false, true, "host of the url (target)"),