mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
Removed the clickdepth_i field and the related postprocessing. This information
is now available in the crawldepth_i field, which is identical to clickdepth_i because of a specific crawler strategy.
This commit is contained in:
parent
da86f150ab
commit
9a5ab4e2c1
|
@ -84,10 +84,7 @@ references_external_i
|
|||
## number of external hosts which provide http references
|
||||
references_exthosts_i
|
||||
|
||||
## depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url
|
||||
clickdepth_i
|
||||
|
||||
## crawl depth of web page according to the number of steps that the crawler did to get to this document; if the crawl was started at a root document, then this is the maximum of clickdepth_i
|
||||
## crawl depth of web page according to the number of steps that the crawler did to get to this document; if the crawl was started at a root document, then this is equal to the clickdepth
|
||||
crawldepth_i
|
||||
|
||||
## needed (post-)processing steps on this metadata set
|
||||
|
|
|
@ -24,7 +24,7 @@ last_modified
|
|||
## tags that are attached to crawls/index generation to separate the search result into user-defined subsets
|
||||
collection_sxt
|
||||
|
||||
## needed (post-)processing steps on this metadata set, used i.e. for clickdepth-computation.
|
||||
## needed (post-)processing steps on this metadata set
|
||||
#process_sxt
|
||||
|
||||
## key from a harvest process (i.e. the crawl profile hash key) which is needed for near-realtime postprocessing. This shall be deleted as soon as postprocessing has been terminated.
|
||||
|
@ -72,7 +72,7 @@ source_id_s
|
|||
#source_parameter_value_sxt
|
||||
|
||||
## depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (source)
|
||||
#source_clickdepth_i
|
||||
#source_crawldepth_i
|
||||
|
||||
## copy of the citation rank norm value from the source link
|
||||
#source_cr_host_norm_i
|
||||
|
@ -173,7 +173,7 @@ target_path_folders_sxt
|
|||
#target_parameter_value_sxt
|
||||
|
||||
## depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (target)
|
||||
#target_clickdepth_i
|
||||
#target_crawldepth_i
|
||||
|
||||
## copy of the citation rank norm value from the target link; this is only filled if the target host is identical to the source host
|
||||
#target_cr_host_norm_i
|
||||
|
|
|
@ -1218,7 +1218,3 @@ browser.load4everyone = false
|
|||
# with some heuristics like: loading linked documents and adding a twitter search.
|
||||
# When the learning mode is finished, the user may switch on individual heuristics by himself.
|
||||
greedylearning.active = true
|
||||
|
||||
# postprocessing parametrization
|
||||
postprocessing.clickdepth.maxtime = 100
|
||||
postprocessing.clickdepth.maxdepth = 6
|
|
@ -297,7 +297,6 @@ public class HostBrowser {
|
|||
CollectionSchema.inboundlinks_urlstub_sxt.getSolrFieldName(),
|
||||
CollectionSchema.outboundlinks_protocol_sxt.getSolrFieldName(),
|
||||
CollectionSchema.outboundlinks_urlstub_sxt.getSolrFieldName(),
|
||||
CollectionSchema.clickdepth_i.getSolrFieldName(),
|
||||
CollectionSchema.crawldepth_i.getSolrFieldName(),
|
||||
CollectionSchema.references_i.getSolrFieldName(),
|
||||
CollectionSchema.references_internal_i.getSolrFieldName(),
|
||||
|
@ -564,18 +563,16 @@ public class HostBrowser {
|
|||
public static final class InfoCacheEntry {
|
||||
public Integer cr_n;
|
||||
public Double cr_c;
|
||||
public int clickdepth, crawldepth, references, references_internal, references_external, references_exthosts;
|
||||
public int crawldepth, references, references_internal, references_external, references_exthosts;
|
||||
public List<String> references_internal_urls, references_external_urls;
|
||||
public InfoCacheEntry(final Fulltext fulltext, final ReferenceReportCache rrCache, final SolrDocument doc, final String urlhash, boolean fetchReferences) {
|
||||
this.cr_c = (Double) doc.getFieldValue(CollectionSchema.cr_host_chance_d.getSolrFieldName());
|
||||
this.cr_n = (Integer) doc.getFieldValue(CollectionSchema.cr_host_norm_i.getSolrFieldName());
|
||||
Integer cd = (Integer) doc.getFieldValue(CollectionSchema.clickdepth_i.getSolrFieldName());
|
||||
Integer cr = (Integer) doc.getFieldValue(CollectionSchema.crawldepth_i.getSolrFieldName());
|
||||
Integer rc = (Integer) doc.getFieldValue(CollectionSchema.references_i.getSolrFieldName());
|
||||
Integer rc_internal = (Integer) doc.getFieldValue(CollectionSchema.references_internal_i.getSolrFieldName());
|
||||
Integer rc_external = (Integer) doc.getFieldValue(CollectionSchema.references_external_i.getSolrFieldName());
|
||||
Integer rc_exthosts = (Integer) doc.getFieldValue(CollectionSchema.references_exthosts_i.getSolrFieldName());
|
||||
this.clickdepth = (cd == null || cd.intValue() < 0) ? 999 : cd.intValue();
|
||||
this.crawldepth = (cr == null || cr.intValue() < 0) ? 999 : cr.intValue();
|
||||
this.references = (rc == null || rc.intValue() <= 0) ? 0 : rc.intValue();
|
||||
this.references_internal = (rc_internal == null || rc_internal.intValue() <= 0) ? 0 : rc_internal.intValue();
|
||||
|
@ -628,7 +625,6 @@ public class HostBrowser {
|
|||
}
|
||||
if (sbe.length() > 0) sbe.insert(0, "<br/>external referrer:</br>");
|
||||
return
|
||||
(this.clickdepth >= 0 ? "clickdepth: " + this.clickdepth : "") +
|
||||
(this.crawldepth >= 0 ? ", crawldepth: " + this.crawldepth : "") +
|
||||
(this.cr_c != null ? ", cr=" + (Math.round(this.cr_c * 1000.0d) / 1000.0d) : "") +
|
||||
(this.cr_n != null ? ", crn=" + this.cr_n : "") +
|
||||
|
|
|
@ -81,7 +81,7 @@ public class RankingSolr_p {
|
|||
}
|
||||
}
|
||||
if (post != null && post.containsKey("ResetBQ")) {
|
||||
String bq = "clickdepth_i:0^0.8 clickdepth_i:1^0.4";
|
||||
String bq = "crawldepth_i:0^0.8 crawldepth_i:1^0.4";
|
||||
if (bq != null) {
|
||||
sb.setConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_COLLECTION_BOOSTQUERY_ + profileNr, bq);
|
||||
sb.index.fulltext().getDefaultConfiguration().getRanking(profileNr).setBoostQuery(bq);
|
||||
|
|
|
@ -291,10 +291,6 @@ public class DigestURL extends MultiProtocolURL implements Serializable {
|
|||
|
||||
public final static Pattern rootPattern = Pattern.compile("/|/\\?|/index.htm(l?)|/index.php|/home.htm(l?)|/home.php|/default.htm(l?)|/default.php");
|
||||
|
||||
public final boolean probablyRootURL() {
|
||||
return this.path.length() <= 1 || rootPattern.matcher(this.path).matches();
|
||||
}
|
||||
|
||||
private static final String hosthash5(final String protocol, final String host, final int port) {
|
||||
if (host == null) {
|
||||
return Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(protocol)).substring(0, 5);
|
||||
|
|
|
@ -26,6 +26,6 @@ package net.yacy.cora.federate.solr;
|
|||
*/
|
||||
public enum ProcessType {
|
||||
|
||||
CLICKDEPTH, CITATION, UNIQUE;
|
||||
CITATION, UNIQUE;
|
||||
|
||||
}
|
||||
|
|
|
@ -41,7 +41,6 @@ import net.yacy.cora.storage.Configuration;
|
|||
import net.yacy.cora.storage.HandleSet;
|
||||
import net.yacy.cora.util.ConcurrentLog;
|
||||
import net.yacy.search.index.Segment;
|
||||
import net.yacy.search.index.Segment.ClickdepthCache;
|
||||
import net.yacy.search.index.Segment.ReferenceReport;
|
||||
import net.yacy.search.index.Segment.ReferenceReportCache;
|
||||
import net.yacy.search.schema.CollectionSchema;
|
||||
|
@ -178,21 +177,6 @@ public class SchemaConfiguration extends Configuration implements Serializable {
|
|||
return changed;
|
||||
}
|
||||
|
||||
public boolean postprocessing_clickdepth(final ClickdepthCache clickdepthCache, final SolrInputDocument sid, final DigestURL url, final SchemaDeclaration clickdepthfield) {
|
||||
// get new click depth and compare with old
|
||||
Integer oldclickdepth = (Integer) sid.getFieldValue(clickdepthfield.getSolrFieldName());
|
||||
if (oldclickdepth != null && oldclickdepth.intValue() != 999) return false; // we do not want to compute that again
|
||||
try {
|
||||
int clickdepth = clickdepthCache.getClickdepth(url);
|
||||
if (oldclickdepth == null || oldclickdepth.intValue() != clickdepth) {
|
||||
sid.setField(clickdepthfield.getSolrFieldName(), clickdepth);
|
||||
return true;
|
||||
}
|
||||
} catch (final IOException e) {
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public boolean postprocessing_references(final ReferenceReportCache rrCache, final SolrInputDocument sid, final DigestURL url, final Map<String, Long> hostExtentCount) {
|
||||
if (!(this.contains(CollectionSchema.references_i) ||
|
||||
this.contains(CollectionSchema.references_internal_i) ||
|
||||
|
|
|
@ -190,7 +190,6 @@ import net.yacy.repository.FilterEngine;
|
|||
import net.yacy.repository.LoaderDispatcher;
|
||||
import net.yacy.search.index.Fulltext;
|
||||
import net.yacy.search.index.Segment;
|
||||
import net.yacy.search.index.Segment.ClickdepthCache;
|
||||
import net.yacy.search.index.Segment.ReferenceReportCache;
|
||||
import net.yacy.search.query.AccessTracker;
|
||||
import net.yacy.search.query.SearchEvent;
|
||||
|
@ -484,9 +483,9 @@ public final class Switchboard extends serverSwitch {
|
|||
String bq = this.getConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_COLLECTION_BOOSTQUERY_ + i, "");
|
||||
String bf = this.getConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_COLLECTION_BOOSTFUNCTION_ + i, "");
|
||||
// apply some hard-coded patches for earlier experiments we do not want any more
|
||||
if (bf.equals("product(recip(rord(last_modified),1,1000,1000),div(product(log(product(references_external_i,references_exthosts_i)),div(references_internal_i,host_extent_i)),add(clickdepth_i,1)))") ||
|
||||
if (bf.equals("product(recip(rord(last_modified),1,1000,1000),div(product(log(product(references_external_i,references_exthosts_i)),div(references_internal_i,host_extent_i)),add(crawldepth_i,1)))") ||
|
||||
bf.equals("scale(cr_host_norm_i,1,20)")) bf = "";
|
||||
if (i == 0 && bq.equals("fuzzy_signature_unique_b:true^100000.0")) bq = "clickdepth_i:0^0.8 clickdepth_i:1^0.4";
|
||||
if (i == 0 && bq.equals("fuzzy_signature_unique_b:true^100000.0")) bq = "crawldepth_i:0^0.8 crawldepth_i:1^0.4";
|
||||
if (boosts.equals("url_paths_sxt^1000.0,synonyms_sxt^1.0,title^10000.0,text_t^2.0,h1_txt^1000.0,h2_txt^100.0,host_organization_s^100000.0")) boosts = "url_paths_sxt^3.0,synonyms_sxt^0.5,title^5.0,text_t^1.0,host_s^6.0,h1_txt^5.0,url_file_name_tokens_t^4.0,h2_txt^2.0";
|
||||
r.setName(name);
|
||||
r.updateBoosts(boosts);
|
||||
|
@ -2307,9 +2306,6 @@ public final class Switchboard extends serverSwitch {
|
|||
|
||||
// we optimize first because that is useful for postprocessing
|
||||
ReferenceReportCache rrCache = index.getReferenceReportCache();
|
||||
int clickdepth_maxtime = this.getConfigInt("postprocessing.clickdepth.maxtime", 100);
|
||||
int clickdepth_maxdepth = this.getConfigInt("postprocessing.clickdepth.maxdepth", 6);
|
||||
ClickdepthCache clickdepthCache = index.getClickdepthCache(rrCache, clickdepth_maxtime, clickdepth_maxdepth);
|
||||
Set<String> deletionCandidates = collection1Configuration.contains(CollectionSchema.harvestkey_s.getSolrFieldName()) ?
|
||||
this.crawler.getFinishesProfiles(this.crawlQueues) : new HashSet<String>();
|
||||
int cleanupByHarvestkey = deletionCandidates.size();
|
||||
|
@ -2320,7 +2316,7 @@ public final class Switchboard extends serverSwitch {
|
|||
postprocessingRunning = true;
|
||||
postprocessingStartTime[0] = System.currentTimeMillis();
|
||||
try {postprocessingCount[0] = (int) fulltext.getDefaultConnector().getCountByQuery(CollectionSchema.process_sxt.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM);} catch (IOException e) {}
|
||||
for (String profileHash: deletionCandidates) proccount += collection1Configuration.postprocessing(index, rrCache, clickdepthCache, profileHash);
|
||||
for (String profileHash: deletionCandidates) proccount += collection1Configuration.postprocessing(index, rrCache, profileHash);
|
||||
postprocessingStartTime[0] = 0;
|
||||
try {postprocessingCount[0] = (int) fulltext.getDefaultConnector().getCountByQuery(CollectionSchema.process_sxt.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM);} catch (IOException e) {} // should be zero but you never know
|
||||
|
||||
|
@ -2331,7 +2327,7 @@ public final class Switchboard extends serverSwitch {
|
|||
postprocessingRunning = true;
|
||||
postprocessingStartTime[0] = System.currentTimeMillis();
|
||||
try {postprocessingCount[0] = (int) fulltext.getDefaultConnector().getCountByQuery(CollectionSchema.process_sxt.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM);} catch (IOException e) {}
|
||||
proccount += collection1Configuration.postprocessing(index, rrCache, clickdepthCache, null);
|
||||
proccount += collection1Configuration.postprocessing(index, rrCache, null);
|
||||
postprocessingStartTime[0] = 0;
|
||||
try {postprocessingCount[0] = (int) fulltext.getDefaultConnector().getCountByQuery(CollectionSchema.process_sxt.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM);} catch (IOException e) {} // should be zero but you never know
|
||||
|
||||
|
|
|
@ -30,8 +30,6 @@ import java.io.File;
|
|||
import java.io.IOException;
|
||||
import java.net.MalformedURLException;
|
||||
import java.util.Date;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
@ -82,7 +80,6 @@ import net.yacy.repository.LoaderDispatcher;
|
|||
import net.yacy.search.query.SearchEvent;
|
||||
import net.yacy.search.schema.CollectionConfiguration;
|
||||
import net.yacy.search.schema.CollectionSchema;
|
||||
import net.yacy.search.schema.HyperlinkGraph;
|
||||
import net.yacy.search.schema.WebgraphConfiguration;
|
||||
import net.yacy.search.schema.WebgraphSchema;
|
||||
|
||||
|
@ -205,76 +202,6 @@ public class Segment {
|
|||
return this.urlCitationIndex;
|
||||
}
|
||||
|
||||
/**
|
||||
* compute the click level using the citation reference database
|
||||
* @param citations the citation database
|
||||
* @param searchhash the hash of the url to be checked
|
||||
* @return the clickdepth level or 999 if the root url cannot be found or a recursion limit is reached
|
||||
* @throws IOException
|
||||
*/
|
||||
private int getClickDepth(final ReferenceReportCache rrc, final DigestURL url, final int maxtime, final int maxdepth) throws IOException {
|
||||
|
||||
final byte[] searchhash = url.hash();
|
||||
RowHandleSet rootCandidates = getPossibleRootHashes(url);
|
||||
if (rootCandidates.has(searchhash)) return 0; // the url is a root candidate itself
|
||||
|
||||
Set<String> ignore = new HashSet<String>(); // a set of urlhashes to be ignored. This is generated from all hashes that are seen during recursion to prevent endless loops
|
||||
Set<String> levelhashes = new HashSet<String>(); // all hashes of a clickdepth. The first call contains the target hash only and therefore just one entry
|
||||
levelhashes.add(ASCII.String(searchhash));
|
||||
final byte[] hosthash = new byte[6]; // the host of the url to be checked
|
||||
System.arraycopy(searchhash, 6, hosthash, 0, 6);
|
||||
|
||||
long timeout = System.currentTimeMillis() + maxtime;
|
||||
mainloop: for (int leveldepth = 0; leveldepth < maxdepth && System.currentTimeMillis() < timeout; leveldepth++) {
|
||||
|
||||
Set<String> checknext = new HashSet<String>();
|
||||
|
||||
// loop over all hashes at this clickdepth; the first call to this loop should contain only one hash and a leveldepth = 0
|
||||
checkloop: for (String urlhashs: levelhashes) {
|
||||
|
||||
// get all the citations for this url and iterate
|
||||
ReferenceReport rr = rrc.getReferenceReport(urlhashs, false);
|
||||
//ReferenceContainer<CitationReference> references = this.urlCitationIndex.get(urlhash, null);
|
||||
if (rr == null || rr.getInternalCount() == 0) continue checkloop; // don't know
|
||||
Iterator<byte[]> i = rr.getInternallIDs().iterator();
|
||||
nextloop: while (i.hasNext()) {
|
||||
byte[] u = i.next();
|
||||
if (u == null) continue nextloop;
|
||||
|
||||
// check if this is from the same host
|
||||
assert (ByteBuffer.equals(u, 6, hosthash, 0, 6));
|
||||
String us = ASCII.String(u);
|
||||
// check ignore
|
||||
if (ignore.contains(us)) continue nextloop;
|
||||
|
||||
// check if the url is a root url
|
||||
if (rootCandidates.has(u)) {
|
||||
return leveldepth + 1;
|
||||
}
|
||||
|
||||
checknext.add(us);
|
||||
ignore.add(us);
|
||||
}
|
||||
if (System.currentTimeMillis() > timeout) break mainloop;
|
||||
}
|
||||
levelhashes = checknext;
|
||||
}
|
||||
return 999;
|
||||
}
|
||||
|
||||
|
||||
private static RowHandleSet getPossibleRootHashes(final DigestURL url) {
|
||||
RowHandleSet rootCandidates = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 10);
|
||||
String rootStub = url.getProtocol() + "://" + url.getHost() + (url.getProtocol().equals("http") && url.getPort() != 80 ? (":" + url.getPort()) : "");
|
||||
try {
|
||||
rootCandidates.put(new DigestURL(rootStub).hash());
|
||||
for (String rootfn: HyperlinkGraph.ROOTFNS) rootCandidates.put(new DigestURL(rootStub + rootfn).hash());
|
||||
rootCandidates.optimize();
|
||||
} catch (final Throwable e) {}
|
||||
rootCandidates.optimize();
|
||||
return rootCandidates;
|
||||
}
|
||||
|
||||
public ReferenceReportCache getReferenceReportCache() {
|
||||
return new ReferenceReportCache();
|
||||
}
|
||||
|
@ -299,54 +226,6 @@ public class Segment {
|
|||
}
|
||||
}
|
||||
|
||||
public ClickdepthCache getClickdepthCache(ReferenceReportCache rrc, final int maxtime, final int maxdepth) {
|
||||
return new ClickdepthCache(rrc, maxtime, maxdepth);
|
||||
}
|
||||
|
||||
public class ClickdepthCache {
|
||||
private final ReferenceReportCache rrc;
|
||||
private final Map<String, HyperlinkGraph> hyperlinkGraphCache; // map from host name to a HyperlinkGraph for that host name
|
||||
private final Map<String, Integer> cache;
|
||||
public final int maxdepth; // maximum clickdepth
|
||||
public final int maxtime; // maximum time to compute clickdepth
|
||||
public ClickdepthCache(final ReferenceReportCache rrc, final int maxtime, final int maxdepth) {
|
||||
this.rrc = rrc;
|
||||
this.hyperlinkGraphCache = new HashMap<String, HyperlinkGraph>();
|
||||
this.cache = new ConcurrentHashMap<String, Integer>();
|
||||
this.maxdepth = maxdepth;
|
||||
this.maxtime = maxtime;
|
||||
}
|
||||
public int getClickdepth(final DigestURL url) throws IOException {
|
||||
// first try: get the clickdepth from the cache
|
||||
Integer clickdepth = cache.get(ASCII.String(url.hash()));
|
||||
if (MemoryControl.shortStatus()) cache.clear();
|
||||
if (clickdepth != null) {
|
||||
//ConcurrentLog.info("Segment", "get clickdepth of url " + url.toNormalform(true) + ": " + clickdepth + " CACHE HIT");
|
||||
return clickdepth.intValue();
|
||||
}
|
||||
|
||||
// second try: get the clickdepth from a hyperlinGraphCache (forward clickdepth)
|
||||
HyperlinkGraph hlg = hyperlinkGraphCache.get(url.getHost());
|
||||
if (hlg == null) {
|
||||
hlg = new HyperlinkGraph();
|
||||
hlg.fill(fulltext.getDefaultConnector(), url.getHost(), null, 300000, 10000000);
|
||||
hlg.findLinkDepth();
|
||||
hyperlinkGraphCache.put(url.getHost(), hlg);
|
||||
}
|
||||
clickdepth = hlg.getDepth(url);
|
||||
if (clickdepth != null) {
|
||||
return clickdepth.intValue();
|
||||
}
|
||||
|
||||
|
||||
// third try: get the clickdepth from a reverse link graph
|
||||
clickdepth = Segment.this.getClickDepth(this.rrc, url, this.maxtime, this.maxdepth);
|
||||
//ConcurrentLog.info("Segment", "get clickdepth of url " + url.toNormalform(true) + ": " + clickdepth);
|
||||
this.cache.put(ASCII.String(url.hash()), clickdepth);
|
||||
return clickdepth.intValue();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* A ReferenceReport object is a container for all references to a specific url.
|
||||
* The class stores the number of links from domain-internal and domain-external backlinks,
|
||||
|
@ -654,7 +533,7 @@ public class Segment {
|
|||
char docType = Response.docType(document.dc_format());
|
||||
|
||||
// CREATE SOLR DOCUMENT
|
||||
final CollectionConfiguration.SolrVector vector = this.fulltext.getDefaultConfiguration().yacy2solr(collections, responseHeader, document, condenser, referrerURL, language, this.fulltext.getWebgraphConfiguration(), sourceName);
|
||||
final CollectionConfiguration.SolrVector vector = this.fulltext.getDefaultConfiguration().yacy2solr(collections, responseHeader, document, condenser, referrerURL, language, this.fulltext().useWebgraph() ? this.fulltext.getWebgraphConfiguration() : null, sourceName);
|
||||
|
||||
// ENRICH DOCUMENT WITH RANKING INFORMATION
|
||||
this.fulltext.getDefaultConfiguration().postprocessing_references(this.getReferenceReportCache(), vector, url, null);
|
||||
|
|
|
@ -82,7 +82,6 @@ import net.yacy.kelondro.rwi.ReferenceContainer;
|
|||
import net.yacy.kelondro.util.Bitfield;
|
||||
import net.yacy.kelondro.util.MemoryControl;
|
||||
import net.yacy.search.index.Segment;
|
||||
import net.yacy.search.index.Segment.ClickdepthCache;
|
||||
import net.yacy.search.index.Segment.ReferenceReport;
|
||||
import net.yacy.search.index.Segment.ReferenceReportCache;
|
||||
import net.yacy.search.query.QueryParams;
|
||||
|
@ -368,21 +367,9 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
|||
|
||||
String us = digestURL.toNormalform(true);
|
||||
|
||||
int clickdepth = 999;
|
||||
if ((allAttr || contains(CollectionSchema.clickdepth_i))) {
|
||||
if (digestURL.probablyRootURL()) {
|
||||
clickdepth = 0;
|
||||
} else {
|
||||
clickdepth = 999;
|
||||
}
|
||||
if (document.getDepth() < 2) clickdepth = Math.min(clickdepth, document.getDepth()); // thats not true if the start url was not a root URL. We need a test for that.
|
||||
if (clickdepth > 2) processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut
|
||||
CollectionSchema.clickdepth_i.add(doc, clickdepth); // no lazy value checking to get a '0' into the index
|
||||
}
|
||||
|
||||
int crawldepth = document.getDepth();
|
||||
if ((allAttr || contains(CollectionSchema.crawldepth_i))) {
|
||||
int depth = document.getDepth();
|
||||
CollectionSchema.crawldepth_i.add(doc, depth);
|
||||
CollectionSchema.crawldepth_i.add(doc, crawldepth);
|
||||
}
|
||||
|
||||
if (allAttr || (contains(CollectionSchema.cr_host_chance_d) && contains(CollectionSchema.cr_host_count_i) && contains(CollectionSchema.cr_host_norm_i))) {
|
||||
|
@ -670,7 +657,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
|||
add(doc, CollectionSchema.framesscount_i, frames.length);
|
||||
if (frames.length > 0) {
|
||||
add(doc, CollectionSchema.frames_sxt, frames);
|
||||
//webgraph.addEdges(subgraph, digestURI, responseHeader, collections, clickdepth, alllinks, images, true, framess, citations); // add here because links have been removed from remaining inbound/outbound
|
||||
//webgraph.addEdges(subgraph, digestURI, responseHeader, collections, crawldepth, alllinks, images, true, framess, citations); // add here because links have been removed from remaining inbound/outbound
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -687,7 +674,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
|||
add(doc, CollectionSchema.iframesscount_i, iframes.length);
|
||||
if (iframes.length > 0) {
|
||||
add(doc, CollectionSchema.iframes_sxt, iframes);
|
||||
//webgraph.addEdges(subgraph, digestURI, responseHeader, collections, clickdepth, alllinks, images, true, iframess, citations); // add here because links have been removed from remaining inbound/outbound
|
||||
//webgraph.addEdges(subgraph, digestURI, responseHeader, collections, crawldepth, alllinks, images, true, iframess, citations); // add here because links have been removed from remaining inbound/outbound
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -856,9 +843,9 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
|||
if (allAttr || contains(CollectionSchema.outboundlinksnofollowcount_i)) add(doc, CollectionSchema.outboundlinksnofollowcount_i, document.outboundLinkNofollowCount());
|
||||
|
||||
// create a subgraph
|
||||
if (!containsCanonical) {
|
||||
if (!containsCanonical && webgraph != null) {
|
||||
// a document with canonical tag should not get a webgraph relation, because that belongs to the canonical document
|
||||
webgraph.addEdges(subgraph, digestURL, responseHeader, collections, clickdepth, images, document.getAnchors(), sourceName);
|
||||
webgraph.addEdges(subgraph, digestURL, responseHeader, collections, crawldepth, images, document.getAnchors(), sourceName);
|
||||
}
|
||||
|
||||
// list all links
|
||||
|
@ -923,7 +910,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
|||
* @param urlCitation
|
||||
* @return
|
||||
*/
|
||||
public int postprocessing(final Segment segment, final ReferenceReportCache rrCache, final ClickdepthCache clickdepthCache, final String harvestkey) {
|
||||
public int postprocessing(final Segment segment, final ReferenceReportCache rrCache, final String harvestkey) {
|
||||
if (!this.contains(CollectionSchema.process_sxt)) return 0;
|
||||
if (!segment.connectedCitation() && !segment.fulltext().useWebgraph()) return 0;
|
||||
final SolrConnector collectionConnector = segment.fulltext().getDefaultConnector();
|
||||
|
@ -1054,7 +1041,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
|||
@Override
|
||||
public void run() {
|
||||
Thread.currentThread().setName(name);
|
||||
SolrDocument doc; String protocol, urlstub, id; DigestURL url;
|
||||
SolrDocument doc; String id;
|
||||
try {
|
||||
while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
|
||||
SolrInputDocument sid = webgraph.toSolrInputDocument(doc, omitFields);
|
||||
|
@ -1081,30 +1068,6 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
|||
}
|
||||
}
|
||||
|
||||
// set clickdepth
|
||||
if (process.contains(ProcessType.CLICKDEPTH)) {
|
||||
if (webgraph.contains(WebgraphSchema.source_clickdepth_i) && webgraph.contains(WebgraphSchema.source_protocol_s) && webgraph.contains(WebgraphSchema.source_urlstub_s) && webgraph.contains(WebgraphSchema.source_id_s)) {
|
||||
protocol = (String) doc.getFieldValue(WebgraphSchema.source_protocol_s.getSolrFieldName());
|
||||
urlstub = (String) doc.getFieldValue(WebgraphSchema.source_urlstub_s.getSolrFieldName());
|
||||
id = (String) doc.getFieldValue(WebgraphSchema.source_id_s.getSolrFieldName());
|
||||
try {
|
||||
url = new DigestURL(protocol + "://" + urlstub, ASCII.getBytes(id));
|
||||
postprocessing_clickdepth(clickdepthCache, sid, url, WebgraphSchema.source_clickdepth_i);
|
||||
} catch (MalformedURLException e) {
|
||||
}
|
||||
}
|
||||
if (webgraph.contains(WebgraphSchema.target_clickdepth_i) && webgraph.contains(WebgraphSchema.target_protocol_s) && webgraph.contains(WebgraphSchema.target_urlstub_s) && webgraph.contains(WebgraphSchema.target_id_s)) {
|
||||
protocol = (String) doc.getFieldValue(WebgraphSchema.target_protocol_s.getSolrFieldName());
|
||||
urlstub = (String) doc.getFieldValue(WebgraphSchema.target_urlstub_s.getSolrFieldName());
|
||||
id = (String) doc.getFieldValue(WebgraphSchema.target_id_s.getSolrFieldName());
|
||||
try {
|
||||
url = new DigestURL(protocol + "://" + urlstub, ASCII.getBytes(id));
|
||||
postprocessing_clickdepth(clickdepthCache, sid, url, WebgraphSchema.target_clickdepth_i);
|
||||
} catch (MalformedURLException e) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// write document back to index
|
||||
try {
|
||||
sid.removeField(WebgraphSchema.process_sxt.getSolrFieldName());
|
||||
|
@ -1148,7 +1111,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
|||
Set<String> omitFields = new HashSet<String>();
|
||||
omitFields.add(CollectionSchema.process_sxt.getSolrFieldName());
|
||||
omitFields.add(CollectionSchema.harvestkey_s.getSolrFieldName());
|
||||
int proccount = 0, proccount_clickdepthchange = 0, proccount_referencechange = 0, proccount_citationchange = 0, proccount_uniquechange = 0;
|
||||
int proccount = 0, proccount_referencechange = 0, proccount_citationchange = 0, proccount_uniquechange = 0;
|
||||
long count = collectionConnector.getCountByQuery(query);
|
||||
long start = System.currentTimeMillis();
|
||||
ConcurrentLog.info("CollectionConfiguration", "collecting " + count + " documents from the collection for harvestkey " + harvestkey);
|
||||
|
@ -1170,9 +1133,6 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
|||
|
||||
// switch over tag types
|
||||
ProcessType tagtype = ProcessType.valueOf((String) tag);
|
||||
if (tagtype == ProcessType.CLICKDEPTH && collection.contains(CollectionSchema.clickdepth_i)) {
|
||||
if (postprocessing_clickdepth(clickdepthCache, sid, url, CollectionSchema.clickdepth_i)) proccount_clickdepthchange++;
|
||||
}
|
||||
|
||||
if (tagtype == ProcessType.CITATION &&
|
||||
collection.contains(CollectionSchema.cr_host_count_i) &&
|
||||
|
@ -1228,7 +1188,6 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
|||
}
|
||||
if (count != countcheck) ConcurrentLog.warn("CollectionConfiguration", "ambiguous collection document count for harvestkey " + harvestkey + ": expected=" + count + ", counted=" + countcheck); // big gap for harvestkey = null
|
||||
ConcurrentLog.info("CollectionConfiguration", "cleanup_processing: re-calculated " + proccount+ " new documents, " +
|
||||
proccount_clickdepthchange + " clickdepth changes, " +
|
||||
proccount_referencechange + " reference-count changes, " +
|
||||
proccount_uniquechange + " unique field changes, " +
|
||||
proccount_citationchange + " citation ranking changes.");
|
||||
|
@ -1534,12 +1493,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
|||
configuration.add(doc, CollectionSchema.collection_sxt, cs);
|
||||
}
|
||||
|
||||
// clickdepth, cr and postprocessing
|
||||
// cr and postprocessing
|
||||
Set<ProcessType> processTypes = new LinkedHashSet<ProcessType>();
|
||||
if ((allAttr || configuration.contains(CollectionSchema.clickdepth_i))) {
|
||||
processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut
|
||||
CollectionSchema.clickdepth_i.add(doc, digestURL.probablyRootURL() ? 0 : 999); // no lazy value checking to get a '0' into the index
|
||||
}
|
||||
if (allAttr || (configuration.contains(CollectionSchema.cr_host_chance_d) && configuration.contains(CollectionSchema.cr_host_count_i) && configuration.contains(CollectionSchema.cr_host_norm_i))) {
|
||||
processTypes.add(ProcessType.CITATION); // postprocessing needed
|
||||
}
|
||||
|
|
|
@ -57,8 +57,7 @@ public enum CollectionSchema implements SchemaDeclaration {
|
|||
references_internal_i(SolrType.num_integer, true, true, false, false, false, "number of unique http references from same host to referenced url"),
|
||||
references_external_i(SolrType.num_integer, true, true, false, false, false, "number of unique http references from external hosts"),
|
||||
references_exthosts_i(SolrType.num_integer, true, true, false, false, false, "number of external hosts which provide http references"),
|
||||
clickdepth_i(SolrType.num_integer, true, true, false, false, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url"),
|
||||
crawldepth_i(SolrType.num_integer, true, true, false, false, false, "crawl depth of web page according to the number of steps that the crawler did to get to this document; if the crawl was started at a root document, then this is the maximum of clickdepth_i"),
|
||||
crawldepth_i(SolrType.num_integer, true, true, false, false, false, "crawl depth of web page according to the number of steps that the crawler did to get to this document; if the crawl was started at a root document, then this is equal to the clickdepth"),
|
||||
process_sxt(SolrType.string, true, true, true, false, false, "needed (post-)processing steps on this metadata set"),
|
||||
harvestkey_s(SolrType.string, true, true, false, false, false, "key from a harvest process (i.e. the crawl profile hash key) which is needed for near-realtime postprocessing. This shall be deleted as soon as postprocessing has been terminated."),
|
||||
|
||||
|
|
|
@ -112,7 +112,7 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
|
|||
|
||||
public void addEdges(
|
||||
final Subgraph subgraph,
|
||||
final DigestURL source, final ResponseHeader responseHeader, Map<String, Pattern> collections, int clickdepth_source,
|
||||
final DigestURL source, final ResponseHeader responseHeader, Map<String, Pattern> collections, int crawldepth_source,
|
||||
final List<ImageEntry> images, final Collection<AnchorURL> links,
|
||||
final String sourceName) {
|
||||
boolean allAttr = this.isEmpty();
|
||||
|
@ -120,7 +120,7 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
|
|||
int target_order = 0;
|
||||
for (final AnchorURL target_url: links) {
|
||||
SolrInputDocument edge = getEdge(
|
||||
subgraph, source, responseHeader, collections, clickdepth_source, images,
|
||||
subgraph, source, responseHeader, collections, crawldepth_source, images,
|
||||
sourceName, allAttr, generalNofollow, target_order, target_url);
|
||||
target_order++;
|
||||
// add the edge to the subgraph
|
||||
|
@ -130,7 +130,7 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
|
|||
|
||||
public SolrInputDocument getEdge(
|
||||
final Subgraph subgraph,
|
||||
final DigestURL source_url, final ResponseHeader responseHeader, Map<String, Pattern> collections, int clickdepth_source,
|
||||
final DigestURL source_url, final ResponseHeader responseHeader, Map<String, Pattern> collections, int crawldepth_source,
|
||||
final List<ImageEntry> images,
|
||||
final String sourceName, boolean allAttr, boolean generalNofollow, int target_order, AnchorURL target_url) {
|
||||
|
||||
|
@ -217,9 +217,8 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
|
|||
add(edge, WebgraphSchema.source_path_folders_count_i, paths.length);
|
||||
add(edge, WebgraphSchema.source_path_folders_sxt, paths);
|
||||
}
|
||||
if ((allAttr || contains(WebgraphSchema.source_clickdepth_i)) && this.contains(WebgraphSchema.source_protocol_s) && this.contains(WebgraphSchema.source_urlstub_s) && this.contains(WebgraphSchema.source_id_s)) {
|
||||
add(edge, WebgraphSchema.source_clickdepth_i, clickdepth_source);
|
||||
processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut
|
||||
if ((allAttr || contains(WebgraphSchema.source_crawldepth_i)) && this.contains(WebgraphSchema.source_protocol_s) && this.contains(WebgraphSchema.source_urlstub_s) && this.contains(WebgraphSchema.source_id_s)) {
|
||||
add(edge, WebgraphSchema.source_crawldepth_i, crawldepth_source);
|
||||
}
|
||||
|
||||
// parse text to find images and clear text
|
||||
|
@ -289,15 +288,8 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
|
|||
add(edge, WebgraphSchema.target_path_folders_sxt, paths);
|
||||
}
|
||||
|
||||
if ((allAttr || contains(WebgraphSchema.target_clickdepth_i)) && this.contains(WebgraphSchema.target_protocol_s) && this.contains(WebgraphSchema.target_urlstub_s) && this.contains(WebgraphSchema.target_id_s)) {
|
||||
if (target_url.probablyRootURL()) {
|
||||
boolean lc = this.lazy; this.lazy = false;
|
||||
add(edge, WebgraphSchema.target_clickdepth_i, 0);
|
||||
this.lazy = lc;
|
||||
} else {
|
||||
add(edge, WebgraphSchema.target_clickdepth_i, 999);
|
||||
processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut
|
||||
}
|
||||
if ((allAttr || contains(WebgraphSchema.target_crawldepth_i)) && this.contains(WebgraphSchema.target_protocol_s) && this.contains(WebgraphSchema.target_urlstub_s) && this.contains(WebgraphSchema.target_id_s)) {
|
||||
add(edge, WebgraphSchema.target_crawldepth_i, 999);
|
||||
}
|
||||
|
||||
if (allAttr || contains(WebgraphSchema.process_sxt)) {
|
||||
|
|
|
@ -35,7 +35,7 @@ public enum WebgraphSchema implements SchemaDeclaration {
|
|||
last_modified(SolrType.date, true, true, false, false, false, "last-modified from http header"),
|
||||
load_date_dt(SolrType.date, true, true, false, false, false, "time when resource was loaded"),
|
||||
collection_sxt(SolrType.string, true, true, true, false, false, "tags that are attached to crawls/index generation to separate the search result into user-defined subsets"),
|
||||
process_sxt(SolrType.string, true, true, true, false, false, "needed (post-)processing steps on this metadata set, used i.e. for clickdepth-computation."),
|
||||
process_sxt(SolrType.string, true, true, true, false, false, "needed (post-)processing steps on this metadata set."),
|
||||
harvestkey_s(SolrType.string, true, true, false, false, false, "key from a harvest process (i.e. the crawl profile hash key) which is needed for near-realtime postprocessing. This shall be deleted as soon as postprocessing has been terminated."),
|
||||
|
||||
// source information
|
||||
|
@ -51,7 +51,7 @@ public enum WebgraphSchema implements SchemaDeclaration {
|
|||
source_parameter_count_i(SolrType.num_integer, true, true, false, false, false, "number of key-value pairs in search part of the url (source)"),
|
||||
source_parameter_key_sxt(SolrType.string, true, true, true, false, false, "the keys from key-value pairs in the search part of the url (source)"),
|
||||
source_parameter_value_sxt(SolrType.string, true, true, true, false, false, "the values from key-value pairs in the search part of the url (source)"),
|
||||
source_clickdepth_i(SolrType.num_integer, true, true, false, false, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (source)"),
|
||||
source_crawldepth_i(SolrType.num_integer, true, true, false, false, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (source)"),
|
||||
source_cr_host_norm_i(SolrType.num_integer, true, true, false, false, false, "copy of the citation rank norm value from the source link"),
|
||||
|
||||
source_host_s(SolrType.string, true, true, false, false, false, "host of the url (source)"),
|
||||
|
@ -86,7 +86,7 @@ public enum WebgraphSchema implements SchemaDeclaration {
|
|||
target_parameter_count_i(SolrType.num_integer, true, true, false, false, false, "number of key-value pairs in search part of the url (target)"),
|
||||
target_parameter_key_sxt(SolrType.string, true, true, true, false, false, "the keys from key-value pairs in the search part of the url (target)"),
|
||||
target_parameter_value_sxt(SolrType.string, true, true, true, false, true, "the values from key-value pairs in the search part of the url (target)"),
|
||||
target_clickdepth_i(SolrType.num_integer, true, true, false, false, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (target)"),
|
||||
target_crawldepth_i(SolrType.num_integer, true, true, false, false, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (target)"),
|
||||
target_cr_host_norm_i(SolrType.num_integer, true, true, false, false, false, "copy of the citation rank norm value from the target link; this is only filled if the target host is identical to the source host"),
|
||||
|
||||
target_host_s(SolrType.string, true, true, false, false, true, "host of the url (target)"),
|
||||
|
|
Loading…
Reference in New Issue
Block a user