From 570511f3c819dcd30bc28be8a4d05bd298f55f4b Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Thu, 13 Jun 2013 13:01:28 +0200 Subject: [PATCH] removed fields references_internal_id_sxt and references_internal_url_sxt because they have been shown to be superfluous. The citation of referrers in the host browser is possible without them. Therefore the host browser now shows not only internal but also external referrers for each link. --- defaults/solr.collection.schema | 6 -- htroot/HostBrowser.java | 75 +++++++++++++------ .../federate/solr/SchemaConfiguration.java | 25 +------ source/net/yacy/search/index/Segment.java | 2 +- .../schema/CollectionConfiguration.java | 2 +- .../yacy/search/schema/CollectionSchema.java | 2 - 6 files changed, 55 insertions(+), 57 deletions(-) diff --git a/defaults/solr.collection.schema b/defaults/solr.collection.schema index 41fd633bc..cb69a70ac 100644 --- a/defaults/solr.collection.schema +++ b/defaults/solr.collection.schema @@ -72,12 +72,6 @@ references_i ## number of unique http references from same host to referenced url references_internal_i -## ids of unique http references from same host to referenced url -#references_internal_id_sxt - -## urls of unique http references from same host to referenced url -#references_internal_url_sxt - ## number of unique http references from external hosts references_external_i diff --git a/htroot/HostBrowser.java b/htroot/HostBrowser.java index aef6e5523..3e26b6ca7 100644 --- a/htroot/HostBrowser.java +++ b/htroot/HostBrowser.java @@ -21,7 +21,6 @@ import java.io.IOException; import java.net.MalformedURLException; import java.util.ArrayList; -import java.util.Collection; import java.util.Date; import java.util.HashMap; import java.util.HashSet; @@ -42,6 +41,7 @@ import net.yacy.cora.federate.solr.connector.AbstractSolrConnector; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.sorting.ClusteredScoreMap; import net.yacy.cora.sorting.ReversibleScoreMap; +import 
net.yacy.cora.storage.HandleSet; import net.yacy.crawler.HarvestProcess; import net.yacy.crawler.data.NoticedURL.StackType; import net.yacy.crawler.retrieval.Request; @@ -51,6 +51,8 @@ import net.yacy.kelondro.logging.Log; import net.yacy.peers.graphics.WebStructureGraph.StructureEntry; import net.yacy.search.Switchboard; import net.yacy.search.index.Fulltext; +import net.yacy.search.index.Segment.ReferenceReport; +import net.yacy.search.index.Segment.ReferenceReportCache; import net.yacy.search.schema.CollectionSchema; import net.yacy.server.serverObjects; import net.yacy.server.serverSwitch; @@ -274,8 +276,6 @@ public class HostBrowser { CollectionSchema.clickdepth_i.getSolrFieldName(), CollectionSchema.references_i.getSolrFieldName(), CollectionSchema.references_internal_i.getSolrFieldName(), - CollectionSchema.references_internal_id_sxt.getSolrFieldName(), - CollectionSchema.references_internal_url_sxt.getSolrFieldName(), CollectionSchema.references_external_i.getSolrFieldName(), CollectionSchema.references_exthosts_i.getSolrFieldName(), CollectionSchema.cr_host_chance_d.getSolrFieldName(), @@ -289,13 +289,15 @@ public class HostBrowser { Map infoCache = new HashMap(); int hostsize = 0; final List deleteIDs = new ArrayList(); - long timeout = System.currentTimeMillis() + TIMEOUT; + long timeoutList = System.currentTimeMillis() + TIMEOUT; + long timeoutReferences = System.currentTimeMillis() + 3000; + ReferenceReportCache rrCache = sb.index.getReferenceReportCache(); while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) { String u = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()); String errortype = (String) doc.getFieldValue(CollectionSchema.failtype_s.getSolrFieldName()); FailType error = errortype == null ? 
null : FailType.valueOf(errortype); String ids = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName()); - infoCache.put(ids, new InfoCacheEntry(sb.index.fulltext(), doc)); + infoCache.put(ids, new InfoCacheEntry(sb.index.fulltext(), rrCache, doc, ids, System.currentTimeMillis() < timeoutReferences)); if (u.startsWith(path)) { if (delete) { deleteIDs.add(ids); @@ -315,7 +317,7 @@ public class HostBrowser { if ((complete || u.startsWith(path)) && !storedDocs.contains(u)) inboundLinks.add(u); } - // collect outboundlinks to browse to the outbound + // collect referrer links links = URIMetadataNode.getLinks(doc, false); while (links.hasNext()) { u = links.next(); @@ -332,7 +334,7 @@ public class HostBrowser { } catch (MalformedURLException e) {} } } - if (System.currentTimeMillis() > timeout) break; + if (System.currentTimeMillis() > timeoutList) break; } if (deleteIDs.size() > 0) sb.remove(deleteIDs); @@ -511,17 +513,13 @@ public class HostBrowser { public Integer cr_n; public Double cr_c; public int clickdepth, references, references_internal, references_external, references_exthosts; - public List references_internal_urls; - private final Fulltext fulltext; - public InfoCacheEntry(final Fulltext fulltext, final SolrDocument doc) { - this.fulltext = fulltext; + public List references_internal_urls, references_external_urls; + public InfoCacheEntry(final Fulltext fulltext, final ReferenceReportCache rrCache, final SolrDocument doc, final String urlhash, boolean fetchReferences) { this.cr_c = (Double) doc.getFieldValue(CollectionSchema.cr_host_chance_d.getSolrFieldName()); this.cr_n = (Integer) doc.getFieldValue(CollectionSchema.cr_host_norm_i.getSolrFieldName()); Integer cd = (Integer) doc.getFieldValue(CollectionSchema.clickdepth_i.getSolrFieldName()); Integer rc = (Integer) doc.getFieldValue(CollectionSchema.references_i.getSolrFieldName()); Integer rc_internal = (Integer) doc.getFieldValue(CollectionSchema.references_internal_i.getSolrFieldName()); - 
Collection rc_internal_id = doc.getFieldValues(CollectionSchema.references_internal_id_sxt.getSolrFieldName()); - Collection rc_internal_url = doc.getFieldValues(CollectionSchema.references_internal_url_sxt.getSolrFieldName()); Integer rc_external = (Integer) doc.getFieldValue(CollectionSchema.references_external_i.getSolrFieldName()); Integer rc_exthosts = (Integer) doc.getFieldValue(CollectionSchema.references_exthosts_i.getSolrFieldName()); this.clickdepth = (cd == null || cd.intValue() < 0) ? 999 : cd.intValue(); @@ -529,21 +527,52 @@ public class HostBrowser { this.references_internal = (rc_internal == null || rc_internal.intValue() <= 0) ? 0 : rc_internal.intValue(); // calculate the url reference list this.references_internal_urls = new ArrayList(); - if (rc_internal_url != null) { - for (Object o: rc_internal_url) references_internal_urls.add((String) o); - } else if (rc_internal_id != null) { - for (Object o: rc_internal_id) { - DigestURI u = fulltext.getURL(ASCII.getBytes((String) o)); - if (u != null) references_internal_urls.add(u.toNormalform(true)); + this.references_external_urls = new ArrayList(); + if (fetchReferences) { + // get the references from the citation index + try { + ReferenceReport rr = rrCache.getReferenceReport(ASCII.getBytes(urlhash), false); + List internalIDs = new ArrayList(); + List externalIDs = new ArrayList(); + HandleSet iids = rr.getInternallIDs(); + for (byte[] b: iids) internalIDs.add(ASCII.String(b)); + HandleSet eids = rr.getExternalIDs(); + for (byte[] b: eids) externalIDs.add(ASCII.String(b)); + // get all urls from the index and store them here + for (String id: internalIDs) { + if (id.equals(urlhash)) continue; // no self-references + DigestURI u = fulltext.getURL(ASCII.getBytes(id)); + if (u != null) references_internal_urls.add(u.toNormalform(true)); + } + for (String id: externalIDs) { + if (id.equals(urlhash)) continue; // no self-references + DigestURI u = fulltext.getURL(ASCII.getBytes(id)); + if (u != null) 
references_external_urls.add(u.toNormalform(true)); + } + } catch (IOException e) { } + } this.references_external = (rc_external == null || rc_external.intValue() <= 0) ? 0 : rc_external.intValue(); this.references_exthosts = (rc_exthosts == null || rc_exthosts.intValue() <= 0) ? 0 : rc_exthosts.intValue(); } public String toString() { - StringBuilder sb = new StringBuilder(); - for (String s: references_internal_urls) sb.append("info"); - if (sb.length() == 0 && !fulltext.getDefaultConfiguration().contains(CollectionSchema.references_internal_id_sxt)) sb.append("info"); + StringBuilder sbi = new StringBuilder(); + int c = 0; + for (String s: references_internal_urls) { + sbi.append("info"); + c++; + if (c % 80 == 0) sbi.append("
"); + } + if (sbi.length() > 0) sbi.insert(0, "
internal referrer:
"); + StringBuilder sbe = new StringBuilder(); + c = 0; + for (String s: references_external_urls) { + sbe.append("info"); + c++; + if (c % 80 == 0) sbe.append("
"); + } + if (sbe.length() > 0) sbe.insert(0, "
external referrer:
"); return (this.clickdepth >= 0 ? "clickdepth: " + this.clickdepth : @@ -551,7 +580,7 @@ public class HostBrowser { (this.cr_c != null ? ", cr=" + (Math.round(this.cr_c * 1000.0d) / 1000.0d) : "") + (this.cr_n != null ? ", crn=" + this.cr_n : "") + (this.references >= 0 ? - ", refs: " + this.references_exthosts + " hosts, " + this.references_external + " ext, " + this.references_internal + " int" + (sb.length() > 0 ? " " + sb.toString() + "" : "") : + ", refs: " + this.references_exthosts + " hosts, " + this.references_external + " ext, " + this.references_internal + " int" + sbi.toString() + sbe.toString() : ""); } } diff --git a/source/net/yacy/cora/federate/solr/SchemaConfiguration.java b/source/net/yacy/cora/federate/solr/SchemaConfiguration.java index 399562c5d..27c4211d8 100644 --- a/source/net/yacy/cora/federate/solr/SchemaConfiguration.java +++ b/source/net/yacy/cora/federate/solr/SchemaConfiguration.java @@ -24,7 +24,6 @@ import java.io.File; import java.io.IOException; import java.io.Serializable; import java.util.ArrayList; -import java.util.Collection; import java.util.Date; import java.util.Iterator; import java.util.List; @@ -38,7 +37,6 @@ import net.yacy.cora.document.ASCII; import net.yacy.cora.storage.Configuration; import net.yacy.cora.storage.HandleSet; import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.search.index.Fulltext; import net.yacy.search.index.Segment; import net.yacy.search.index.Segment.ReferenceReport; import net.yacy.search.index.Segment.ReferenceReportCache; @@ -94,15 +92,12 @@ public class SchemaConfiguration extends Configuration implements Serializable { return false; } - public boolean postprocessing_references(Fulltext fulltext, ReferenceReportCache rrCache, SolrDocument doc, SolrInputDocument sid, DigestURI url, Map hostExtentCount) { + public boolean postprocessing_references(ReferenceReportCache rrCache, SolrDocument doc, SolrInputDocument sid, DigestURI url, Map hostExtentCount) { if 
(!(this.contains(CollectionSchema.references_i) || this.contains(CollectionSchema.references_internal_i) || - this.contains(CollectionSchema.references_internal_id_sxt) || this.contains(CollectionSchema.references_internal_url_sxt) || this.contains(CollectionSchema.references_external_i) || this.contains(CollectionSchema.references_exthosts_i))) return false; Integer all_old = doc == null ? null : (Integer) doc.getFieldValue(CollectionSchema.references_i.getSolrFieldName()); Integer internal_old = doc == null ? null : (Integer) doc.getFieldValue(CollectionSchema.references_internal_i.getSolrFieldName()); - Collection internal_ids_old = doc == null ? null : doc.getFieldValues(CollectionSchema.references_internal_id_sxt.getSolrFieldName()); - Collection internal_urls_old = doc == null ? null : doc.getFieldValues(CollectionSchema.references_internal_url_sxt.getSolrFieldName()); Integer external_old = doc == null ? null : (Integer) doc.getFieldValue(CollectionSchema.references_external_i.getSolrFieldName()); Integer exthosts_old = doc == null ? null : (Integer) doc.getFieldValue(CollectionSchema.references_exthosts_i.getSolrFieldName()); Integer hostextc_old = doc == null ? 
null : (Integer) doc.getFieldValue(CollectionSchema.host_extent_i.getSolrFieldName()); @@ -111,14 +106,6 @@ public class SchemaConfiguration extends Configuration implements Serializable { List internalIDs = new ArrayList(); HandleSet iids = rr.getInternallIDs(); for (byte[] b: iids) internalIDs.add(ASCII.String(b)); - List internalURLs = new ArrayList(); - if (this.contains(CollectionSchema.references_internal_url_sxt)) { - // get all urls from the index and store them here - for (String id: internalIDs) { - DigestURI u = fulltext.getURL(ASCII.getBytes(id)); - if (u != null) internalURLs.add(u.toNormalform(true)); - } - } boolean change = false; int all = rr.getExternalCount() + rr.getInternalCount(); @@ -132,16 +119,6 @@ public class SchemaConfiguration extends Configuration implements Serializable { sid.setField(CollectionSchema.references_internal_i.getSolrFieldName(), rr.getInternalCount()); change = true; } - if (this.contains(CollectionSchema.references_internal_id_sxt) && - (internal_ids_old == null || internal_ids_old.size() != internalIDs.size())) { - sid.setField(CollectionSchema.references_internal_id_sxt.getSolrFieldName(), internalIDs); - change = true; - } - if (this.contains(CollectionSchema.references_internal_url_sxt) && - (internal_urls_old == null || internal_urls_old.size() != internalURLs.size())) { - sid.setField(CollectionSchema.references_internal_url_sxt.getSolrFieldName(), internalURLs); - change = true; - } if (this.contains(CollectionSchema.references_external_i) && (external_old == null || external_old.intValue() != rr.getExternalCount())) { sid.setField(CollectionSchema.references_external_i.getSolrFieldName(), rr.getExternalCount()); diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index 8abe8b42c..f6e122e3e 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -705,7 +705,7 @@ public class Segment { // ENRICH DOCUMENT WITH RANKING 
INFORMATION if (this.connectedCitation()) { - this.fulltext.getDefaultConfiguration().postprocessing_references(this.fulltext, this.getReferenceReportCache(), null, vector, url, null); + this.fulltext.getDefaultConfiguration().postprocessing_references(this.getReferenceReportCache(), null, vector, url, null); } // STORE TO SOLR String error = null; diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index 418657f46..5ae5d4645 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -897,7 +897,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri long count = segment.fulltext().getDefaultConnector().getCountByQuery(q.toString()); hostExtentCache.put(hosthash, count); } - if (postprocessing_references(segment.fulltext(), rrCache, doc, sid, url, hostExtentCache)) proccount_referencechange++; + if (postprocessing_references(rrCache, doc, sid, url, hostExtentCache)) proccount_referencechange++; // all processing steps checked, remove the processing tag sid.removeField(CollectionSchema.process_sxt.getSolrFieldName()); diff --git a/source/net/yacy/search/schema/CollectionSchema.java b/source/net/yacy/search/schema/CollectionSchema.java index 5681915c9..07ba7877e 100644 --- a/source/net/yacy/search/schema/CollectionSchema.java +++ b/source/net/yacy/search/schema/CollectionSchema.java @@ -52,8 +52,6 @@ public enum CollectionSchema implements SchemaDeclaration { httpstatus_redirect_s(SolrType.num_integer, true, true, false, false, false, "html status return code (i.e. 
\"200\" for ok), -1 if not loaded"), references_i(SolrType.num_integer, true, true, false, false, false, "number of unique http references, should be equal to references_internal_i + references_external_i"), references_internal_i(SolrType.num_integer, true, true, false, false, false, "number of unique http references from same host to referenced url"), - references_internal_id_sxt(SolrType.string, true, true, true, false, true, "ids of unique http references from same host to referenced url"), - references_internal_url_sxt(SolrType.string, true, true, true, false, true, "urls of unique http references from same host to referenced url"), references_external_i(SolrType.num_integer, true, true, false, false, false, "number of unique http references from external hosts"), references_exthosts_i(SolrType.num_integer, true, true, false, false, false, "number of external hosts which provide http references"), clickdepth_i(SolrType.num_integer, true, true, false, false, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url"),