diff --git a/defaults/solr.collection.schema b/defaults/solr.collection.schema
index e6431c3a0..07d2b229a 100644
--- a/defaults/solr.collection.schema
+++ b/defaults/solr.collection.schema
@@ -63,9 +63,18 @@ httpstatus_i
 ## redirect url if the error code is 299 < httpstatus_i < 310
 #httpstatus_redirect_s
 
-## number of unique http references; used for ranking
+## number of unique http references, should be equal to references_internal_i + references_external_i
 references_i
 
+## number of unique http references from same host as referenced url
+references_internal_i
+
+## number of unique http references from external hosts
+references_external_i
+
+## number of external hosts which provide http references
+references_exthosts_i
+
 ## depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url
 clickdepth_i
 
diff --git a/source/net/yacy/cora/federate/solr/SchemaConfiguration.java b/source/net/yacy/cora/federate/solr/SchemaConfiguration.java
index 4900086d1..aadc2bd72 100644
--- a/source/net/yacy/cora/federate/solr/SchemaConfiguration.java
+++ b/source/net/yacy/cora/federate/solr/SchemaConfiguration.java
@@ -31,9 +31,17 @@ import org.apache.log4j.Logger;
 import org.apache.solr.common.SolrDocument;
 import org.apache.solr.common.SolrInputDocument;
+import net.yacy.cora.order.Base64Order;
 import net.yacy.cora.storage.Configuration;
+import net.yacy.cora.storage.HandleSet;
+import net.yacy.cora.util.SpaceExceededException;
+import net.yacy.kelondro.data.citation.CitationReference;
 import net.yacy.kelondro.data.meta.DigestURI;
+import net.yacy.kelondro.index.RowHandleSet;
+import net.yacy.kelondro.rwi.ReferenceContainer;
+import net.yacy.kelondro.util.ByteBuffer;
 import net.yacy.search.index.Segment;
+import net.yacy.search.schema.CollectionSchema;
 
 public class SchemaConfiguration extends Configuration implements Serializable {
 
@@ -85,13 +93,65 @@ public class SchemaConfiguration extends Configuration implements Serializable {
         return false;
     }
 
-    public boolean postprocessing_references(Segment segment, SolrDocument doc, SolrInputDocument sid, DigestURI url, SchemaDeclaration referencesfield) {
-        if (!this.contains(referencesfield)) return false;
-        Integer oldreferences = (Integer) doc.getFieldValue(referencesfield.getSolrFieldName());
-        int references = segment.urlCitation().count(url.hash());
-        if (references > 0 && (oldreferences == null || oldreferences.intValue() != references)) {
-            sid.setField(referencesfield.getSolrFieldName(), references);
-            return true;
+    /**
+     * Refresh the reference counters of a document from the url citation index:
+     * all references, references from the same host, references from other hosts
+     * and the number of distinct other hosts which provide references.
+     * A counter is written only if its field is part of this configuration and
+     * if its value differs from the one stored in doc.
+     * @param segment the segment which provides the citation index; segment.urlCitation() must not return null
+     * @param doc the stored document which holds the old counter values, may be null
+     * @param sid the input document which receives the changed counter values
+     * @param url the url of the document whose references are counted
+     * @return true if at least one counter was set in sid, false otherwise
+     */
+    public boolean postprocessing_references(Segment segment, SolrDocument doc, SolrInputDocument sid, DigestURI url) {
+        if (!(this.contains(CollectionSchema.references_i) || this.contains(CollectionSchema.references_internal_i) ||
+              this.contains(CollectionSchema.references_external_i) || this.contains(CollectionSchema.references_exthosts_i))) return false;
+        Integer all_old = doc == null ? null : (Integer) doc.getFieldValue(CollectionSchema.references_i.getSolrFieldName());
+        Integer internal_old = doc == null ? null : (Integer) doc.getFieldValue(CollectionSchema.references_internal_i.getSolrFieldName());
+        Integer external_old = doc == null ? null : (Integer) doc.getFieldValue(CollectionSchema.references_external_i.getSolrFieldName());
+        Integer exthosts_old = doc == null ? null : (Integer) doc.getFieldValue(CollectionSchema.references_exthosts_i.getSolrFieldName());
+        try {
+            byte[] uh0 = url.hash();
+            ReferenceContainer<CitationReference> references = segment.urlCitation().get(uh0, null);
+            if (references == null) return false; // no references at all
+            int all = 0, internal = 0, external = 0;
+            HandleSet exthosts = new RowHandleSet(6, Base64Order.enhancedCoder, 0);
+            Iterator<CitationReference> ri = references.entries();
+            while (ri.hasNext()) {
+                byte[] hh = ri.next().hosthash();
+                all++;
+                if (ByteBuffer.equals(hh, 0, uh0, 6, 6)) {
+                    internal++;
+                } else {
+                    external++;
+                    exthosts.put(hh); // collect foreign hosts only; the document's own host is not an external host
+                }
+            }
+
+            boolean change = false;
+            if (this.contains(CollectionSchema.references_i) && (all_old == null || all_old.intValue() != all)) {
+                sid.setField(CollectionSchema.references_i.getSolrFieldName(), all);
+                change = true;
+            }
+            if (this.contains(CollectionSchema.references_internal_i) && (internal_old == null || internal_old.intValue() != internal)) {
+                sid.setField(CollectionSchema.references_internal_i.getSolrFieldName(), internal);
+                change = true;
+            }
+            if (this.contains(CollectionSchema.references_external_i) && (external_old == null || external_old.intValue() != external)) {
+                sid.setField(CollectionSchema.references_external_i.getSolrFieldName(), external);
+                change = true;
+            }
+            if (this.contains(CollectionSchema.references_exthosts_i) && (exthosts_old == null || exthosts_old.intValue() != exthosts.size())) {
+                sid.setField(CollectionSchema.references_exthosts_i.getSolrFieldName(), exthosts.size());
+                change = true;
+            }
+            return change;
+        } catch (IOException e) {
+            // best effort: no counter has been written to sid at this point, report 'no change'
+        } catch (SpaceExceededException e) {
+            // best effort: no counter has been written to sid at this point, report 'no change'
         }
         return false;
     }
diff --git a/source/net/yacy/kelondro/data/citation/CitationReference.java b/source/net/yacy/kelondro/data/citation/CitationReference.java
index daa39d313..01cad40dc 100644
--- a/source/net/yacy/kelondro/data/citation/CitationReference.java
+++ b/source/net/yacy/kelondro/data/citation/CitationReference.java
@@ -96,6 +96,17 @@ public class CitationReference implements Reference, Serializable {
     public byte[] urlhash() {
         return this.entry.getColBytes(col_urlhash, true);
     }
+
+    /**
+     * @return a copy of the six bytes of the url hash starting at position 6,
+     *         which are used as the hash of the host
+     */
+    public byte[] hosthash() {
+        byte[] uh = this.entry.getColBytes(col_urlhash, true);
+        byte[] hh = new byte[6];
+        System.arraycopy(uh, 6, hh, 0, 6);
+        return hh;
+    }
 
     public int virtualAge() {
         return (int) this.entry.getColLong(col_lastModified); // this is the time in MicoDateDays format
diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java
index 862c75ed1..4a615633d 100644
--- a/source/net/yacy/search/index/Segment.java
+++ b/source/net/yacy/search/index/Segment.java
@@ -543,10 +543,10 @@ public class Segment {
         }
 
         // ENRICH DOCUMENT WITH RANKING INFORMATION
-        if (this.urlCitationIndex != null && this.fulltext.getDefaultConfiguration().contains(CollectionSchema.references_i)) {
-            int references = this.urlCitationIndex.count(url.hash());
-            vector.setField(CollectionSchema.references_i.getSolrFieldName(), references);
-        }
+        // reference counters can only be computed while a citation index is present
+        if (this.urlCitationIndex != null) {
+            this.fulltext.getDefaultConfiguration().postprocessing_references(this, null, vector, url);
+        }
 
         // STORE TO SOLR
         String error = null;
diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java
index f3181a884..1f82e29e0 100644
--- a/source/net/yacy/search/schema/CollectionConfiguration.java
+++ b/source/net/yacy/search/schema/CollectionConfiguration.java
@@ -796,7 +796,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
                 }
 
                 // refresh the link count; it's 'cheap' to do this here
-                if (postprocessing_references(segment, doc, sid, url, CollectionSchema.references_i)) proccount_referencechange++;
+                if (postprocessing_references(segment, doc, sid, url)) proccount_referencechange++;
 
                 // all processing steps checked, remove the processing tag
                 sid.removeField(CollectionSchema.process_sxt.getSolrFieldName());
diff --git a/source/net/yacy/search/schema/CollectionSchema.java b/source/net/yacy/search/schema/CollectionSchema.java
index 03adcacdc..13d79db9f 100644
--- a/source/net/yacy/search/schema/CollectionSchema.java
+++ b/source/net/yacy/search/schema/CollectionSchema.java
@@ -49,6 +49,9 @@ public enum CollectionSchema implements SchemaDeclaration {
     failtype_s(SolrType.string, true, true, false, false, false, "fail type if a page was not loaded. This field is either empty, 'excl' or 'fail'"),
     httpstatus_i(SolrType.num_integer, true, true, false, false, false, "html status return code (i.e. \"200\" for ok), -1 if not loaded"),
     httpstatus_redirect_s(SolrType.num_integer, true, true, false, false, false, "html status return code (i.e. \"200\" for ok), -1 if not loaded"),
-    references_i(SolrType.num_integer, true, true, false, false, false, "number of unique http references; used for ranking"),
+    references_i(SolrType.num_integer, true, true, false, false, false, "number of unique http references, should be equal to references_internal_i + references_external_i"),
+    references_internal_i(SolrType.num_integer, true, true, false, false, false, "number of unique http references from same host as referenced url"),
+    references_external_i(SolrType.num_integer, true, true, false, false, false, "number of unique http references from external hosts"),
+    references_exthosts_i(SolrType.num_integer, true, true, false, false, false, "number of external hosts which provide http references"),
     clickdepth_i(SolrType.num_integer, true, true, false, false, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url"),
     process_sxt(SolrType.string, true, true, true, false, false, "needed (post-)processing steps on this metadata set"),