added three new fields for a better ranking: references_internal_i,

references_external_i and references_exthosts_i. These can be used to
count and evaluate the number of external links to every web page. An
experimental ranking function can be, e.g.:
div(add(references_internal_i,product(references_external_i,references_exthosts_i)),add(clickdepth_i,1))
This commit is contained in:
Michael Peter Christen 2013-04-12 16:17:14 +02:00
parent 082e3274d6
commit ada3f27de7
6 changed files with 75 additions and 14 deletions

View File

@ -63,9 +63,18 @@ httpstatus_i
## redirect url if the error code is 299 < httpstatus_i < 310
#httpstatus_redirect_s
## number of unique http references; used for ranking
## number of unique http references, should be equal to references_internal_i + references_external_i
references_i
## number of unique http references from same host as referenced url
references_internal_i
## number of unique http references from external hosts
references_external_i
## number of external hosts which provide http references
references_exthosts_i
## depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url
clickdepth_i

View File

@ -31,9 +31,17 @@ import org.apache.log4j.Logger;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrInputDocument;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.storage.Configuration;
import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.kelondro.data.citation.CitationReference;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.index.RowHandleSet;
import net.yacy.kelondro.rwi.ReferenceContainer;
import net.yacy.kelondro.util.ByteBuffer;
import net.yacy.search.index.Segment;
import net.yacy.search.schema.CollectionSchema;
public class SchemaConfiguration extends Configuration implements Serializable {
@ -85,13 +93,50 @@ public class SchemaConfiguration extends Configuration implements Serializable {
return false;
}
public boolean postprocessing_references(Segment segment, SolrDocument doc, SolrInputDocument sid, DigestURI url, SchemaDeclaration referencesfield) {
if (!this.contains(referencesfield)) return false;
Integer oldreferences = (Integer) doc.getFieldValue(referencesfield.getSolrFieldName());
int references = segment.urlCitation().count(url.hash());
if (references > 0 && (oldreferences == null || oldreferences.intValue() != references)) {
sid.setField(referencesfield.getSolrFieldName(), references);
return true;
/**
 * Recomputes the citation counters of a document from the url citation index and
 * writes every counter that differs from the stored value into the Solr input document.
 *
 * The counters are: all references (references_i), references whose host hash equals
 * the host hash of the given url (references_internal_i), references from other hosts
 * (references_external_i) and the number of distinct other hosts (references_exthosts_i).
 * A counter is only written if its field is part of this configuration.
 *
 * @param segment the segment which provides the url citation index
 * @param doc the stored document holding the old counter values; may be null, in which
 *            case every enabled counter is treated as changed
 * @param sid the input document which receives the changed counter values
 * @param url the url of the document whose references are counted
 * @return true if at least one field was set in sid; false if none of the reference
 *         fields is configured, if no citation data is available for the url, if the
 *         citation index could not be evaluated, or if nothing changed
 */
public boolean postprocessing_references(Segment segment, SolrDocument doc, SolrInputDocument sid, DigestURI url) {
    final boolean useAll = this.contains(CollectionSchema.references_i);
    final boolean useInternal = this.contains(CollectionSchema.references_internal_i);
    final boolean useExternal = this.contains(CollectionSchema.references_external_i);
    final boolean useExthosts = this.contains(CollectionSchema.references_exthosts_i);
    if (!(useAll || useInternal || useExternal || useExthosts)) return false;

    // callers used to check the citation index for null before counting; keep that safety here
    if (segment.urlCitation() == null) return false;

    Integer all_old = doc == null ? null : (Integer) doc.getFieldValue(CollectionSchema.references_i.getSolrFieldName());
    Integer internal_old = doc == null ? null : (Integer) doc.getFieldValue(CollectionSchema.references_internal_i.getSolrFieldName());
    Integer external_old = doc == null ? null : (Integer) doc.getFieldValue(CollectionSchema.references_external_i.getSolrFieldName());
    Integer exthosts_old = doc == null ? null : (Integer) doc.getFieldValue(CollectionSchema.references_exthosts_i.getSolrFieldName());
    try {
        final byte[] uh0 = url.hash();
        ReferenceContainer<CitationReference> references = segment.urlCitation().get(uh0, null);
        if (references == null) return false; // no references at all

        int all = 0, internal = 0, external = 0;
        HandleSet exthosts = new RowHandleSet(6, Base64Order.enhancedCoder, 0);
        Iterator<CitationReference> ri = references.entries();
        while (ri.hasNext()) {
            CitationReference ref = ri.next();
            byte[] hh = ref.hosthash();
            all++;
            // compare the 6-byte host hash of the reference with the host part of the url hash
            if (ByteBuffer.equals(hh, 0, uh0, 6, 6)) {
                internal++;
            } else {
                external++;
                // only hosts different from the document's own host are external hosts
                exthosts.put(hh);
            }
        }

        boolean change = false;
        if (useAll && (all_old == null || all_old.intValue() != all)) {
            sid.setField(CollectionSchema.references_i.getSolrFieldName(), all);
            change = true;
        }
        if (useInternal && (internal_old == null || internal_old.intValue() != internal)) {
            sid.setField(CollectionSchema.references_internal_i.getSolrFieldName(), internal);
            change = true;
        }
        if (useExternal && (external_old == null || external_old.intValue() != external)) {
            sid.setField(CollectionSchema.references_external_i.getSolrFieldName(), external);
            change = true;
        }
        if (useExthosts && (exthosts_old == null || exthosts_old.intValue() != exthosts.size())) {
            sid.setField(CollectionSchema.references_exthosts_i.getSolrFieldName(), exthosts.size());
            change = true;
        }
        return change;
    } catch (IOException e) {
        // best effort: the citation index could not be read, nothing has been written to sid yet
    } catch (SpaceExceededException e) {
        // best effort: the host set could not be extended, nothing has been written to sid yet
    }
    return false;
}

View File

@ -96,6 +96,13 @@ public class CitationReference implements Reference, Serializable {
/**
 * Reads the url hash of this citation reference.
 *
 * @return the bytes stored in the url-hash column of the underlying row entry
 */
public byte[] urlhash() {
    final byte[] hash = this.entry.getColBytes(col_urlhash, true);
    return hash;
}
/**
 * Extracts the host part of the url hash of this citation reference.
 *
 * @return a new 6-byte array holding bytes 6 to 11 of the url-hash column value
 */
public byte[] hosthash() {
    final byte[] hostPart = new byte[6];
    // the host hash is taken from offset 6 of the url hash
    System.arraycopy(this.entry.getColBytes(col_urlhash, true), 6, hostPart, 0, hostPart.length);
    return hostPart;
}
public int virtualAge() {
return (int) this.entry.getColLong(col_lastModified); // this is the time in MicoDateDays format

View File

@ -543,10 +543,7 @@ public class Segment {
}
// ENRICH DOCUMENT WITH RANKING INFORMATION
if (this.urlCitationIndex != null && this.fulltext.getDefaultConfiguration().contains(CollectionSchema.references_i)) {
int references = this.urlCitationIndex.count(url.hash());
vector.setField(CollectionSchema.references_i.getSolrFieldName(), references);
}
this.fulltext.getDefaultConfiguration().postprocessing_references(this, null, vector, url);
// STORE TO SOLR
String error = null;

View File

@ -796,7 +796,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
}
// refresh the link count; it's 'cheap' to do this here
if (postprocessing_references(segment, doc, sid, url, CollectionSchema.references_i)) proccount_referencechange++;
if (postprocessing_references(segment, doc, sid, url)) proccount_referencechange++;
// all processing steps checked, remove the processing tag
sid.removeField(CollectionSchema.process_sxt.getSolrFieldName());

View File

@ -49,7 +49,10 @@ public enum CollectionSchema implements SchemaDeclaration {
failtype_s(SolrType.string, true, true, false, false, false, "fail type if a page was not loaded. This field is either empty, 'excl' or 'fail'"),
httpstatus_i(SolrType.num_integer, true, true, false, false, false, "html status return code (i.e. \"200\" for ok), -1 if not loaded"),
httpstatus_redirect_s(SolrType.num_integer, true, true, false, false, false, "html status return code (i.e. \"200\" for ok), -1 if not loaded"),
references_i(SolrType.num_integer, true, true, false, false, false, "number of unique http references; used for ranking"),
references_i(SolrType.num_integer, true, true, false, false, false, "number of unique http references, should be equal to references_internal_i + references_external_i"),
references_internal_i(SolrType.num_integer, true, true, false, false, false, "number of unique http references from same host as referenced url"),
references_external_i(SolrType.num_integer, true, true, false, false, false, "number of unique http references from external hosts"),
references_exthosts_i(SolrType.num_integer, true, true, false, false, false, "number of external hosts which provide http references"),
clickdepth_i(SolrType.num_integer, true, true, false, false, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url"),
process_sxt(SolrType.string, true, true, true, false, false, "needed (post-)processing steps on this metadata set"),