mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
added three new fields for a better ranking: references_internal_i,
references_external_i and references_exthosts_i. These can be used to count and evaluate the number of external links to every web page. An experimental ranking function could be, e.g.: div(add(references_internal_i,product(references_external_i,references_exthosts_i)),add(clickdepth_i,1))
This commit is contained in:
parent
082e3274d6
commit
ada3f27de7
|
@ -63,9 +63,18 @@ httpstatus_i
|
|||
## redirect url if the error code is 299 < httpstatus_i < 310
|
||||
#httpstatus_redirect_s
|
||||
|
||||
## number of unique http references; used for ranking
|
||||
## number of unique http references, should be equal to references_internal_i + references_external_i
|
||||
references_i
|
||||
|
||||
## number of unique http references from same host as referenced url
|
||||
references_internal_i
|
||||
|
||||
## number of unique http references from external hosts
|
||||
references_external_i
|
||||
|
||||
## number of external hosts which provide http references
|
||||
references_exthosts_i
|
||||
|
||||
## depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url
|
||||
clickdepth_i
|
||||
|
||||
|
|
|
@ -31,9 +31,17 @@ import org.apache.log4j.Logger;
|
|||
import org.apache.solr.common.SolrDocument;
|
||||
import org.apache.solr.common.SolrInputDocument;
|
||||
|
||||
import net.yacy.cora.order.Base64Order;
|
||||
import net.yacy.cora.storage.Configuration;
|
||||
import net.yacy.cora.storage.HandleSet;
|
||||
import net.yacy.cora.util.SpaceExceededException;
|
||||
import net.yacy.kelondro.data.citation.CitationReference;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.kelondro.index.RowHandleSet;
|
||||
import net.yacy.kelondro.rwi.ReferenceContainer;
|
||||
import net.yacy.kelondro.util.ByteBuffer;
|
||||
import net.yacy.search.index.Segment;
|
||||
import net.yacy.search.schema.CollectionSchema;
|
||||
|
||||
public class SchemaConfiguration extends Configuration implements Serializable {
|
||||
|
||||
|
@ -85,13 +93,50 @@ public class SchemaConfiguration extends Configuration implements Serializable {
|
|||
return false;
|
||||
}
|
||||
|
||||
public boolean postprocessing_references(Segment segment, SolrDocument doc, SolrInputDocument sid, DigestURI url, SchemaDeclaration referencesfield) {
|
||||
if (!this.contains(referencesfield)) return false;
|
||||
Integer oldreferences = (Integer) doc.getFieldValue(referencesfield.getSolrFieldName());
|
||||
int references = segment.urlCitation().count(url.hash());
|
||||
if (references > 0 && (oldreferences == null || oldreferences.intValue() != references)) {
|
||||
sid.setField(referencesfield.getSolrFieldName(), references);
|
||||
return true;
|
||||
public boolean postprocessing_references(Segment segment, SolrDocument doc, SolrInputDocument sid, DigestURI url) {
|
||||
if (!(this.contains(CollectionSchema.references_i) || this.contains(CollectionSchema.references_internal_i) ||
|
||||
this.contains(CollectionSchema.references_external_i) || this.contains(CollectionSchema.references_exthosts_i))) return false;
|
||||
Integer all_old = doc == null ? null : (Integer) doc.getFieldValue(CollectionSchema.references_i.getSolrFieldName());
|
||||
Integer internal_old = doc == null ? null : (Integer) doc.getFieldValue(CollectionSchema.references_internal_i.getSolrFieldName());
|
||||
Integer external_old = doc == null ? null : (Integer) doc.getFieldValue(CollectionSchema.references_external_i.getSolrFieldName());
|
||||
Integer exthosts_old = doc == null ? null : (Integer) doc.getFieldValue(CollectionSchema.references_exthosts_i.getSolrFieldName());
|
||||
ReferenceContainer<CitationReference> references;
|
||||
try {
|
||||
int all = 0, internal = 0, external = 0;
|
||||
references = segment.urlCitation().get(url.hash(), null);
|
||||
if (references == null) return false; // no references at all
|
||||
//int references = segment.urlCitation().count(url.hash());
|
||||
byte[] uh0 = url.hash();
|
||||
Iterator<CitationReference> ri = references.entries();
|
||||
HandleSet exthosts = new RowHandleSet(6, Base64Order.enhancedCoder, 0);
|
||||
while (ri.hasNext()) {
|
||||
CitationReference ref = ri.next();
|
||||
byte[] hh = ref.hosthash();
|
||||
exthosts.put(hh);
|
||||
all++;
|
||||
if (ByteBuffer.equals(hh, 0, uh0, 6, 6)) internal++; else external++;
|
||||
}
|
||||
|
||||
boolean change = false;
|
||||
if (all_old == null || all_old.intValue() != all) {
|
||||
sid.setField(CollectionSchema.references_i.getSolrFieldName(), all);
|
||||
change = true;
|
||||
}
|
||||
if (internal_old == null || internal_old.intValue() != internal) {
|
||||
sid.setField(CollectionSchema.references_internal_i.getSolrFieldName(), internal);
|
||||
change = true;
|
||||
}
|
||||
if (external_old == null || external_old.intValue() != external) {
|
||||
sid.setField(CollectionSchema.references_external_i.getSolrFieldName(), external);
|
||||
change = true;
|
||||
}
|
||||
if (exthosts_old == null || exthosts_old.intValue() != exthosts.size()) {
|
||||
sid.setField(CollectionSchema.references_exthosts_i.getSolrFieldName(), exthosts.size());
|
||||
change = true;
|
||||
}
|
||||
return change;
|
||||
} catch (IOException e) {
|
||||
} catch (SpaceExceededException e) {
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
|
|
@ -96,6 +96,13 @@ public class CitationReference implements Reference, Serializable {
|
|||
public byte[] urlhash() {
|
||||
return this.entry.getColBytes(col_urlhash, true);
|
||||
}
|
||||
|
||||
public byte[] hosthash() {
|
||||
byte[] uh = this.entry.getColBytes(col_urlhash, true);
|
||||
byte[] hh = new byte[6];
|
||||
System.arraycopy(uh, 6, hh, 0, 6);
|
||||
return hh;
|
||||
}
|
||||
|
||||
public int virtualAge() {
|
||||
return (int) this.entry.getColLong(col_lastModified); // this is the time in MicoDateDays format
|
||||
|
|
|
@ -543,10 +543,7 @@ public class Segment {
|
|||
}
|
||||
|
||||
// ENRICH DOCUMENT WITH RANKING INFORMATION
|
||||
if (this.urlCitationIndex != null && this.fulltext.getDefaultConfiguration().contains(CollectionSchema.references_i)) {
|
||||
int references = this.urlCitationIndex.count(url.hash());
|
||||
vector.setField(CollectionSchema.references_i.getSolrFieldName(), references);
|
||||
}
|
||||
this.fulltext.getDefaultConfiguration().postprocessing_references(this, null, vector, url);
|
||||
|
||||
// STORE TO SOLR
|
||||
String error = null;
|
||||
|
|
|
@ -796,7 +796,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
|||
}
|
||||
|
||||
// refresh the link count; it's 'cheap' to do this here
|
||||
if (postprocessing_references(segment, doc, sid, url, CollectionSchema.references_i)) proccount_referencechange++;
|
||||
if (postprocessing_references(segment, doc, sid, url)) proccount_referencechange++;
|
||||
|
||||
// all processing steps checked, remove the processing tag
|
||||
sid.removeField(CollectionSchema.process_sxt.getSolrFieldName());
|
||||
|
|
|
@ -49,7 +49,10 @@ public enum CollectionSchema implements SchemaDeclaration {
|
|||
failtype_s(SolrType.string, true, true, false, false, false, "fail type if a page was not loaded. This field is either empty, 'excl' or 'fail'"),
|
||||
httpstatus_i(SolrType.num_integer, true, true, false, false, false, "html status return code (i.e. \"200\" for ok), -1 if not loaded"),
|
||||
httpstatus_redirect_s(SolrType.num_integer, true, true, false, false, false, "html status return code (i.e. \"200\" for ok), -1 if not loaded"),
|
||||
references_i(SolrType.num_integer, true, true, false, false, false, "number of unique http references; used for ranking"),
|
||||
references_i(SolrType.num_integer, true, true, false, false, false, "number of unique http references, should be equal to references_internal_i + references_external_i"),
|
||||
references_internal_i(SolrType.num_integer, true, true, false, false, false, "number of unique http references from same host as referenced url"),
|
||||
references_external_i(SolrType.num_integer, true, true, false, false, false, "number of unique http references from external hosts"),
|
||||
references_exthosts_i(SolrType.num_integer, true, true, false, false, false, "number of external hosts which provide http references"),
|
||||
clickdepth_i(SolrType.num_integer, true, true, false, false, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url"),
|
||||
process_sxt(SolrType.string, true, true, true, false, false, "needed (post-)processing steps on this metadata set"),
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user