added two new fields, exact_signature_copycount_i and

fuzzy_signature_copycount_i, which count the number of copies of
non-unique documents and assign this count to each document. Thus, each
document is assigned a number which shows how many copies of this
document exist.
These fields are disabled by default.
This commit is contained in:
Michael Peter Christen 2013-09-04 23:11:53 +02:00
parent a2511b5600
commit 85456f46b2
7 changed files with 129 additions and 92 deletions

View File

@ -42,6 +42,9 @@ exact_signature_l
## flag shows if exact_signature_l is unique at the time of document creation, used for double-check during search
exact_signature_unique_b
## counter for the number of documents which are not unique (== count of not-unique-flagged documents + 1)
#exact_signature_copycount_i
## 64 bit of the Lookup3Signature from EnhancedTextProfileSignature of text_t
fuzzy_signature_l
@ -51,6 +54,9 @@ fuzzy_signature_l
## flag shows if fuzzy_signature_l is unique at the time of document creation, used for double-check during search
fuzzy_signature_unique_b
## counter for the number of documents which are not unique (== count of not-unique-flagged documents + 1)
#fuzzy_signature_copycount_i
## the size of the raw source (mandatory field)
size_i

View File

@ -26,6 +26,6 @@ package net.yacy.cora.federate.solr;
*/
/**
 * Process tags attached to a document; elsewhere in this change set a document
 * tagged UNIQUE is handed to the double-content (uniqueness) post-processing step.
 */
public enum ProcessType {
// constant list before this change (the hunk replaces exactly one line):
CLICKDEPTH, CITATION;
// constant list after this change: UNIQUE is the newly added tag
CLICKDEPTH, CITATION, UNIQUE;
}

View File

@ -28,8 +28,10 @@ import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrInputDocument;
import net.yacy.cora.document.ASCII;
@ -75,6 +77,77 @@ public class SchemaConfiguration extends Configuration implements Serializable {
}
}
}
/**
 * Flags duplicate content for a single document during post-processing.
 *
 * Pass 1 (content signatures): for the exact and the fuzzy signature, counts the documents of the
 * same host that carry the same signature value. When more than one such document exists, the
 * matching "unique" flag of {@code sid} is set (true if the url hash was not yet contained in
 * {@code uniqueURLs}, false otherwise) and the count is written to the copy-count field, provided
 * that field is enabled in this configuration.
 *
 * Pass 2 (title and description): when another document of the same host has the same title or
 * description signature, {@code sid} is flagged non-unique and every indexed document that is
 * still flagged unique for that signature is re-queued with the flag cleared.
 *
 * @param segment    gives access to the index connector, the default configuration and the write queue
 * @param uniqueURLs url hashes that were already treated as the unique representative; extended by this method
 * @param sid        the document that is updated in place
 * @param url        url of the document; supplies the host hash and the url hash
 * @return true if pass 1 found more than one copy, if a title/description signature was missing,
 *         or if at least one already indexed document was re-queued; false otherwise
 */
public boolean postprocessing_doublecontent(Segment segment, Set<String> uniqueURLs, SolrInputDocument sid, DigestURI url) {
    boolean changed = false;
    final String hostid = url.hosthash();
    // every lookup below is restricted to documents of the same host
    final String hostClause = CollectionSchema.host_id_s + ":\"" + hostid + "\"";

    // FIND OUT IF THIS IS A DOUBLE DOCUMENT
    for (CollectionSchema[] checkfields: new CollectionSchema[][]{
            {CollectionSchema.exact_signature_l, CollectionSchema.exact_signature_unique_b, CollectionSchema.exact_signature_copycount_i},
            {CollectionSchema.fuzzy_signature_l, CollectionSchema.fuzzy_signature_unique_b, CollectionSchema.fuzzy_signature_copycount_i}}) {
        final CollectionSchema checkfield = checkfields[0];
        final CollectionSchema uniquefield = checkfields[1];
        final CollectionSchema countfield = checkfields[2];
        if (!this.contains(checkfield) || !this.contains(uniquefield)) continue;

        // A document without a stored signature cannot be compared; skip it instead of
        // dereferencing a missing field (which would abort the caller with a NullPointerException).
        final Object signatureValue = sid.getFieldValue(checkfield.getSolrFieldName());
        if (!(signatureValue instanceof Number)) continue;
        final long signature = ((Number) signatureValue).longValue();

        try {
            // lookup the documents with the same signature
            final long count = segment.fulltext().getDefaultConnector().getCountByQuery(hostClause + " AND " + checkfield.getSolrFieldName() + ":\"" + Long.toString(signature) + "\"");
            if (count > 1) {
                final String urlhash = ASCII.String(url.hash());
                // NOTE(review): the set is keyed by url hash and shared by the exact and the fuzzy
                // pass, so a document registered by the exact pass is reported as "not first" by the
                // fuzzy pass. Confirm whether a key per signature was intended.
                if (uniqueURLs.contains(urlhash)) {
                    // this is not the first appearance, therefore this is a non-unique document
                    sid.setField(uniquefield.getSolrFieldName(), false);
                } else {
                    // this is the first appearance, therefore this shall be treated as unique document
                    sid.setField(uniquefield.getSolrFieldName(), true);
                    uniqueURLs.add(urlhash);
                }
                // the copy-count fields are optional (disabled by default): only write the counter
                // when the field is part of this configuration
                if (this.contains(countfield)) {
                    sid.setField(countfield.getSolrFieldName(), count);
                }
                changed = true;
            }
        } catch (final IOException e) {
            // NOTE(review): a failed index lookup is silently ignored and the flags keep their
            // previous values; consider logging this failure.
        }
    }

    // CHECK IF TITLE AND DESCRIPTION IS UNIQUE (this is by default not switched on)
    if (segment.fulltext().getDefaultConfiguration().contains(CollectionSchema.host_id_s)) {
        uniquecheck: for (CollectionSchema[] checkfields: new CollectionSchema[][]{
                {CollectionSchema.title, CollectionSchema.title_exact_signature_l, CollectionSchema.title_unique_b},
                {CollectionSchema.description_txt, CollectionSchema.description_exact_signature_l, CollectionSchema.description_unique_b}}) {
            final CollectionSchema checkfield = checkfields[0];
            final CollectionSchema signaturefield = checkfields[1];
            final CollectionSchema uniquefield = checkfields[2];
            if (!this.contains(checkfield) || !this.contains(signaturefield) || !this.contains(uniquefield)) continue uniquecheck;

            // lookup in the index within the same hosts for the same title or description
            final Long checkhash = (Long) sid.getFieldValue(signaturefield.getSolrFieldName());
            if (checkhash == null) {
                // without a signature the document is not flagged as unique
                sid.setField(uniquefield.getSolrFieldName(), false);
                changed = true;
                continue uniquecheck;
            }
            final String signatureClause = hostClause + " AND " + signaturefield.getSolrFieldName() + ":\"" + checkhash.toString() + "\"";
            try {
                if (segment.fulltext().getDefaultConnector().existsByQuery(signatureClause)) {
                    // switch unique attribute in new document
                    sid.setField(uniquefield.getSolrFieldName(), false);
                    // switch attribute also in all existing documents (which should be exactly only one!)
                    final SolrDocumentList docs = segment.fulltext().getDefaultConnector().getDocumentListByQuery(signatureClause + " AND " + uniquefield.getSolrFieldName() + ":true", 0, 1000);
                    for (SolrDocument doc: docs) {
                        final SolrInputDocument sidContext = segment.fulltext().getDefaultConfiguration().toSolrInputDocument(doc);
                        sidContext.setField(uniquefield.getSolrFieldName(), false);
                        segment.putDocumentInQueue(sidContext);
                        changed = true;
                    }
                } else {
                    sid.setField(uniquefield.getSolrFieldName(), true);
                }
            } catch (final IOException e) {
                // NOTE(review): a failed index lookup is silently ignored and the flag keeps its
                // previous value; consider logging this failure.
            }
        }
    }
    return changed;
}
public boolean postprocessing_clickdepth(Segment segment, SolrDocument doc, SolrInputDocument sid, DigestURI url, SchemaDeclaration clickdepthfield) {
if (!this.contains(clickdepthfield)) return false;

View File

@ -39,7 +39,6 @@ import java.util.concurrent.BlockingQueue;
import java.util.regex.Pattern;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrInputDocument;
import net.yacy.cora.document.ASCII;
@ -603,61 +602,6 @@ public class Segment {
// CREATE SOLR DOCUMENT
final CollectionConfiguration.SolrVector vector = this.fulltext.getDefaultConfiguration().yacy2solr(id, collections, responseHeader, document, condenser, referrerURL, language, urlCitationIndex, this.fulltext.getWebgraphConfiguration());
// FIND OUT IF THIS IS A DOUBLE DOCUMENT
String hostid = url.hosthash();
for (CollectionSchema[] checkfields: new CollectionSchema[][]{
{CollectionSchema.exact_signature_l, CollectionSchema.exact_signature_unique_b},
{CollectionSchema.fuzzy_signature_l, CollectionSchema.fuzzy_signature_unique_b}}) {
CollectionSchema checkfield = checkfields[0];
CollectionSchema uniquefield = checkfields[1];
if (this.fulltext.getDefaultConfiguration().contains(checkfield) && this.fulltext.getDefaultConfiguration().contains(uniquefield)) {
// lookup the document with the same signature
long signature = ((Long) vector.getField(checkfield.getSolrFieldName()).getValue()).longValue();
try {
if (this.fulltext.getDefaultConnector().existsByQuery(CollectionSchema.host_id_s + ":\"" + hostid + "\" AND " + checkfield.getSolrFieldName() + ":\"" + Long.toString(signature) + "\"")) {
// change unique attribut in content
vector.setField(uniquefield.getSolrFieldName(), false);
}
} catch (final IOException e) {}
}
}
// CHECK IF TITLE AND DESCRIPTION IS UNIQUE (this is by default not switched on)
if (this.fulltext.getDefaultConfiguration().contains(CollectionSchema.host_id_s)) {
uniquecheck: for (CollectionSchema[] checkfields: new CollectionSchema[][]{
{CollectionSchema.title, CollectionSchema.title_exact_signature_l, CollectionSchema.title_unique_b},
{CollectionSchema.description_txt, CollectionSchema.description_exact_signature_l, CollectionSchema.description_unique_b}}) {
CollectionSchema checkfield = checkfields[0];
CollectionSchema signaturefield = checkfields[1];
CollectionSchema uniquefield = checkfields[2];
if (this.fulltext.getDefaultConfiguration().contains(checkfield) && this.fulltext.getDefaultConfiguration().contains(signaturefield) && this.fulltext.getDefaultConfiguration().contains(uniquefield)) {
// lookup in the index within the same hosts for the same title or description
//String checkstring = checkfield == CollectionSchema.title ? document.dc_title() : document.dc_description();
Long checkhash = (Long) vector.getFieldValue(signaturefield.getSolrFieldName());
if (checkhash == null) {
vector.setField(uniquefield.getSolrFieldName(), false);
continue uniquecheck;
}
try {
if (this.fulltext.getDefaultConnector().existsByQuery(CollectionSchema.host_id_s + ":\"" + hostid + "\" AND " + signaturefield.getSolrFieldName() + ":\"" + checkhash.toString() + "\"")) {
// switch unique attribute in new document
vector.setField(uniquefield.getSolrFieldName(), false);
// switch attribute also in all existing documents (which should be exactly only one!)
SolrDocumentList docs = this.fulltext.getDefaultConnector().getDocumentListByQuery(CollectionSchema.host_id_s + ":\"" + hostid + "\" AND " + signaturefield.getSolrFieldName() + ":\"" + checkhash.toString() + "\" AND " + uniquefield.getSolrFieldName() + ":true", 0, 1000);
for (SolrDocument doc: docs) {
SolrInputDocument sid = this.fulltext.getDefaultConfiguration().toSolrInputDocument(doc);
sid.setField(uniquefield.getSolrFieldName(), false);
this.putDocumentInQueue(sid);
//this.fulltext.getDefaultConnector().add(sid);
}
} else {
vector.setField(uniquefield.getSolrFieldName(), true);
}
} catch (final IOException e) {}
}
}
}
// ENRICH DOCUMENT WITH RANKING INFORMATION
if (this.connectedCitation()) {
this.fulltext.getDefaultConfiguration().postprocessing_references(this.getReferenceReportCache(), null, vector, url, null);

View File

@ -1408,42 +1408,42 @@ public final class SearchEvent {
public ImageResult oneImageResult(final int item, final long timeout) throws MalformedURLException {
if (item < imageViewed.size()) return nthImage(item);
if (imageSpare.size() > 0) return nextSpare();
ResultEntry ms = oneResult(item, timeout);
// check if the match was made in the url or in the image links
if (ms == null) throw new MalformedURLException("nUll");
SolrDocument doc = ms.getNode().getDocument();
Collection<Object> alt = doc.getFieldValues(CollectionSchema.images_alt_sxt.getSolrFieldName());
Collection<Object> img = doc.getFieldValues(CollectionSchema.images_urlstub_sxt.getSolrFieldName());
Collection<Object> prt = doc.getFieldValues(CollectionSchema.images_protocol_sxt.getSolrFieldName());
if (img != null) {
int c = 0;
for (Object i: img) {
String a = alt != null && alt.size() > c ? (String) SetTools.nth(alt, c) : "";
if (query.getQueryGoal().matches((String) i) || query.getQueryGoal().matches(a)) {
try {
DigestURI imageUrl = new DigestURI((prt != null && prt.size() > c ? SetTools.nth(prt, c) : "http") + "://" + i);
Object heightO = SetTools.nth(doc.getFieldValues(CollectionSchema.images_height_val.getSolrFieldName()), c);
Object widthO = SetTools.nth(doc.getFieldValues(CollectionSchema.images_width_val.getSolrFieldName()), c);
String id = ASCII.String(imageUrl.hash());
if (!imageViewed.containsKey(id) && !imageSpare.containsKey(id)) imageSpare.put(id, new ImageResult(ms.url(), imageUrl, "", a, widthO == null ? 0 : (Integer) widthO, heightO == null ? 0 : (Integer) heightO, 0));
} catch (MalformedURLException e) {
continue;
if (ms != null) {
SolrDocument doc = ms.getNode().getDocument();
Collection<Object> alt = doc.getFieldValues(CollectionSchema.images_alt_sxt.getSolrFieldName());
Collection<Object> img = doc.getFieldValues(CollectionSchema.images_urlstub_sxt.getSolrFieldName());
Collection<Object> prt = doc.getFieldValues(CollectionSchema.images_protocol_sxt.getSolrFieldName());
if (img != null) {
int c = 0;
for (Object i: img) {
String a = alt != null && alt.size() > c ? (String) SetTools.nth(alt, c) : "";
if (query.getQueryGoal().matches((String) i) || query.getQueryGoal().matches(a)) {
try {
DigestURI imageUrl = new DigestURI((prt != null && prt.size() > c ? SetTools.nth(prt, c) : "http") + "://" + i);
Object heightO = SetTools.nth(doc.getFieldValues(CollectionSchema.images_height_val.getSolrFieldName()), c);
Object widthO = SetTools.nth(doc.getFieldValues(CollectionSchema.images_width_val.getSolrFieldName()), c);
String id = ASCII.String(imageUrl.hash());
if (!imageViewed.containsKey(id) && !imageSpare.containsKey(id)) imageSpare.put(id, new ImageResult(ms.url(), imageUrl, "", a, widthO == null ? 0 : (Integer) widthO, heightO == null ? 0 : (Integer) heightO, 0));
} catch (MalformedURLException e) {
continue;
}
}
c++;
}
c++;
}
}
if (MultiProtocolURI.isImage(MultiProtocolURI.getFileExtension(ms.url().getFileName()))) {
String id = ASCII.String(ms.hash());
if (!imageViewed.containsKey(id) && !imageSpare.containsKey(id)) imageSpare.put(id, new ImageResult(ms.url(), ms.url(), "", ms.title(), 0, 0, 0));
}
if (img != null && img.size() > 0) {
DigestURI imageUrl = new DigestURI((prt != null && prt.size() > 0 ? SetTools.nth(prt, 0) : "http") + "://" + SetTools.nth(img, 0));
String imagetext = alt != null && alt.size() > 0 ? (String) SetTools.nth(alt, 0) : "";
String id = ASCII.String(imageUrl.hash());
if (!imageViewed.containsKey(id) && !imageSpare.containsKey(id)) imageSpare.put(id, new ImageResult(ms.url(), imageUrl, "", imagetext, 0, 0, 0));
if (MultiProtocolURI.isImage(MultiProtocolURI.getFileExtension(ms.url().getFileName()))) {
String id = ASCII.String(ms.hash());
if (!imageViewed.containsKey(id) && !imageSpare.containsKey(id)) imageSpare.put(id, new ImageResult(ms.url(), ms.url(), "", ms.title(), 0, 0, 0));
}
if (img != null && img.size() > 0) {
DigestURI imageUrl = new DigestURI((prt != null && prt.size() > 0 ? SetTools.nth(prt, 0) : "http") + "://" + SetTools.nth(img, 0));
String imagetext = alt != null && alt.size() > 0 ? (String) SetTools.nth(alt, 0) : "";
String id = ASCII.String(imageUrl.hash());
if (!imageViewed.containsKey(id) && !imageSpare.containsKey(id)) imageSpare.put(id, new ImageResult(ms.url(), imageUrl, "", imagetext, 0, 0, 0));
}
}
if (imageSpare.size() > 0) return nextSpare();
throw new MalformedURLException("no image url found");

View File

@ -448,11 +448,17 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
add(doc, CollectionSchema.synonyms_sxt, synonyms);
}
add(doc, CollectionSchema.exact_signature_l, condenser.exactSignature());
add(doc, CollectionSchema.exact_signature_unique_b, true); // this must be corrected afterwards!
add(doc, CollectionSchema.exact_signature_unique_b, true); // this must be corrected afterwards during storage!
add(doc, CollectionSchema.exact_signature_copycount_i, 0); // this must be corrected afterwards during postprocessing!
add(doc, CollectionSchema.fuzzy_signature_l, condenser.fuzzySignature());
add(doc, CollectionSchema.fuzzy_signature_text_t, condenser.fuzzySignatureText());
add(doc, CollectionSchema.fuzzy_signature_unique_b, true); // this must be corrected afterwards!
add(doc, CollectionSchema.fuzzy_signature_unique_b, true); // this must be corrected afterwards during storage!
add(doc, CollectionSchema.fuzzy_signature_copycount_i, 0); // this must be corrected afterwards during postprocessing!
if (this.contains(CollectionSchema.exact_signature_unique_b) || this.contains(CollectionSchema.exact_signature_copycount_i) ||
this.contains(CollectionSchema.fuzzy_signature_l) || this.contains(CollectionSchema.fuzzy_signature_copycount_i)) {
processTypes.add(ProcessType.UNIQUE);
}
// get list of all links; they will be shrinked by urls that appear in other fields of the solr schema
Set<DigestURI> inboundLinks = document.inboundLinks();
Set<DigestURI> outboundLinks = document.outboundLinks();
@ -900,8 +906,9 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
// process all documents
BlockingQueue<SolrDocument> docs = connector.concurrentDocumentsByQuery(CollectionSchema.process_sxt.getSolrFieldName() + ":[* TO *]", 0, 10000, 60000, 50);
SolrDocument doc;
int proccount = 0, proccount_clickdepthchange = 0, proccount_referencechange = 0, proccount_citationchange = 0;
int proccount = 0, proccount_clickdepthchange = 0, proccount_referencechange = 0, proccount_citationchange = 0, proccount_uniquechange = 0;
Map<String, Long> hostExtentCache = new HashMap<String, Long>(); // a mapping from the host id to the number of documents which contain this host-id
Set<String> uniqueURLs = new HashSet<String>();
try {
while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
// for each to-be-processed entry work on the process tag
@ -929,6 +936,10 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
proccount_citationchange++;
}
}
if (tagtype == ProcessType.UNIQUE) {
if (postprocessing_doublecontent(segment, uniqueURLs, sid, url)) proccount_uniquechange++;
}
}
@ -954,7 +965,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
}
ConcurrentLog.info("CollectionConfiguration", "cleanup_processing: re-calculated " + proccount+ " new documents, " +
proccount_clickdepthchange + " clickdepth changes, " +
proccount_referencechange + " reference-count changes," +
proccount_referencechange + " reference-count changes, " +
proccount_uniquechange + " unique field changes, " +
proccount_citationchange + " citation ranking changes.");
} catch (final InterruptedException e) {
}

View File

@ -43,9 +43,11 @@ public enum CollectionSchema implements SchemaDeclaration {
md5_s(SolrType.string, true, true, false, false, false, "the md5 of the raw source"),// String md5();
exact_signature_l(SolrType.num_long, true, true, false, false, false, "the 64 bit hash of the org.apache.solr.update.processor.Lookup3Signature of text_t"),
exact_signature_unique_b(SolrType.bool, true, true, false, false, false, "flag shows if exact_signature_l is unique at the time of document creation, used for double-check during search"),
exact_signature_copycount_i(SolrType.num_integer, true, true, false, false, false, "counter for the number of documents which are not unique (== count of not-unique-flagged documents + 1)"),
fuzzy_signature_l(SolrType.num_long, true, true, false, false, false, "64 bit of the Lookup3Signature from EnhancedTextProfileSignature of text_t"),
fuzzy_signature_text_t(SolrType.text_general, true, true, false, false, false, "intermediate data produced in EnhancedTextProfileSignature: a list of word frequencies"),
fuzzy_signature_unique_b(SolrType.bool, true, true, false, false, false, "flag shows if fuzzy_signature_l is unique at the time of document creation, used for double-check during search"),
fuzzy_signature_copycount_i(SolrType.num_integer, true, true, false, false, false, "counter for the number of documents which are not unique (== count of not-unique-flagged documents + 1)"),
size_i(SolrType.num_integer, true, true, false, false, false, "the size of the raw source"),// int size();
failreason_s(SolrType.string, true, true, false, false, false, "fail reason if a page was not loaded. if the page was loaded then this field is empty"),
failtype_s(SolrType.string, true, true, false, false, false, "fail type if a page was not loaded. This field is either empty, 'excl' or 'fail'"),