From 85456f46b2d1f1bf2b6518a5fcc13b8f577e5acb Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Wed, 4 Sep 2013 23:11:53 +0200 Subject: [PATCH] added two new fields, exact_signature_copycount_i and fuzzy_signature_copycount_i, which count the number of copies of non-unique documents and assign this count to each document. Thus, each document is assigned a number which shows how many copies of this document exist. These fields are disabled by default. --- defaults/solr.collection.schema | 6 ++ .../yacy/cora/federate/solr/ProcessType.java | 2 +- .../federate/solr/SchemaConfiguration.java | 73 +++++++++++++++++++ source/net/yacy/search/index/Segment.java | 56 -------------- source/net/yacy/search/query/SearchEvent.java | 60 +++++++-------- .../schema/CollectionConfiguration.java | 22 ++++-- .../yacy/search/schema/CollectionSchema.java | 2 + 7 files changed, 129 insertions(+), 92 deletions(-) diff --git a/defaults/solr.collection.schema b/defaults/solr.collection.schema index 4e6aef38d..a8e814f93 100644 --- a/defaults/solr.collection.schema +++ b/defaults/solr.collection.schema @@ -42,6 +42,9 @@ exact_signature_l ## flag shows if exact_signature_l is unique at the time of document creation, used for double-check during search exact_signature_unique_b +## counter for the number of documents which are not unique (== count of not-unique-flagged documents + 1) +#exact_signature_copycount_i + ## 64 bit of the Lookup3Signature from EnhancedTextProfileSignature of text_t fuzzy_signature_l @@ -51,6 +54,9 @@ fuzzy_signature_l ## flag shows if fuzzy_signature_l is unique at the time of document creation, used for double-check during search fuzzy_signature_unique_b +## counter for the number of documents which are not unique (== count of not-unique-flagged documents + 1) +#fuzzy_signature_copycount_i + ## the size of the raw source (mandatory field) size_i diff --git a/source/net/yacy/cora/federate/solr/ProcessType.java 
b/source/net/yacy/cora/federate/solr/ProcessType.java index cef9d1338..1d4439e1f 100644 --- a/source/net/yacy/cora/federate/solr/ProcessType.java +++ b/source/net/yacy/cora/federate/solr/ProcessType.java @@ -26,6 +26,6 @@ package net.yacy.cora.federate.solr; */ public enum ProcessType { - CLICKDEPTH, CITATION; + CLICKDEPTH, CITATION, UNIQUE; } diff --git a/source/net/yacy/cora/federate/solr/SchemaConfiguration.java b/source/net/yacy/cora/federate/solr/SchemaConfiguration.java index f16a56725..1e0ab0f0d 100644 --- a/source/net/yacy/cora/federate/solr/SchemaConfiguration.java +++ b/source/net/yacy/cora/federate/solr/SchemaConfiguration.java @@ -28,8 +28,10 @@ import java.util.Date; import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Set; import org.apache.solr.common.SolrDocument; +import org.apache.solr.common.SolrDocumentList; import org.apache.solr.common.SolrInputDocument; import net.yacy.cora.document.ASCII; @@ -75,6 +77,77 @@ public class SchemaConfiguration extends Configuration implements Serializable { } } } + + public boolean postprocessing_doublecontent(Segment segment, Set uniqueURLs, SolrInputDocument sid, DigestURI url) { + boolean changed = false; + // FIND OUT IF THIS IS A DOUBLE DOCUMENT + String hostid = url.hosthash(); + for (CollectionSchema[] checkfields: new CollectionSchema[][]{ + {CollectionSchema.exact_signature_l, CollectionSchema.exact_signature_unique_b, CollectionSchema.exact_signature_copycount_i}, + {CollectionSchema.fuzzy_signature_l, CollectionSchema.fuzzy_signature_unique_b, CollectionSchema.fuzzy_signature_copycount_i}}) { + CollectionSchema checkfield = checkfields[0]; + CollectionSchema uniquefield = checkfields[1]; + CollectionSchema countfield = checkfields[2]; + if (this.contains(checkfield) && this.contains(uniquefield)) { + // lookup the document with the same signature + long signature = ((Long) sid.getField(checkfield.getSolrFieldName()).getValue()).longValue(); + try { + long count = 
segment.fulltext().getDefaultConnector().getCountByQuery(CollectionSchema.host_id_s + ":\"" + hostid + "\" AND " + checkfield.getSolrFieldName() + ":\"" + Long.toString(signature) + "\""); + if (count > 1) { + String urlhash = ASCII.String(url.hash()); + if (uniqueURLs.contains(urlhash)) { + // this is not the first appearance, therefore this is a non-unique document + sid.setField(uniquefield.getSolrFieldName(), false); + } else { + // this is the first appearance, therefore this shall be treated as unique document + sid.setField(uniquefield.getSolrFieldName(), true); + uniqueURLs.add(urlhash); + } + sid.setField(countfield.getSolrFieldName(), count); + changed = true; + } + } catch (final IOException e) {} + } + } + + // CHECK IF TITLE AND DESCRIPTION IS UNIQUE (this is by default not switched on) + if (segment.fulltext().getDefaultConfiguration().contains(CollectionSchema.host_id_s)) { + uniquecheck: for (CollectionSchema[] checkfields: new CollectionSchema[][]{ + {CollectionSchema.title, CollectionSchema.title_exact_signature_l, CollectionSchema.title_unique_b}, + {CollectionSchema.description_txt, CollectionSchema.description_exact_signature_l, CollectionSchema.description_unique_b}}) { + CollectionSchema checkfield = checkfields[0]; + CollectionSchema signaturefield = checkfields[1]; + CollectionSchema uniquefield = checkfields[2]; + if (this.contains(checkfield) && this.contains(signaturefield) && this.contains(uniquefield)) { + // lookup in the index within the same hosts for the same title or description + //String checkstring = checkfield == CollectionSchema.title ? 
document.dc_title() : document.dc_description(); + Long checkhash = (Long) sid.getFieldValue(signaturefield.getSolrFieldName()); + if (checkhash == null) { + sid.setField(uniquefield.getSolrFieldName(), false); + changed = true; + continue uniquecheck; + } + try { + if (segment.fulltext().getDefaultConnector().existsByQuery(CollectionSchema.host_id_s + ":\"" + hostid + "\" AND " + signaturefield.getSolrFieldName() + ":\"" + checkhash.toString() + "\"")) { + // switch unique attribute in new document + sid.setField(uniquefield.getSolrFieldName(), false); + // switch attribute also in all existing documents (which should be exactly only one!) + SolrDocumentList docs = segment.fulltext().getDefaultConnector().getDocumentListByQuery(CollectionSchema.host_id_s + ":\"" + hostid + "\" AND " + signaturefield.getSolrFieldName() + ":\"" + checkhash.toString() + "\" AND " + uniquefield.getSolrFieldName() + ":true", 0, 1000); + for (SolrDocument doc: docs) { + SolrInputDocument sidContext = segment.fulltext().getDefaultConfiguration().toSolrInputDocument(doc); + sidContext.setField(uniquefield.getSolrFieldName(), false); + segment.putDocumentInQueue(sidContext); + changed = true; + } + } else { + sid.setField(uniquefield.getSolrFieldName(), true); + } + } catch (final IOException e) {} + } + } + } + return changed; + } public boolean postprocessing_clickdepth(Segment segment, SolrDocument doc, SolrInputDocument sid, DigestURI url, SchemaDeclaration clickdepthfield) { if (!this.contains(clickdepthfield)) return false; diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index d33d020a5..e6092b92e 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -39,7 +39,6 @@ import java.util.concurrent.BlockingQueue; import java.util.regex.Pattern; import org.apache.solr.common.SolrDocument; -import org.apache.solr.common.SolrDocumentList; import org.apache.solr.common.SolrInputDocument; 
import net.yacy.cora.document.ASCII; @@ -603,61 +602,6 @@ public class Segment { // CREATE SOLR DOCUMENT final CollectionConfiguration.SolrVector vector = this.fulltext.getDefaultConfiguration().yacy2solr(id, collections, responseHeader, document, condenser, referrerURL, language, urlCitationIndex, this.fulltext.getWebgraphConfiguration()); - // FIND OUT IF THIS IS A DOUBLE DOCUMENT - String hostid = url.hosthash(); - for (CollectionSchema[] checkfields: new CollectionSchema[][]{ - {CollectionSchema.exact_signature_l, CollectionSchema.exact_signature_unique_b}, - {CollectionSchema.fuzzy_signature_l, CollectionSchema.fuzzy_signature_unique_b}}) { - CollectionSchema checkfield = checkfields[0]; - CollectionSchema uniquefield = checkfields[1]; - if (this.fulltext.getDefaultConfiguration().contains(checkfield) && this.fulltext.getDefaultConfiguration().contains(uniquefield)) { - // lookup the document with the same signature - long signature = ((Long) vector.getField(checkfield.getSolrFieldName()).getValue()).longValue(); - try { - if (this.fulltext.getDefaultConnector().existsByQuery(CollectionSchema.host_id_s + ":\"" + hostid + "\" AND " + checkfield.getSolrFieldName() + ":\"" + Long.toString(signature) + "\"")) { - // change unique attribut in content - vector.setField(uniquefield.getSolrFieldName(), false); - } - } catch (final IOException e) {} - } - } - - // CHECK IF TITLE AND DESCRIPTION IS UNIQUE (this is by default not switched on) - if (this.fulltext.getDefaultConfiguration().contains(CollectionSchema.host_id_s)) { - uniquecheck: for (CollectionSchema[] checkfields: new CollectionSchema[][]{ - {CollectionSchema.title, CollectionSchema.title_exact_signature_l, CollectionSchema.title_unique_b}, - {CollectionSchema.description_txt, CollectionSchema.description_exact_signature_l, CollectionSchema.description_unique_b}}) { - CollectionSchema checkfield = checkfields[0]; - CollectionSchema signaturefield = checkfields[1]; - CollectionSchema uniquefield = 
checkfields[2]; - if (this.fulltext.getDefaultConfiguration().contains(checkfield) && this.fulltext.getDefaultConfiguration().contains(signaturefield) && this.fulltext.getDefaultConfiguration().contains(uniquefield)) { - // lookup in the index within the same hosts for the same title or description - //String checkstring = checkfield == CollectionSchema.title ? document.dc_title() : document.dc_description(); - Long checkhash = (Long) vector.getFieldValue(signaturefield.getSolrFieldName()); - if (checkhash == null) { - vector.setField(uniquefield.getSolrFieldName(), false); - continue uniquecheck; - } - try { - if (this.fulltext.getDefaultConnector().existsByQuery(CollectionSchema.host_id_s + ":\"" + hostid + "\" AND " + signaturefield.getSolrFieldName() + ":\"" + checkhash.toString() + "\"")) { - // switch unique attribute in new document - vector.setField(uniquefield.getSolrFieldName(), false); - // switch attribute also in all existing documents (which should be exactly only one!) - SolrDocumentList docs = this.fulltext.getDefaultConnector().getDocumentListByQuery(CollectionSchema.host_id_s + ":\"" + hostid + "\" AND " + signaturefield.getSolrFieldName() + ":\"" + checkhash.toString() + "\" AND " + uniquefield.getSolrFieldName() + ":true", 0, 1000); - for (SolrDocument doc: docs) { - SolrInputDocument sid = this.fulltext.getDefaultConfiguration().toSolrInputDocument(doc); - sid.setField(uniquefield.getSolrFieldName(), false); - this.putDocumentInQueue(sid); - //this.fulltext.getDefaultConnector().add(sid); - } - } else { - vector.setField(uniquefield.getSolrFieldName(), true); - } - } catch (final IOException e) {} - } - } - } - // ENRICH DOCUMENT WITH RANKING INFORMATION if (this.connectedCitation()) { this.fulltext.getDefaultConfiguration().postprocessing_references(this.getReferenceReportCache(), null, vector, url, null); diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java index 3d0dfea47..864318075 
100644 --- a/source/net/yacy/search/query/SearchEvent.java +++ b/source/net/yacy/search/query/SearchEvent.java @@ -1408,42 +1408,42 @@ public final class SearchEvent { public ImageResult oneImageResult(final int item, final long timeout) throws MalformedURLException { if (item < imageViewed.size()) return nthImage(item); - if (imageSpare.size() > 0) return nextSpare(); ResultEntry ms = oneResult(item, timeout); // check if the match was made in the url or in the image links - if (ms == null) throw new MalformedURLException("nUll"); - SolrDocument doc = ms.getNode().getDocument(); - Collection alt = doc.getFieldValues(CollectionSchema.images_alt_sxt.getSolrFieldName()); - Collection img = doc.getFieldValues(CollectionSchema.images_urlstub_sxt.getSolrFieldName()); - Collection prt = doc.getFieldValues(CollectionSchema.images_protocol_sxt.getSolrFieldName()); - if (img != null) { - int c = 0; - for (Object i: img) { - String a = alt != null && alt.size() > c ? (String) SetTools.nth(alt, c) : ""; - if (query.getQueryGoal().matches((String) i) || query.getQueryGoal().matches(a)) { - try { - DigestURI imageUrl = new DigestURI((prt != null && prt.size() > c ? SetTools.nth(prt, c) : "http") + "://" + i); - Object heightO = SetTools.nth(doc.getFieldValues(CollectionSchema.images_height_val.getSolrFieldName()), c); - Object widthO = SetTools.nth(doc.getFieldValues(CollectionSchema.images_width_val.getSolrFieldName()), c); - String id = ASCII.String(imageUrl.hash()); - if (!imageViewed.containsKey(id) && !imageSpare.containsKey(id)) imageSpare.put(id, new ImageResult(ms.url(), imageUrl, "", a, widthO == null ? 0 : (Integer) widthO, heightO == null ? 
0 : (Integer) heightO, 0)); - } catch (MalformedURLException e) { - continue; + if (ms != null) { + SolrDocument doc = ms.getNode().getDocument(); + Collection alt = doc.getFieldValues(CollectionSchema.images_alt_sxt.getSolrFieldName()); + Collection img = doc.getFieldValues(CollectionSchema.images_urlstub_sxt.getSolrFieldName()); + Collection prt = doc.getFieldValues(CollectionSchema.images_protocol_sxt.getSolrFieldName()); + if (img != null) { + int c = 0; + for (Object i: img) { + String a = alt != null && alt.size() > c ? (String) SetTools.nth(alt, c) : ""; + if (query.getQueryGoal().matches((String) i) || query.getQueryGoal().matches(a)) { + try { + DigestURI imageUrl = new DigestURI((prt != null && prt.size() > c ? SetTools.nth(prt, c) : "http") + "://" + i); + Object heightO = SetTools.nth(doc.getFieldValues(CollectionSchema.images_height_val.getSolrFieldName()), c); + Object widthO = SetTools.nth(doc.getFieldValues(CollectionSchema.images_width_val.getSolrFieldName()), c); + String id = ASCII.String(imageUrl.hash()); + if (!imageViewed.containsKey(id) && !imageSpare.containsKey(id)) imageSpare.put(id, new ImageResult(ms.url(), imageUrl, "", a, widthO == null ? 0 : (Integer) widthO, heightO == null ? 0 : (Integer) heightO, 0)); + } catch (MalformedURLException e) { + continue; + } } + c++; } - c++; } - } - if (MultiProtocolURI.isImage(MultiProtocolURI.getFileExtension(ms.url().getFileName()))) { - String id = ASCII.String(ms.hash()); - if (!imageViewed.containsKey(id) && !imageSpare.containsKey(id)) imageSpare.put(id, new ImageResult(ms.url(), ms.url(), "", ms.title(), 0, 0, 0)); - } - if (img != null && img.size() > 0) { - DigestURI imageUrl = new DigestURI((prt != null && prt.size() > 0 ? SetTools.nth(prt, 0) : "http") + "://" + SetTools.nth(img, 0)); - String imagetext = alt != null && alt.size() > 0 ? 
(String) SetTools.nth(alt, 0) : ""; - String id = ASCII.String(imageUrl.hash()); - if (!imageViewed.containsKey(id) && !imageSpare.containsKey(id)) imageSpare.put(id, new ImageResult(ms.url(), imageUrl, "", imagetext, 0, 0, 0)); + if (MultiProtocolURI.isImage(MultiProtocolURI.getFileExtension(ms.url().getFileName()))) { + String id = ASCII.String(ms.hash()); + if (!imageViewed.containsKey(id) && !imageSpare.containsKey(id)) imageSpare.put(id, new ImageResult(ms.url(), ms.url(), "", ms.title(), 0, 0, 0)); + } + if (img != null && img.size() > 0) { + DigestURI imageUrl = new DigestURI((prt != null && prt.size() > 0 ? SetTools.nth(prt, 0) : "http") + "://" + SetTools.nth(img, 0)); + String imagetext = alt != null && alt.size() > 0 ? (String) SetTools.nth(alt, 0) : ""; + String id = ASCII.String(imageUrl.hash()); + if (!imageViewed.containsKey(id) && !imageSpare.containsKey(id)) imageSpare.put(id, new ImageResult(ms.url(), imageUrl, "", imagetext, 0, 0, 0)); + } } if (imageSpare.size() > 0) return nextSpare(); throw new MalformedURLException("no image url found"); diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index a72598c7c..46295b3bb 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -448,11 +448,17 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri add(doc, CollectionSchema.synonyms_sxt, synonyms); } add(doc, CollectionSchema.exact_signature_l, condenser.exactSignature()); - add(doc, CollectionSchema.exact_signature_unique_b, true); // this must be corrected afterwards! + add(doc, CollectionSchema.exact_signature_unique_b, true); // this must be corrected afterwards during storage! + add(doc, CollectionSchema.exact_signature_copycount_i, 0); // this must be corrected afterwards during postprocessing! 
add(doc, CollectionSchema.fuzzy_signature_l, condenser.fuzzySignature()); add(doc, CollectionSchema.fuzzy_signature_text_t, condenser.fuzzySignatureText()); - add(doc, CollectionSchema.fuzzy_signature_unique_b, true); // this must be corrected afterwards! - + add(doc, CollectionSchema.fuzzy_signature_unique_b, true); // this must be corrected afterwards during storage! + add(doc, CollectionSchema.fuzzy_signature_copycount_i, 0); // this must be corrected afterwards during postprocessing! + if (this.contains(CollectionSchema.exact_signature_unique_b) || this.contains(CollectionSchema.exact_signature_copycount_i) || + this.contains(CollectionSchema.fuzzy_signature_l) || this.contains(CollectionSchema.fuzzy_signature_copycount_i)) { + processTypes.add(ProcessType.UNIQUE); + } + // get list of all links; they will be shrinked by urls that appear in other fields of the solr schema Set inboundLinks = document.inboundLinks(); Set outboundLinks = document.outboundLinks(); @@ -900,8 +906,9 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri // process all documents BlockingQueue docs = connector.concurrentDocumentsByQuery(CollectionSchema.process_sxt.getSolrFieldName() + ":[* TO *]", 0, 10000, 60000, 50); SolrDocument doc; - int proccount = 0, proccount_clickdepthchange = 0, proccount_referencechange = 0, proccount_citationchange = 0; + int proccount = 0, proccount_clickdepthchange = 0, proccount_referencechange = 0, proccount_citationchange = 0, proccount_uniquechange = 0; Map hostExtentCache = new HashMap(); // a mapping from the host id to the number of documents which contain this host-id + Set uniqueURLs = new HashSet(); try { while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) { // for each to-be-processed entry work on the process tag @@ -929,6 +936,10 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri proccount_citationchange++; } } + + if (tagtype == ProcessType.UNIQUE) { + if 
(postprocessing_doublecontent(segment, uniqueURLs, sid, url)) proccount_uniquechange++; + } } @@ -954,7 +965,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri } ConcurrentLog.info("CollectionConfiguration", "cleanup_processing: re-calculated " + proccount+ " new documents, " + proccount_clickdepthchange + " clickdepth changes, " + - proccount_referencechange + " reference-count changes," + + proccount_referencechange + " reference-count changes, " + + proccount_uniquechange + " unique field changes, " + proccount_citationchange + " citation ranking changes."); } catch (final InterruptedException e) { } diff --git a/source/net/yacy/search/schema/CollectionSchema.java b/source/net/yacy/search/schema/CollectionSchema.java index ef09662c1..9b4009717 100644 --- a/source/net/yacy/search/schema/CollectionSchema.java +++ b/source/net/yacy/search/schema/CollectionSchema.java @@ -43,9 +43,11 @@ public enum CollectionSchema implements SchemaDeclaration { md5_s(SolrType.string, true, true, false, false, false, "the md5 of the raw source"),// String md5(); exact_signature_l(SolrType.num_long, true, true, false, false, false, "the 64 bit hash of the org.apache.solr.update.processor.Lookup3Signature of text_t"), exact_signature_unique_b(SolrType.bool, true, true, false, false, false, "flag shows if exact_signature_l is unique at the time of document creation, used for double-check during search"), + exact_signature_copycount_i(SolrType.num_integer, true, true, false, false, false, "counter for the number of documents which are not unique (== count of not-unique-flagged documents + 1)"), fuzzy_signature_l(SolrType.num_long, true, true, false, false, false, "64 bit of the Lookup3Signature from EnhancedTextProfileSignature of text_t"), fuzzy_signature_text_t(SolrType.text_general, true, true, false, false, false, "intermediate data produced in EnhancedTextProfileSignature: a list of word frequencies"), fuzzy_signature_unique_b(SolrType.bool, true, 
true, false, false, false, "flag shows if fuzzy_signature_l is unique at the time of document creation, used for double-check during search"), + fuzzy_signature_copycount_i(SolrType.num_integer, true, true, false, false, false, "counter for the number of documents which are not unique (== count of not-unique-flagged documents + 1)"), size_i(SolrType.num_integer, true, true, false, false, false, "the size of the raw source"),// int size(); failreason_s(SolrType.string, true, true, false, false, false, "fail reason if a page was not loaded. if the page was loaded then this field is empty"), failtype_s(SolrType.string, true, true, false, false, false, "fail type if a page was not loaded. This field is either empty, 'excl' or 'fail'"),