mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
added two new fields, exact_signature_copycount_i and
fuzzy_signature_copycount_i, which count the number of copies of non-unique documents and assign that count to each document. Thus, each document has a number assigned that shows how many copies of it exist. These fields are disabled by default.
This commit is contained in:
parent
a2511b5600
commit
85456f46b2
|
@ -42,6 +42,9 @@ exact_signature_l
|
|||
## flag shows if exact_signature_l is unique at the time of document creation, used for double-check during search
|
||||
exact_signature_unique_b
|
||||
|
||||
## counter for the number of documents which are not unique (== count of not-unique-flagged documents + 1)
|
||||
#exact_signature_copycount_i
|
||||
|
||||
## 64 bit of the Lookup3Signature from EnhancedTextProfileSignature of text_t
|
||||
fuzzy_signature_l
|
||||
|
||||
|
@ -51,6 +54,9 @@ fuzzy_signature_l
|
|||
## flag shows if fuzzy_signature_l is unique at the time of document creation, used for double-check during search
|
||||
fuzzy_signature_unique_b
|
||||
|
||||
## counter for the number of documents which are not unique (== count of not-unique-flagged documents + 1)
|
||||
#fuzzy_signature_copycount_i
|
||||
|
||||
## the size of the raw source (mandatory field)
|
||||
size_i
|
||||
|
||||
|
|
|
@ -26,6 +26,6 @@ package net.yacy.cora.federate.solr;
|
|||
*/
|
||||
public enum ProcessType {
|
||||
|
||||
CLICKDEPTH, CITATION;
|
||||
CLICKDEPTH, CITATION, UNIQUE;
|
||||
|
||||
}
|
||||
|
|
|
@ -28,8 +28,10 @@ import java.util.Date;
|
|||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.solr.common.SolrDocument;
|
||||
import org.apache.solr.common.SolrDocumentList;
|
||||
import org.apache.solr.common.SolrInputDocument;
|
||||
|
||||
import net.yacy.cora.document.ASCII;
|
||||
|
@ -75,6 +77,77 @@ public class SchemaConfiguration extends Configuration implements Serializable {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
public boolean postprocessing_doublecontent(Segment segment, Set<String> uniqueURLs, SolrInputDocument sid, DigestURI url) {
|
||||
boolean changed = false;
|
||||
// FIND OUT IF THIS IS A DOUBLE DOCUMENT
|
||||
String hostid = url.hosthash();
|
||||
for (CollectionSchema[] checkfields: new CollectionSchema[][]{
|
||||
{CollectionSchema.exact_signature_l, CollectionSchema.exact_signature_unique_b, CollectionSchema.exact_signature_copycount_i},
|
||||
{CollectionSchema.fuzzy_signature_l, CollectionSchema.fuzzy_signature_unique_b, CollectionSchema.fuzzy_signature_copycount_i}}) {
|
||||
CollectionSchema checkfield = checkfields[0];
|
||||
CollectionSchema uniquefield = checkfields[1];
|
||||
CollectionSchema countfield = checkfields[2];
|
||||
if (this.contains(checkfield) && this.contains(uniquefield)) {
|
||||
// lookup the document with the same signature
|
||||
long signature = ((Long) sid.getField(checkfield.getSolrFieldName()).getValue()).longValue();
|
||||
try {
|
||||
long count = segment.fulltext().getDefaultConnector().getCountByQuery(CollectionSchema.host_id_s + ":\"" + hostid + "\" AND " + checkfield.getSolrFieldName() + ":\"" + Long.toString(signature) + "\"");
|
||||
if (count > 1) {
|
||||
String urlhash = ASCII.String(url.hash());
|
||||
if (uniqueURLs.contains(urlhash)) {
|
||||
// this is not the first appearance, therefore this is a non-unique document
|
||||
sid.setField(uniquefield.getSolrFieldName(), false);
|
||||
} else {
|
||||
// this is the first appearance, therefore this shall be treated as unique document
|
||||
sid.setField(uniquefield.getSolrFieldName(), true);
|
||||
uniqueURLs.add(urlhash);
|
||||
}
|
||||
sid.setField(countfield.getSolrFieldName(), count);
|
||||
changed = true;
|
||||
}
|
||||
} catch (final IOException e) {}
|
||||
}
|
||||
}
|
||||
|
||||
// CHECK IF TITLE AND DESCRIPTION IS UNIQUE (this is by default not switched on)
|
||||
if (segment.fulltext().getDefaultConfiguration().contains(CollectionSchema.host_id_s)) {
|
||||
uniquecheck: for (CollectionSchema[] checkfields: new CollectionSchema[][]{
|
||||
{CollectionSchema.title, CollectionSchema.title_exact_signature_l, CollectionSchema.title_unique_b},
|
||||
{CollectionSchema.description_txt, CollectionSchema.description_exact_signature_l, CollectionSchema.description_unique_b}}) {
|
||||
CollectionSchema checkfield = checkfields[0];
|
||||
CollectionSchema signaturefield = checkfields[1];
|
||||
CollectionSchema uniquefield = checkfields[2];
|
||||
if (this.contains(checkfield) && this.contains(signaturefield) && this.contains(uniquefield)) {
|
||||
// lookup in the index within the same hosts for the same title or description
|
||||
//String checkstring = checkfield == CollectionSchema.title ? document.dc_title() : document.dc_description();
|
||||
Long checkhash = (Long) sid.getFieldValue(signaturefield.getSolrFieldName());
|
||||
if (checkhash == null) {
|
||||
sid.setField(uniquefield.getSolrFieldName(), false);
|
||||
changed = true;
|
||||
continue uniquecheck;
|
||||
}
|
||||
try {
|
||||
if (segment.fulltext().getDefaultConnector().existsByQuery(CollectionSchema.host_id_s + ":\"" + hostid + "\" AND " + signaturefield.getSolrFieldName() + ":\"" + checkhash.toString() + "\"")) {
|
||||
// switch unique attribute in new document
|
||||
sid.setField(uniquefield.getSolrFieldName(), false);
|
||||
// switch attribute also in all existing documents (which should be exactly only one!)
|
||||
SolrDocumentList docs = segment.fulltext().getDefaultConnector().getDocumentListByQuery(CollectionSchema.host_id_s + ":\"" + hostid + "\" AND " + signaturefield.getSolrFieldName() + ":\"" + checkhash.toString() + "\" AND " + uniquefield.getSolrFieldName() + ":true", 0, 1000);
|
||||
for (SolrDocument doc: docs) {
|
||||
SolrInputDocument sidContext = segment.fulltext().getDefaultConfiguration().toSolrInputDocument(doc);
|
||||
sidContext.setField(uniquefield.getSolrFieldName(), false);
|
||||
segment.putDocumentInQueue(sidContext);
|
||||
changed = true;
|
||||
}
|
||||
} else {
|
||||
sid.setField(uniquefield.getSolrFieldName(), true);
|
||||
}
|
||||
} catch (final IOException e) {}
|
||||
}
|
||||
}
|
||||
}
|
||||
return changed;
|
||||
}
|
||||
|
||||
public boolean postprocessing_clickdepth(Segment segment, SolrDocument doc, SolrInputDocument sid, DigestURI url, SchemaDeclaration clickdepthfield) {
|
||||
if (!this.contains(clickdepthfield)) return false;
|
||||
|
|
|
@ -39,7 +39,6 @@ import java.util.concurrent.BlockingQueue;
|
|||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.solr.common.SolrDocument;
|
||||
import org.apache.solr.common.SolrDocumentList;
|
||||
import org.apache.solr.common.SolrInputDocument;
|
||||
|
||||
import net.yacy.cora.document.ASCII;
|
||||
|
@ -603,61 +602,6 @@ public class Segment {
|
|||
// CREATE SOLR DOCUMENT
|
||||
final CollectionConfiguration.SolrVector vector = this.fulltext.getDefaultConfiguration().yacy2solr(id, collections, responseHeader, document, condenser, referrerURL, language, urlCitationIndex, this.fulltext.getWebgraphConfiguration());
|
||||
|
||||
// FIND OUT IF THIS IS A DOUBLE DOCUMENT
|
||||
String hostid = url.hosthash();
|
||||
for (CollectionSchema[] checkfields: new CollectionSchema[][]{
|
||||
{CollectionSchema.exact_signature_l, CollectionSchema.exact_signature_unique_b},
|
||||
{CollectionSchema.fuzzy_signature_l, CollectionSchema.fuzzy_signature_unique_b}}) {
|
||||
CollectionSchema checkfield = checkfields[0];
|
||||
CollectionSchema uniquefield = checkfields[1];
|
||||
if (this.fulltext.getDefaultConfiguration().contains(checkfield) && this.fulltext.getDefaultConfiguration().contains(uniquefield)) {
|
||||
// lookup the document with the same signature
|
||||
long signature = ((Long) vector.getField(checkfield.getSolrFieldName()).getValue()).longValue();
|
||||
try {
|
||||
if (this.fulltext.getDefaultConnector().existsByQuery(CollectionSchema.host_id_s + ":\"" + hostid + "\" AND " + checkfield.getSolrFieldName() + ":\"" + Long.toString(signature) + "\"")) {
|
||||
// change unique attribut in content
|
||||
vector.setField(uniquefield.getSolrFieldName(), false);
|
||||
}
|
||||
} catch (final IOException e) {}
|
||||
}
|
||||
}
|
||||
|
||||
// CHECK IF TITLE AND DESCRIPTION IS UNIQUE (this is by default not switched on)
|
||||
if (this.fulltext.getDefaultConfiguration().contains(CollectionSchema.host_id_s)) {
|
||||
uniquecheck: for (CollectionSchema[] checkfields: new CollectionSchema[][]{
|
||||
{CollectionSchema.title, CollectionSchema.title_exact_signature_l, CollectionSchema.title_unique_b},
|
||||
{CollectionSchema.description_txt, CollectionSchema.description_exact_signature_l, CollectionSchema.description_unique_b}}) {
|
||||
CollectionSchema checkfield = checkfields[0];
|
||||
CollectionSchema signaturefield = checkfields[1];
|
||||
CollectionSchema uniquefield = checkfields[2];
|
||||
if (this.fulltext.getDefaultConfiguration().contains(checkfield) && this.fulltext.getDefaultConfiguration().contains(signaturefield) && this.fulltext.getDefaultConfiguration().contains(uniquefield)) {
|
||||
// lookup in the index within the same hosts for the same title or description
|
||||
//String checkstring = checkfield == CollectionSchema.title ? document.dc_title() : document.dc_description();
|
||||
Long checkhash = (Long) vector.getFieldValue(signaturefield.getSolrFieldName());
|
||||
if (checkhash == null) {
|
||||
vector.setField(uniquefield.getSolrFieldName(), false);
|
||||
continue uniquecheck;
|
||||
}
|
||||
try {
|
||||
if (this.fulltext.getDefaultConnector().existsByQuery(CollectionSchema.host_id_s + ":\"" + hostid + "\" AND " + signaturefield.getSolrFieldName() + ":\"" + checkhash.toString() + "\"")) {
|
||||
// switch unique attribute in new document
|
||||
vector.setField(uniquefield.getSolrFieldName(), false);
|
||||
// switch attribute also in all existing documents (which should be exactly only one!)
|
||||
SolrDocumentList docs = this.fulltext.getDefaultConnector().getDocumentListByQuery(CollectionSchema.host_id_s + ":\"" + hostid + "\" AND " + signaturefield.getSolrFieldName() + ":\"" + checkhash.toString() + "\" AND " + uniquefield.getSolrFieldName() + ":true", 0, 1000);
|
||||
for (SolrDocument doc: docs) {
|
||||
SolrInputDocument sid = this.fulltext.getDefaultConfiguration().toSolrInputDocument(doc);
|
||||
sid.setField(uniquefield.getSolrFieldName(), false);
|
||||
this.putDocumentInQueue(sid);
|
||||
//this.fulltext.getDefaultConnector().add(sid);
|
||||
}
|
||||
} else {
|
||||
vector.setField(uniquefield.getSolrFieldName(), true);
|
||||
}
|
||||
} catch (final IOException e) {}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ENRICH DOCUMENT WITH RANKING INFORMATION
|
||||
if (this.connectedCitation()) {
|
||||
this.fulltext.getDefaultConfiguration().postprocessing_references(this.getReferenceReportCache(), null, vector, url, null);
|
||||
|
|
|
@ -1408,42 +1408,42 @@ public final class SearchEvent {
|
|||
|
||||
public ImageResult oneImageResult(final int item, final long timeout) throws MalformedURLException {
|
||||
if (item < imageViewed.size()) return nthImage(item);
|
||||
if (imageSpare.size() > 0) return nextSpare();
|
||||
|
||||
ResultEntry ms = oneResult(item, timeout);
|
||||
// check if the match was made in the url or in the image links
|
||||
if (ms == null) throw new MalformedURLException("nUll");
|
||||
SolrDocument doc = ms.getNode().getDocument();
|
||||
Collection<Object> alt = doc.getFieldValues(CollectionSchema.images_alt_sxt.getSolrFieldName());
|
||||
Collection<Object> img = doc.getFieldValues(CollectionSchema.images_urlstub_sxt.getSolrFieldName());
|
||||
Collection<Object> prt = doc.getFieldValues(CollectionSchema.images_protocol_sxt.getSolrFieldName());
|
||||
if (img != null) {
|
||||
int c = 0;
|
||||
for (Object i: img) {
|
||||
String a = alt != null && alt.size() > c ? (String) SetTools.nth(alt, c) : "";
|
||||
if (query.getQueryGoal().matches((String) i) || query.getQueryGoal().matches(a)) {
|
||||
try {
|
||||
DigestURI imageUrl = new DigestURI((prt != null && prt.size() > c ? SetTools.nth(prt, c) : "http") + "://" + i);
|
||||
Object heightO = SetTools.nth(doc.getFieldValues(CollectionSchema.images_height_val.getSolrFieldName()), c);
|
||||
Object widthO = SetTools.nth(doc.getFieldValues(CollectionSchema.images_width_val.getSolrFieldName()), c);
|
||||
String id = ASCII.String(imageUrl.hash());
|
||||
if (!imageViewed.containsKey(id) && !imageSpare.containsKey(id)) imageSpare.put(id, new ImageResult(ms.url(), imageUrl, "", a, widthO == null ? 0 : (Integer) widthO, heightO == null ? 0 : (Integer) heightO, 0));
|
||||
} catch (MalformedURLException e) {
|
||||
continue;
|
||||
if (ms != null) {
|
||||
SolrDocument doc = ms.getNode().getDocument();
|
||||
Collection<Object> alt = doc.getFieldValues(CollectionSchema.images_alt_sxt.getSolrFieldName());
|
||||
Collection<Object> img = doc.getFieldValues(CollectionSchema.images_urlstub_sxt.getSolrFieldName());
|
||||
Collection<Object> prt = doc.getFieldValues(CollectionSchema.images_protocol_sxt.getSolrFieldName());
|
||||
if (img != null) {
|
||||
int c = 0;
|
||||
for (Object i: img) {
|
||||
String a = alt != null && alt.size() > c ? (String) SetTools.nth(alt, c) : "";
|
||||
if (query.getQueryGoal().matches((String) i) || query.getQueryGoal().matches(a)) {
|
||||
try {
|
||||
DigestURI imageUrl = new DigestURI((prt != null && prt.size() > c ? SetTools.nth(prt, c) : "http") + "://" + i);
|
||||
Object heightO = SetTools.nth(doc.getFieldValues(CollectionSchema.images_height_val.getSolrFieldName()), c);
|
||||
Object widthO = SetTools.nth(doc.getFieldValues(CollectionSchema.images_width_val.getSolrFieldName()), c);
|
||||
String id = ASCII.String(imageUrl.hash());
|
||||
if (!imageViewed.containsKey(id) && !imageSpare.containsKey(id)) imageSpare.put(id, new ImageResult(ms.url(), imageUrl, "", a, widthO == null ? 0 : (Integer) widthO, heightO == null ? 0 : (Integer) heightO, 0));
|
||||
} catch (MalformedURLException e) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
c++;
|
||||
}
|
||||
c++;
|
||||
}
|
||||
}
|
||||
if (MultiProtocolURI.isImage(MultiProtocolURI.getFileExtension(ms.url().getFileName()))) {
|
||||
String id = ASCII.String(ms.hash());
|
||||
if (!imageViewed.containsKey(id) && !imageSpare.containsKey(id)) imageSpare.put(id, new ImageResult(ms.url(), ms.url(), "", ms.title(), 0, 0, 0));
|
||||
}
|
||||
if (img != null && img.size() > 0) {
|
||||
DigestURI imageUrl = new DigestURI((prt != null && prt.size() > 0 ? SetTools.nth(prt, 0) : "http") + "://" + SetTools.nth(img, 0));
|
||||
String imagetext = alt != null && alt.size() > 0 ? (String) SetTools.nth(alt, 0) : "";
|
||||
String id = ASCII.String(imageUrl.hash());
|
||||
if (!imageViewed.containsKey(id) && !imageSpare.containsKey(id)) imageSpare.put(id, new ImageResult(ms.url(), imageUrl, "", imagetext, 0, 0, 0));
|
||||
if (MultiProtocolURI.isImage(MultiProtocolURI.getFileExtension(ms.url().getFileName()))) {
|
||||
String id = ASCII.String(ms.hash());
|
||||
if (!imageViewed.containsKey(id) && !imageSpare.containsKey(id)) imageSpare.put(id, new ImageResult(ms.url(), ms.url(), "", ms.title(), 0, 0, 0));
|
||||
}
|
||||
if (img != null && img.size() > 0) {
|
||||
DigestURI imageUrl = new DigestURI((prt != null && prt.size() > 0 ? SetTools.nth(prt, 0) : "http") + "://" + SetTools.nth(img, 0));
|
||||
String imagetext = alt != null && alt.size() > 0 ? (String) SetTools.nth(alt, 0) : "";
|
||||
String id = ASCII.String(imageUrl.hash());
|
||||
if (!imageViewed.containsKey(id) && !imageSpare.containsKey(id)) imageSpare.put(id, new ImageResult(ms.url(), imageUrl, "", imagetext, 0, 0, 0));
|
||||
}
|
||||
}
|
||||
if (imageSpare.size() > 0) return nextSpare();
|
||||
throw new MalformedURLException("no image url found");
|
||||
|
|
|
@ -448,11 +448,17 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
|||
add(doc, CollectionSchema.synonyms_sxt, synonyms);
|
||||
}
|
||||
add(doc, CollectionSchema.exact_signature_l, condenser.exactSignature());
|
||||
add(doc, CollectionSchema.exact_signature_unique_b, true); // this must be corrected afterwards!
|
||||
add(doc, CollectionSchema.exact_signature_unique_b, true); // this must be corrected afterwards during storage!
|
||||
add(doc, CollectionSchema.exact_signature_copycount_i, 0); // this must be corrected afterwards during postprocessing!
|
||||
add(doc, CollectionSchema.fuzzy_signature_l, condenser.fuzzySignature());
|
||||
add(doc, CollectionSchema.fuzzy_signature_text_t, condenser.fuzzySignatureText());
|
||||
add(doc, CollectionSchema.fuzzy_signature_unique_b, true); // this must be corrected afterwards!
|
||||
|
||||
add(doc, CollectionSchema.fuzzy_signature_unique_b, true); // this must be corrected afterwards during storage!
|
||||
add(doc, CollectionSchema.fuzzy_signature_copycount_i, 0); // this must be corrected afterwards during postprocessing!
|
||||
if (this.contains(CollectionSchema.exact_signature_unique_b) || this.contains(CollectionSchema.exact_signature_copycount_i) ||
|
||||
this.contains(CollectionSchema.fuzzy_signature_l) || this.contains(CollectionSchema.fuzzy_signature_copycount_i)) {
|
||||
processTypes.add(ProcessType.UNIQUE);
|
||||
}
|
||||
|
||||
// get list of all links; they will be shrinked by urls that appear in other fields of the solr schema
|
||||
Set<DigestURI> inboundLinks = document.inboundLinks();
|
||||
Set<DigestURI> outboundLinks = document.outboundLinks();
|
||||
|
@ -900,8 +906,9 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
|||
// process all documents
|
||||
BlockingQueue<SolrDocument> docs = connector.concurrentDocumentsByQuery(CollectionSchema.process_sxt.getSolrFieldName() + ":[* TO *]", 0, 10000, 60000, 50);
|
||||
SolrDocument doc;
|
||||
int proccount = 0, proccount_clickdepthchange = 0, proccount_referencechange = 0, proccount_citationchange = 0;
|
||||
int proccount = 0, proccount_clickdepthchange = 0, proccount_referencechange = 0, proccount_citationchange = 0, proccount_uniquechange = 0;
|
||||
Map<String, Long> hostExtentCache = new HashMap<String, Long>(); // a mapping from the host id to the number of documents which contain this host-id
|
||||
Set<String> uniqueURLs = new HashSet<String>();
|
||||
try {
|
||||
while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
|
||||
// for each to-be-processed entry work on the process tag
|
||||
|
@ -929,6 +936,10 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
|||
proccount_citationchange++;
|
||||
}
|
||||
}
|
||||
|
||||
if (tagtype == ProcessType.UNIQUE) {
|
||||
if (postprocessing_doublecontent(segment, uniqueURLs, sid, url)) proccount_uniquechange++;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
@ -954,7 +965,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
|||
}
|
||||
ConcurrentLog.info("CollectionConfiguration", "cleanup_processing: re-calculated " + proccount+ " new documents, " +
|
||||
proccount_clickdepthchange + " clickdepth changes, " +
|
||||
proccount_referencechange + " reference-count changes," +
|
||||
proccount_referencechange + " reference-count changes, " +
|
||||
proccount_uniquechange + " unique field changes, " +
|
||||
proccount_citationchange + " citation ranking changes.");
|
||||
} catch (final InterruptedException e) {
|
||||
}
|
||||
|
|
|
@ -43,9 +43,11 @@ public enum CollectionSchema implements SchemaDeclaration {
|
|||
md5_s(SolrType.string, true, true, false, false, false, "the md5 of the raw source"),// String md5();
|
||||
exact_signature_l(SolrType.num_long, true, true, false, false, false, "the 64 bit hash of the org.apache.solr.update.processor.Lookup3Signature of text_t"),
|
||||
exact_signature_unique_b(SolrType.bool, true, true, false, false, false, "flag shows if exact_signature_l is unique at the time of document creation, used for double-check during search"),
|
||||
exact_signature_copycount_i(SolrType.num_integer, true, true, false, false, false, "counter for the number of documents which are not unique (== count of not-unique-flagged documents + 1)"),
|
||||
fuzzy_signature_l(SolrType.num_long, true, true, false, false, false, "64 bit of the Lookup3Signature from EnhancedTextProfileSignature of text_t"),
|
||||
fuzzy_signature_text_t(SolrType.text_general, true, true, false, false, false, "intermediate data produced in EnhancedTextProfileSignature: a list of word frequencies"),
|
||||
fuzzy_signature_unique_b(SolrType.bool, true, true, false, false, false, "flag shows if fuzzy_signature_l is unique at the time of document creation, used for double-check during search"),
|
||||
fuzzy_signature_copycount_i(SolrType.num_integer, true, true, false, false, false, "counter for the number of documents which are not unique (== count of not-unique-flagged documents + 1)"),
|
||||
size_i(SolrType.num_integer, true, true, false, false, false, "the size of the raw source"),// int size();
|
||||
failreason_s(SolrType.string, true, true, false, false, false, "fail reason if a page was not loaded. if the page was loaded then this field is empty"),
|
||||
failtype_s(SolrType.string, true, true, false, false, false, "fail type if a page was not loaded. This field is either empty, 'excl' or 'fail'"),
|
||||
|
|
Loading…
Reference in New Issue
Block a user