added two more fields, source_cr_host_norm_i and target_cr_host_norm_i, to the

webgraph schema, and an addition to postprocessing that copies all cr ranking
attributes to the link edges associated with the postprocessed documents
This commit is contained in:
Michael Peter Christen 2013-09-27 16:57:05 +02:00
parent a52f3a597e
commit b28d43decc
5 changed files with 88 additions and 33 deletions

View File

@ -74,6 +74,10 @@ source_id_s
## depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (source)
#source_clickdepth_i
## copy of the citation rank norm value from the source link
source_cr_host_norm_i
## host of the url (source)
#source_host_s
@ -171,6 +175,10 @@ target_path_folders_sxt
## depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (target)
#target_clickdepth_i
## copy of the citation rank norm value from the target link; this is only filled if the target host is identical to the source host
target_cr_host_norm_i
## host of the url (target)
#target_host_s

View File

@ -33,6 +33,7 @@ import java.util.Set;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
@ -78,6 +79,34 @@ public class SchemaConfiguration extends Configuration implements Serializable {
}
}
/**
 * Build a SolrInputDocument from a SolrDocument.
 * Use this when a document read from the search index shall be modified and indexed again.
 * It is meant as a replacement for ClientUtils.toSolrInputDocument: a field is only copied
 * if it is enabled in the local Solr schema and is not listed in omitFields, so that fields
 * which are created automatically during the indexing process can be left out.
 * @param doc the solr document to read the fields from
 * @param omitFields names of fields which shall not be copied; null means that no field is omitted
 * @return a new solr input document containing the copied fields, each added with boost 1.0
 */
public SolrInputDocument toSolrInputDocument(final SolrDocument doc, Set<String> omitFields) {
    final SolrInputDocument inputDocument = new SolrInputDocument();
    for (final String fieldName : doc.getFieldNames()) {
        if (!this.contains(fieldName)) {
            continue; // field is not enabled in the local Solr schema
        }
        if (omitFields != null && omitFields.contains(fieldName)) {
            continue; // caller asked to leave this field out
        }
        inputDocument.addField(fieldName, doc.getFieldValue(fieldName), 1.0f);
    }
    return inputDocument;
}
/**
 * Build a SolrDocument from a SolrInputDocument.
 * A field is only copied if it is enabled in the local Solr schema and is not listed in omitFields.
 * @param doc the solr input document to read the fields from
 * @param omitFields names of fields which shall not be copied; null means that no field is omitted
 * @return a new solr document containing the copied fields
 */
public SolrDocument toSolrDocument(final SolrInputDocument doc, Set<String> omitFields) {
    final SolrDocument result = new SolrDocument();
    for (final SolrInputField inputField : doc) {
        final String fieldName = inputField.getName();
        if (!this.contains(fieldName)) {
            continue; // field is not enabled in the local Solr schema
        }
        if (omitFields != null && omitFields.contains(fieldName)) {
            continue; // caller asked to leave this field out
        }
        result.setField(fieldName, inputField.getValue());
    }
    return result;
}
public boolean postprocessing_doublecontent(Segment segment, Set<String> uniqueURLs, SolrInputDocument sid, DigestURL url) {
boolean changed = false;
// FIND OUT IF THIS IS A DOUBLE DOCUMENT

View File

@ -48,8 +48,6 @@ import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.CrawlQueues;
import net.yacy.crawler.data.NoticedURL;
import net.yacy.crawler.data.ResultURLs;
import net.yacy.crawler.data.ResultURLs.EventOrigin;
import net.yacy.crawler.retrieval.FTPLoader;
import net.yacy.crawler.retrieval.HTTPLoader;
import net.yacy.crawler.retrieval.Request;

View File

@ -84,8 +84,8 @@ import net.yacy.search.query.QueryParams;
import net.yacy.search.schema.WebgraphConfiguration.Subgraph;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
public class CollectionConfiguration extends SchemaConfiguration implements Serializable {
@ -169,32 +169,12 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
omitFields.add(CollectionSchema.coordinate_p_1_coordinate.getSolrFieldName());
}
/**
 * Convert a SolrDocument to a SolrInputDocument.
 * This is useful if a document from the search index shall be modified and indexed again.
 * This shall be used as replacement of ClientUtils.toSolrInputDocument because we remove some fields
 * which are created automatically during the indexing process.
 * The copy itself is done by the two-argument overload, called with this class' omitFields set,
 * so there is only one implementation of the field filter.
 * @param doc the solr document
 * @return a solr input document
 */
public SolrInputDocument toSolrInputDocument(final SolrDocument doc) {
    return toSolrInputDocument(doc, omitFields);
}
/**
 * Convert a SolrInputDocument to a SolrDocument.
 * The copy itself is done by the two-argument overload, called with this class' omitFields set,
 * so there is only one implementation of the field filter.
 * @param doc the solr input document
 * @return a solr document
 */
public SolrDocument toSolrDocument(final SolrInputDocument doc) {
    return toSolrDocument(doc, omitFields);
}
/**
@ -904,16 +884,18 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
public int postprocessing(final Segment segment, String harvestkey) {
if (!this.contains(CollectionSchema.process_sxt)) return 0;
if (!segment.connectedCitation()) return 0;
SolrConnector connector = segment.fulltext().getDefaultConnector();
connector.commit(true); // make sure that we have latest information that can be found
SolrConnector collectionConnector = segment.fulltext().getDefaultConnector();
SolrConnector webgraphConnector = segment.fulltext().getWebgraphConnector();
collectionConnector.commit(true); // make sure that we have latest information that can be found
ReferenceReportCache rrCache = segment.getReferenceReportCache();
Map<byte[], CRV> ranking = new TreeMap<byte[], CRV>(Base64Order.enhancedCoder);
ReversibleScoreMap<String> hostscore = null;
try {
// collect hosts from index which shall take part in citation computation
ReversibleScoreMap<String> hostscore = connector.getFacets(
hostscore = collectionConnector.getFacets(
(harvestkey == null ? "" : CollectionSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ") +
CollectionSchema.process_sxt.getSolrFieldName() + ":" + ProcessType.CITATION.toString(),
10000, CollectionSchema.host_s.getSolrFieldName()).get(CollectionSchema.host_s.getSolrFieldName());
10000000, CollectionSchema.host_s.getSolrFieldName()).get(CollectionSchema.host_s.getSolrFieldName());
if (hostscore == null) hostscore = new ClusteredScoreMap<String>();
// for each host, do a citation rank computation
for (String host: hostscore.keyList(true)) {
@ -931,14 +913,49 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
ranking.putAll(crn); // accumulate this here for usage in document update later
}
} catch (final IOException e2) {
hostscore = new ClusteredScoreMap<String>();
}
// process all documents
BlockingQueue<SolrDocument> docs = connector.concurrentDocumentsByQuery(
// process all documents at the webgraph for the outgoing links of this document
SolrDocument doc;
if (webgraphConnector != null) {
for (String host: hostscore.keyList(true)) {
if (hostscore.get(host) <= 0) continue;
// select all webgraph edges and modify their cr value
BlockingQueue<SolrDocument> docs = webgraphConnector.concurrentDocumentsByQuery(
WebgraphSchema.source_host_s.getSolrFieldName() + ":\"" + host + "\"",
0, 10000000, 60000, 50);
try {
while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
boolean changed = false;
SolrInputDocument sid = segment.fulltext().getWebgraphConfiguration().toSolrInputDocument(doc, null);
byte[] id = ASCII.getBytes((String) doc.getFieldValue(WebgraphSchema.source_id_s.getSolrFieldName()));
CRV crv = ranking.get(id);
if (crv != null) {
sid.setField(WebgraphSchema.source_cr_host_norm_i.getSolrFieldName(), crv.crn);
changed = true;
}
id = ASCII.getBytes((String) doc.getFieldValue(WebgraphSchema.target_id_s.getSolrFieldName()));
crv = ranking.get(id);
if (crv != null) {
sid.setField(WebgraphSchema.target_cr_host_norm_i.getSolrFieldName(), crv.crn);
changed = true;
}
if (changed) try {
webgraphConnector.add(sid);
} catch (SolrException e) {
} catch (IOException e) {
}
}
} catch (final InterruptedException e) {}
}
}
// process all documents in collection
BlockingQueue<SolrDocument> docs = collectionConnector.concurrentDocumentsByQuery(
(harvestkey == null ? "" : CollectionSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ") +
CollectionSchema.process_sxt.getSolrFieldName() + ":[* TO *]",
0, 10000, 60000, 50);
SolrDocument doc;
int proccount = 0, proccount_clickdepthchange = 0, proccount_referencechange = 0, proccount_citationchange = 0, proccount_uniquechange = 0;
Map<String, Long> hostExtentCache = new HashMap<String, Long>(); // a mapping from the host id to the number of documents which contain this host-id
Set<String> uniqueURLs = new HashSet<String>();
@ -992,7 +1009,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
// send back to index
//connector.deleteById(ASCII.String(id));
connector.add(sid);
collectionConnector.add(sid);
proccount++;
} catch (final Throwable e1) {
}

View File

@ -52,6 +52,7 @@ public enum WebgraphSchema implements SchemaDeclaration {
source_parameter_key_sxt(SolrType.string, true, true, true, false, false, "the keys from key-value pairs in the search part of the url (source)"),
source_parameter_value_sxt(SolrType.string, true, true, true, false, false, "the values from key-value pairs in the search part of the url (source)"),
source_clickdepth_i(SolrType.num_integer, true, true, false, false, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (source)"),
source_cr_host_norm_i(SolrType.num_integer, true, true, false, false, false, "copy of the citation rank norm value from the source link"),
source_host_s(SolrType.string, true, true, false, false, false, "host of the url (source)"),
source_host_id_s(SolrType.string, true, true, false, false, false, "id of the host (source)"),
@ -86,6 +87,7 @@ public enum WebgraphSchema implements SchemaDeclaration {
target_parameter_key_sxt(SolrType.string, true, true, true, false, false, "the keys from key-value pairs in the search part of the url (target)"),
target_parameter_value_sxt(SolrType.string, true, true, true, false, true, "the values from key-value pairs in the search part of the url (target)"),
target_clickdepth_i(SolrType.num_integer, true, true, false, false, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (target)"),
target_cr_host_norm_i(SolrType.num_integer, true, true, false, false, false, "copy of the citation rank norm value from the target link; this is only filled if the target host is identical to the source host"),
target_host_s(SolrType.string, true, true, false, false, true, "host of the url (target)"),
target_host_id_s(SolrType.string, true, true, false, false, false, "id of the host (target)"),