Added two more fields, source_cr_host_norm_i and target_cr_host_norm_i, to the webgraph schema, and added a postprocessing step that copies all cr (citation rank) ranking attributes to the webgraph link edges associated with the postprocessed documents.
2024-09-19 00:01:41 +02:00 · 2013-09-27 16:57:05 +02:00 · 2013-09-27 16:57:05 +02:00 · b28d43decc
commit b28d43decc
parent a52f3a597e
5 changed files with 88 additions and 33 deletions
--- a/defaults/solr.webgraph.schema
+++ b/defaults/solr.webgraph.schema
@ -74,6 +74,10 @@ source_id_s
 ## depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (source)
 #source_clickdepth_i

+## copy of the citation rank norm value from the source link
+source_cr_host_norm_i
+
+
 ## host of the url (source)
 #source_host_s

@ -171,6 +175,10 @@ target_path_folders_sxt
 ## depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (target)
 #target_clickdepth_i

+## copy of the citation rank norm value from the target link; this is only filled if the target host is identical to the source host
+target_cr_host_norm_i
+
+
 ## host of the url (target)
 #target_host_s

--- a/source/net/yacy/cora/federate/solr/SchemaConfiguration.java
+++ b/source/net/yacy/cora/federate/solr/SchemaConfiguration.java
@ -33,6 +33,7 @@ import java.util.Set;
 import org.apache.solr.common.SolrDocument;
 import org.apache.solr.common.SolrDocumentList;
 import org.apache.solr.common.SolrInputDocument;
+import org.apache.solr.common.SolrInputField;

 import net.yacy.cora.document.encoding.ASCII;
 import net.yacy.cora.document.id.DigestURL;
@ -78,6 +79,34 @@ public class SchemaConfiguration extends Configuration implements Serializable {
        }
    }
    
+    /**
+     * Convert a SolrDocument to a SolrInputDocument.
+     * This is useful if a document from the search index shall be modified and indexed again.
+     * This shall be used as replacement of ClientUtils.toSolrInputDocument because we remove some fields
+     * which are created automatically during the indexing process.
+     * @param doc the solr document
+     * @return a solr input document
+     */
+    public SolrInputDocument toSolrInputDocument(final SolrDocument doc, Set<String> omitFields) {
+        SolrInputDocument sid = new SolrInputDocument();
+        for (String name: doc.getFieldNames()) {
+            if (this.contains(name) && (omitFields == null || !omitFields.contains(name))) { // check each field if enabled in local Solr schema
+                sid.addField(name, doc.getFieldValue(name), 1.0f);
+            }
+        }
+        return sid;
+    }
+    
+    public SolrDocument toSolrDocument(final SolrInputDocument doc, Set<String> omitFields) {
+        SolrDocument sd = new SolrDocument();
+        for (SolrInputField field: doc) {
+            if (this.contains(field.getName()) && (omitFields == null || !omitFields.contains(field.getName()))) { // check each field if enabled in local Solr schema
+                sd.setField(field.getName(), field.getValue());
+            }
+        }
+        return sd;
+    }
+    
    public boolean postprocessing_doublecontent(Segment segment, Set<String> uniqueURLs, SolrInputDocument sid, DigestURL url) {
        boolean changed = false;
        // FIND OUT IF THIS IS A DOUBLE DOCUMENT
--- a/source/net/yacy/crawler/CrawlStacker.java
+++ b/source/net/yacy/crawler/CrawlStacker.java
@ -48,8 +48,6 @@ import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.crawler.data.CrawlProfile;
 import net.yacy.crawler.data.CrawlQueues;
 import net.yacy.crawler.data.NoticedURL;
-import net.yacy.crawler.data.ResultURLs;
-import net.yacy.crawler.data.ResultURLs.EventOrigin;
 import net.yacy.crawler.retrieval.FTPLoader;
 import net.yacy.crawler.retrieval.HTTPLoader;
 import net.yacy.crawler.retrieval.Request;
--- a/source/net/yacy/search/schema/CollectionConfiguration.java
+++ b/source/net/yacy/search/schema/CollectionConfiguration.java
@ -84,8 +84,8 @@ import net.yacy.search.query.QueryParams;
 import net.yacy.search.schema.WebgraphConfiguration.Subgraph;

 import org.apache.solr.common.SolrDocument;
+import org.apache.solr.common.SolrException;
 import org.apache.solr.common.SolrInputDocument;
-import org.apache.solr.common.SolrInputField;


 public class CollectionConfiguration extends SchemaConfiguration implements Serializable {
@ -169,32 +169,12 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
        omitFields.add(CollectionSchema.coordinate_p_1_coordinate.getSolrFieldName());
    }
    
-    /**
-     * Convert a SolrDocument to a SolrInputDocument.
-     * This is useful if a document from the search index shall be modified and indexed again.
-     * This shall be used as replacement of ClientUtils.toSolrInputDocument because we remove some fields
-     * which are created automatically during the indexing process.
-     * @param doc the solr document
-     * @return a solr input document
-     */
    public SolrInputDocument toSolrInputDocument(final SolrDocument doc) {
-        SolrInputDocument sid = new SolrInputDocument();
-        for (String name: doc.getFieldNames()) {
-            if (this.contains(name) && !omitFields.contains(name)) { // check each field if enabled in local Solr schema
-                sid.addField(name, doc.getFieldValue(name), 1.0f);
-            }
-        }
-        return sid;
+        return toSolrInputDocument(doc, omitFields);
    }
    
    public SolrDocument toSolrDocument(final SolrInputDocument doc) {
-        SolrDocument sd = new SolrDocument();
-        for (SolrInputField field: doc) {
-            if (this.contains(field.getName()) && !omitFields.contains(field.getName())) { // check each field if enabled in local Solr schema
-                sd.setField(field.getName(), field.getValue());
-            }
-        }
-        return sd;
+        return toSolrDocument(doc, omitFields);
    }
    
    /**
@ -904,16 +884,18 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
    public int postprocessing(final Segment segment, String harvestkey) {
        if (!this.contains(CollectionSchema.process_sxt)) return 0;
        if (!segment.connectedCitation()) return 0;
-        SolrConnector connector = segment.fulltext().getDefaultConnector();
-        connector.commit(true); // make sure that we have latest information that can be found
+        SolrConnector collectionConnector = segment.fulltext().getDefaultConnector();
+        SolrConnector webgraphConnector = segment.fulltext().getWebgraphConnector();
+        collectionConnector.commit(true); // make sure that we have latest information that can be found
        ReferenceReportCache rrCache = segment.getReferenceReportCache();
        Map<byte[], CRV> ranking = new TreeMap<byte[], CRV>(Base64Order.enhancedCoder);
+        ReversibleScoreMap<String> hostscore = null;
        try {
            // collect hosts from index which shall take part in citation computation
-            ReversibleScoreMap<String> hostscore = connector.getFacets(
+            hostscore = collectionConnector.getFacets(
                    (harvestkey == null ? "" : CollectionSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ") +
                    CollectionSchema.process_sxt.getSolrFieldName() + ":" + ProcessType.CITATION.toString(),
-                    10000, CollectionSchema.host_s.getSolrFieldName()).get(CollectionSchema.host_s.getSolrFieldName());
+                    10000000, CollectionSchema.host_s.getSolrFieldName()).get(CollectionSchema.host_s.getSolrFieldName());
            if (hostscore == null) hostscore = new ClusteredScoreMap<String>();
            // for each host, do a citation rank computation
            for (String host: hostscore.keyList(true)) {
@ -931,14 +913,49 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
                ranking.putAll(crn); // accumulate this here for usage in document update later
            }
        } catch (final IOException e2) {
+            hostscore = new ClusteredScoreMap<String>();
        }
        
-        // process all documents
-        BlockingQueue<SolrDocument> docs = connector.concurrentDocumentsByQuery(
+        // process all documents at the webgraph for the outgoing links of this document
+        SolrDocument doc;
+        if (webgraphConnector != null) {
+            for (String host: hostscore.keyList(true)) {
+                if (hostscore.get(host) <= 0) continue;
+                // select all webgraph edges and modify their cr value
+                BlockingQueue<SolrDocument> docs = webgraphConnector.concurrentDocumentsByQuery(
+                        WebgraphSchema.source_host_s.getSolrFieldName() + ":\"" + host + "\"",
+                        0, 10000000, 60000, 50);
+                try {
+                    while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
+                        boolean changed = false;
+                        SolrInputDocument sid = segment.fulltext().getWebgraphConfiguration().toSolrInputDocument(doc, null);
+                        byte[] id = ASCII.getBytes((String) doc.getFieldValue(WebgraphSchema.source_id_s.getSolrFieldName()));
+                        CRV crv = ranking.get(id);
+                        if (crv != null) {
+                            sid.setField(WebgraphSchema.source_cr_host_norm_i.getSolrFieldName(), crv.crn);
+                            changed = true;
+                        }
+                        id = ASCII.getBytes((String) doc.getFieldValue(WebgraphSchema.target_id_s.getSolrFieldName()));
+                        crv = ranking.get(id);
+                        if (crv != null) {
+                            sid.setField(WebgraphSchema.target_cr_host_norm_i.getSolrFieldName(), crv.crn);
+                            changed = true;
+                        }
+                        if (changed) try {
+                            webgraphConnector.add(sid);
+                        } catch (SolrException e) {
+                        } catch (IOException e) {
+                       }
+                    }
+                } catch (final InterruptedException e) {}
+            }
+        }
+        
+        // process all documents in collection
+        BlockingQueue<SolrDocument> docs = collectionConnector.concurrentDocumentsByQuery(
                (harvestkey == null ? "" : CollectionSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ") +
                CollectionSchema.process_sxt.getSolrFieldName() + ":[* TO *]",
                0, 10000, 60000, 50);
-        SolrDocument doc;
        int proccount = 0, proccount_clickdepthchange = 0, proccount_referencechange = 0, proccount_citationchange = 0, proccount_uniquechange = 0;
        Map<String, Long> hostExtentCache = new HashMap<String, Long>(); // a mapping from the host id to the number of documents which contain this host-id
        Set<String> uniqueURLs = new HashSet<String>();
@ -992,7 +1009,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
                    
                    // send back to index
                    //connector.deleteById(ASCII.String(id));
-                    connector.add(sid);
+                    collectionConnector.add(sid);
+                    
                    proccount++;
                } catch (final Throwable e1) {
                }
--- a/source/net/yacy/search/schema/WebgraphSchema.java
+++ b/source/net/yacy/search/schema/WebgraphSchema.java
@ -52,6 +52,7 @@ public enum WebgraphSchema implements SchemaDeclaration {
    source_parameter_key_sxt(SolrType.string, true, true, true, false, false, "the keys from key-value pairs in the search part of the url (source)"),
    source_parameter_value_sxt(SolrType.string, true, true, true, false, false, "the values from key-value pairs in the search part of the url (source)"),
    source_clickdepth_i(SolrType.num_integer, true, true, false, false, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (source)"),
+    source_cr_host_norm_i(SolrType.num_integer, true, true, false, false, false, "copy of the citation rank norm value from the source link"),

    source_host_s(SolrType.string, true, true, false, false, false, "host of the url (source)"),
    source_host_id_s(SolrType.string, true, true, false, false, false, "id of the host (source)"),
@ -86,6 +87,7 @@ public enum WebgraphSchema implements SchemaDeclaration {
    target_parameter_key_sxt(SolrType.string, true, true, true, false, false, "the keys from key-value pairs in the search part of the url (target)"),
    target_parameter_value_sxt(SolrType.string, true, true, true, false, true, "the values from key-value pairs in the search part of the url (target)"),
    target_clickdepth_i(SolrType.num_integer, true, true, false, false, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (target)"),
+    target_cr_host_norm_i(SolrType.num_integer, true, true, false, false, false, "copy of the citation rank norm value from the target link; this is only filled if the target host is identical to the source host"),

    target_host_s(SolrType.string, true, true, false, false, true, "host of the url (target)"),
    target_host_id_s(SolrType.string, true, true, false, false, false, "id of the host (target)"),