more fixes in postprocessing: partitioning of the complete queue to enable smaller queries
Michael Peter Christen 2014-10-31 17:30:24 +01:00
parent 2bc6199408
commit 327e83bfe7
3 changed files with 175 additions and 158 deletions
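
The core idea of the change, as a minimal sketch rather than YaCy's exact code: instead of streaming the complete result set of one large query, postprocessing first asks Solr for a facet over a partitioning field and then issues one small query per facet value. The field name responsetime_i and the facet limit are taken from the diff below; the SolrJ client setup is an assumption.

import java.util.ArrayList;
import java.util.List;

import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.response.FacetField;
import org.apache.solr.client.solrj.response.QueryResponse;

public class PartitionedQuerySketch {

    // build one subquery per distinct value of the partitioning field
    public static List<String> partitionQueries(SolrClient client, String baseQuery, String partitioningKey) throws Exception {
        SolrQuery q = new SolrQuery(baseQuery);
        q.setRows(0);                 // only the facet is needed, no documents
        q.setFacet(true);
        q.addFacetField(partitioningKey);
        q.setFacetLimit(100000);      // same limit as in the diff below
        QueryResponse rsp = client.query(q);
        FacetField facet = rsp.getFacetField(partitioningKey);
        List<String> subqueries = new ArrayList<>();
        for (FacetField.Count c : facet.getValues()) {
            // each subquery selects exactly one partition of the full result set
            subqueries.add(partitioningKey + ":\"" + c.getName() + "\" AND (" + baseQuery + ")");
        }
        return subqueries;
    }
}

Note that documents with an empty value in the partitioning field do not appear in the facet, which is why the diff below counts them with a separate query and adds an artificial empty-string partition.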

AbstractSolrConnector.java

@@ -269,6 +269,7 @@ public abstract class AbstractSolrConnector implements SolrConnector {
public void run() {
this.setName("AbstractSolrConnector:concurrentIDsByQuery(" + querystring + ")");
int o = offset;
try {
while (System.currentTimeMillis() < endtime) {
try {
SolrDocumentList sdl = getDocumentListByQuery(querystring, sort, o, Math.min(maxcount, pagesize_ids), CollectionSchema.id.getSolrFieldName());
@@ -285,10 +286,12 @@ public abstract class AbstractSolrConnector implements SolrConnector {
break;
}
}
} catch (Throwable e) {} finally {
for (int i = 0; i < concurrency; i++) {
try {queue.put(AbstractSolrConnector.POISON_ID);} catch (final InterruptedException e1) {}
}
}
}
};
t.start();
return queue;
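
The two hunks above wrap the producer loop in try/finally so that the poison pills reach every consumer even when the loop dies with an exception. A self-contained sketch of that pattern, with illustrative names rather than YaCy's:

import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;

public class PoisonPillSketch {
    static final String POISON = "";          // sentinel telling consumers to stop

    public static void main(String[] args) {
        final BlockingQueue<String> queue = new ArrayBlockingQueue<>(100);
        final int concurrency = 2;            // number of consumers that must terminate

        new Thread(() -> {
            try {
                for (int i = 0; i < 10; i++) queue.put("id-" + i);
            } catch (Throwable e) {
                // swallowed: shutdown of the consumers must still happen
            } finally {
                // without the finally block an exception above would leave
                // every consumer blocked in queue.take() forever
                for (int i = 0; i < concurrency; i++) {
                    try { queue.put(POISON); } catch (InterruptedException e1) {}
                }
            }
        }).start();
    }
}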

SolrServerConnector.java

@@ -300,7 +300,7 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen
QueryResponse rsp;
int retry = 0;
Throwable error = null;
while (retry++ < 60) {
while (retry++ < 10) {
try {
if (q != null) Thread.currentThread().setName("solr query: q = " + q + (fq == null ? "" : ", fq = " + fq) + (sort == null ? "" : ", sort = " + sort) + "; retry = " + retry + "; fl = " + fl); // for debugging in Threaddump
rsp = this.server.query(params);
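
The hunk above lowers the retry bound from 60 to 10 attempts. The surrounding loop is a standard bounded-retry pattern; a generic sketch, with the query call abstracted into a hypothetical interface and the last error rethrown once the attempts are exhausted:

public class RetrySketch {
    interface Query<T> { T run() throws Exception; }

    static <T> T retry(Query<T> query, int maxRetries) throws Exception {
        Exception error = null;
        int retry = 0;
        while (retry++ < maxRetries) {
            try {
                return query.run();       // success ends the loop immediately
            } catch (Exception e) {
                error = e;                // remember the most recent failure
            }
        }
        if (error != null) throw error;   // all attempts failed
        throw new IllegalArgumentException("maxRetries must be > 0");
    }
}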

CollectionConfiguration.java

@@ -1252,11 +1252,27 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
// process all documents in collection
final Map<String, Long> hostExtentCache = new HashMap<String, Long>(); // a mapping from the host id to the number of documents which contain this host-id
final Set<String> uniqueURLs = new ConcurrentHashSet<String>(); // will be used in a concurrent environment
try {
final Set<String> omitFields = new HashSet<String>();
omitFields.add(CollectionSchema.process_sxt.getSolrFieldName());
omitFields.add(CollectionSchema.harvestkey_s.getSolrFieldName());
final Collection<String> failids = new ArrayList<String>();
final AtomicInteger countcheck = new AtomicInteger(0);
final AtomicInteger proccount = new AtomicInteger();
final AtomicInteger proccount_referencechange = new AtomicInteger();
final AtomicInteger proccount_citationchange = new AtomicInteger();
try {
// partitioning of the index, get a facet for a partitioning key
final long count = collectionConnector.getCountByQuery(collection1query);
String partitioningKey = CollectionSchema.responsetime_i.getSolrFieldName();
Map<String, ReversibleScoreMap<String>> partitioningFacet = collectionConnector.getFacets(collection1query, 100000, partitioningKey);
ReversibleScoreMap<String> partitioning = partitioningFacet.get(partitioningKey);
long emptyCount = collectionConnector.getCountByQuery(partitioningKey + ":\"\" AND (" + collection1query + ")");
if (emptyCount > 0) partitioning.inc("", (int) emptyCount);
for (String partitioningValue: partitioning) {
String partitioningQuery = partitioningKey + ":\"" + partitioningValue + "\" AND (" + collection1query + ")";
postprocessingActivity = "collecting " + partitioning.get(partitioningValue) + " documents from partition \"" + partitioningValue + "\" (averall " + count + ") from the collection for harvestkey " + harvestkey + ", partitioned by " + partitioningKey;
// start collection of documents
final long start = System.currentTimeMillis();
final int concurrency = Math.max(1, Math.min((int) (MemoryControl.available() / (100L * 1024L * 1024L)), Runtime.getRuntime().availableProcessors()));
//final int concurrency = 1;
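
The concurrency heuristic above sizes the worker pool as one thread per 100 MB of free memory, capped by the number of CPU cores and floored at one. A sketch of the same formula; using Runtime.freeMemory() instead of YaCy's MemoryControl.available() is an assumption:

public class ConcurrencySketch {
    public static int concurrency() {
        long available = Runtime.getRuntime().freeMemory();   // stand-in for MemoryControl.available()
        int byMemory = (int) (available / (100L * 1024L * 1024L));
        int byCores = Runtime.getRuntime().availableProcessors();
        return Math.max(1, Math.min(byMemory, byCores));      // at least 1, at most one per core
    }
}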
@@ -1264,10 +1280,9 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
this.contains(CollectionSchema.references_internal_i) &&
this.contains(CollectionSchema.references_external_i) &&
this.contains(CollectionSchema.references_exthosts_i);
postprocessingActivity = "collecting " + count + " documents from the collection for harvestkey " + harvestkey;
ConcurrentLog.info("CollectionConfiguration", postprocessingActivity);
final BlockingQueue<SolrDocument> docs = collectionConnector.concurrentDocumentsByQuery(
collection1query,
partitioningQuery,
(this.contains(CollectionSchema.http_unique_b) || this.contains(CollectionSchema.www_unique_b)) ?
CollectionSchema.host_subdomain_s.getSolrFieldName() + " asc," + // sort on subdomain to get hosts without subdomain first; that gives an opportunity to set www_unique_b flag to false
CollectionSchema.url_protocol_s.getSolrFieldName() + " asc" // sort on protocol to get http before https; that gives an opportunity to set http_unique_b flag to false
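
The sort order above exists so that the preferred URL variant (host without subdomain, http before https) arrives first; a single pass with a seen-set can then flag every later variant as non-unique. A minimal sketch of that one-pass idea, with illustrative names in place of the http_unique_b/www_unique_b schema fields:

import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class UniqueFlagSketch {
    // docs must be sorted so that the preferred variant of each page comes first
    public static void markUnique(List<String[]> docs) {
        Set<String> seen = new HashSet<>();
        for (String[] doc : docs) {
            String hostAndPath = doc[0];             // e.g. "example.com/page", variant-independent key
            boolean unique = seen.add(hostAndPath);  // true only for the first (preferred) variant
            doc[1] = Boolean.toString(unique);       // stand-in for the http_unique_b / www_unique_b flag
        }
    }
}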
@@ -1297,11 +1312,6 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
CollectionSchema.inboundlinkscount_i.getSolrFieldName(),
CollectionSchema.robots_i.getSolrFieldName()} :
this.allFields());
final AtomicInteger proccount = new AtomicInteger();
final AtomicInteger proccount_referencechange = new AtomicInteger();
final AtomicInteger proccount_citationchange = new AtomicInteger();
final AtomicInteger countcheck = new AtomicInteger(0);
final Collection<String> failids = new ArrayList<String>();
final Thread rewriteThread[] = new Thread[concurrency];
for (int rewrite_start = 0; rewrite_start < concurrency; rewrite_start++) {
rewriteThread[rewrite_start] = new Thread() {
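
This hunk moves the counters to an outer scope and spawns the rewrite workers. The consumer side of the queue set up earlier looks roughly like this: each worker takes documents until it receives the poison sentinel. A sketch with illustrative names:

import java.util.concurrent.BlockingQueue;
import java.util.concurrent.atomic.AtomicInteger;

public class RewriteWorkersSketch {
    public static Thread[] startWorkers(final BlockingQueue<String> docs, final String poison, int concurrency, final AtomicInteger proccount) {
        Thread[] rewriteThread = new Thread[concurrency];
        for (int i = 0; i < concurrency; i++) {
            rewriteThread[i] = new Thread(() -> {
                try {
                    String doc;
                    while ((doc = docs.take()) != poison) {   // sentinel is compared by identity
                        // ... rewrite and re-index the document here ...
                        proccount.incrementAndGet();
                    }
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                }
            });
            rewriteThread[i].start();
        }
        return rewriteThread;
    }
}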
@@ -1403,15 +1413,19 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
}
// wait for termination
for (int rewrite_start = 0; rewrite_start < concurrency; rewrite_start++) rewriteThread[rewrite_start].join();
}
if (failids.size() > 0) {
ConcurrentLog.info("CollectionConfiguration", "cleanup_processing: deleting " + failids.size() + " documents which have permanent execution fails");
collectionConnector.deleteByIds(failids);
}
if (count != countcheck.get()) ConcurrentLog.warn("CollectionConfiguration", "ambiguous collection document count for harvestkey " + harvestkey + ": expected=" + count + ", counted=" + countcheck); // big gap for harvestkey = null
if (count != countcheck.get()) ConcurrentLog.warn("CollectionConfiguration", "ambiguous collection document count for harvestkey " + harvestkey + ": expected=" + count + ", counted=" + countcheck + "; countquery=" + collection1query); // big gap for harvestkey = null
ConcurrentLog.info("CollectionConfiguration", "cleanup_processing: re-calculated " + proccount + " new documents, " +
proccount_referencechange + " reference-count changes, " +
proccount_citationchange + " citation ranking changes.");
} catch (final InterruptedException e2) {
ConcurrentLog.warn("CollectionConfiguration", e2.getMessage(), e2);
} catch (IOException e3) {
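
The final hunk does the end-of-run bookkeeping: join all workers, delete documents whose processing failed permanently, and log a warning when the expected and the actually processed counts disagree. A compact sketch of that termination sequence, with hypothetical names:

import java.util.concurrent.atomic.AtomicInteger;

public class TerminationSketch {
    public static void awaitAndCheck(Thread[] workers, long expected, AtomicInteger countcheck) throws InterruptedException {
        for (Thread w : workers) w.join();   // wait for termination
        if (expected != countcheck.get()) {
            // a mismatch is only logged; postprocessing keeps its partial results
            System.err.println("ambiguous document count: expected=" + expected + ", counted=" + countcheck.get());
        }
    }
}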