more fixes in postprocessing: partitioning of the complete queue to enable smaller queries
Michael Peter Christen 2014-10-31 17:30:24 +01:00
parent 2bc6199408
commit 327e83bfe7
3 changed files with 175 additions and 158 deletions
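
The core idea of the change, as a minimal sketch rather than YaCy's exact code: instead of streaming the complete result set of one large query, postprocessing first asks Solr for a facet over a partitioning field and then issues one small query per facet value. The field name responsetime_i and the facet limit are taken from the diff below; the SolrJ client setup is an assumption.

import java.util.ArrayList;
import java.util.List;

import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.response.FacetField;
import org.apache.solr.client.solrj.response.QueryResponse;

public class PartitionedQuerySketch {

    // build one subquery per distinct value of the partitioning field
    public static List<String> partitionQueries(SolrClient client, String baseQuery, String partitioningKey) throws Exception {
        SolrQuery q = new SolrQuery(baseQuery);
        q.setRows(0);                 // only the facet is needed, no documents
        q.setFacet(true);
        q.addFacetField(partitioningKey);
        q.setFacetLimit(100000);      // same limit as in the diff below
        QueryResponse rsp = client.query(q);
        FacetField facet = rsp.getFacetField(partitioningKey);
        List<String> subqueries = new ArrayList<>();
        for (FacetField.Count c : facet.getValues()) {
            // each subquery selects exactly one partition of the full result set
            subqueries.add(partitioningKey + ":\"" + c.getName() + "\" AND (" + baseQuery + ")");
        }
        return subqueries;
    }
}

Note that documents with an empty value in the partitioning field do not appear in the facet, which is why the diff below counts them with a separate query and adds an artificial empty-string partition.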

AbstractSolrConnector.java

@@ -269,6 +269,7 @@ public abstract class AbstractSolrConnector implements SolrConnector {
public void run() {
this.setName("AbstractSolrConnector:concurrentIDsByQuery(" + querystring + ")");
int o = offset;
try {
while (System.currentTimeMillis() < endtime) {
try {
SolrDocumentList sdl = getDocumentListByQuery(querystring, sort, o, Math.min(maxcount, pagesize_ids), CollectionSchema.id.getSolrFieldName());
@@ -285,10 +286,12 @@ public abstract class AbstractSolrConnector implements SolrConnector {
break;
}
}
} catch (Throwable e) {} finally {
for (int i = 0; i < concurrency; i++) {
try {queue.put(AbstractSolrConnector.POISON_ID);} catch (final InterruptedException e1) {}
}
}
}
};
t.start();
return queue;
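
The two hunks above wrap the producer loop in try/finally so that the poison pills reach every consumer even when the loop dies with an exception. A self-contained sketch of that pattern, with illustrative names rather than YaCy's:

import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;

public class PoisonPillSketch {
    static final String POISON = "";          // sentinel telling consumers to stop

    public static void main(String[] args) {
        final BlockingQueue<String> queue = new ArrayBlockingQueue<>(100);
        final int concurrency = 2;            // number of consumers that must terminate

        new Thread(() -> {
            try {
                for (int i = 0; i < 10; i++) queue.put("id-" + i);
            } catch (Throwable e) {
                // swallowed: shutdown of the consumers must still happen
            } finally {
                // without the finally block an exception above would leave
                // every consumer blocked in queue.take() forever
                for (int i = 0; i < concurrency; i++) {
                    try { queue.put(POISON); } catch (InterruptedException e1) {}
                }
            }
        }).start();
    }
}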

SolrServerConnector.java

@@ -300,7 +300,7 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen
QueryResponse rsp;
int retry = 0;
Throwable error = null;
while (retry++ < 60) {
while (retry++ < 10) {
try {
if (q != null) Thread.currentThread().setName("solr query: q = " + q + (fq == null ? "" : ", fq = " + fq) + (sort == null ? "" : ", sort = " + sort) + "; retry = " + retry + "; fl = " + fl); // for debugging in Threaddump
rsp = this.server.query(params);
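
The hunk above lowers the retry bound from 60 to 10 attempts. The surrounding loop is a standard bounded-retry pattern; a generic sketch, with the query call abstracted into a hypothetical interface and the last error rethrown once the attempts are exhausted:

public class RetrySketch {
    interface Query<T> { T run() throws Exception; }

    static <T> T retry(Query<T> query, int maxRetries) throws Exception {
        Exception error = null;
        int retry = 0;
        while (retry++ < maxRetries) {
            try {
                return query.run();       // success ends the loop immediately
            } catch (Exception e) {
                error = e;                // remember the most recent failure
            }
        }
        if (error != null) throw error;   // all attempts failed
        throw new IllegalArgumentException("maxRetries must be > 0");
    }
}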

CollectionConfiguration.java

@@ -1252,11 +1252,27 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
// process all documents in collection
final Map<String, Long> hostExtentCache = new HashMap<String, Long>(); // a mapping from the host id to the number of documents which contain this host-id
final Set<String> uniqueURLs = new ConcurrentHashSet<String>(); // will be used in a concurrent environment
try {
final Set<String> omitFields = new HashSet<String>();
omitFields.add(CollectionSchema.process_sxt.getSolrFieldName());
omitFields.add(CollectionSchema.harvestkey_s.getSolrFieldName());
final Collection<String> failids = new ArrayList<String>();
final AtomicInteger countcheck = new AtomicInteger(0);
final AtomicInteger proccount = new AtomicInteger();
final AtomicInteger proccount_referencechange = new AtomicInteger();
final AtomicInteger proccount_citationchange = new AtomicInteger();
try {
// partitioning of the index, get a facet for a partitioning key
final long count = collectionConnector.getCountByQuery(collection1query);
String partitioningKey = CollectionSchema.responsetime_i.getSolrFieldName();
Map<String, ReversibleScoreMap<String>> partitioningFacet = collectionConnector.getFacets(collection1query, 100000, partitioningKey);
ReversibleScoreMap<String> partitioning = partitioningFacet.get(partitioningKey);
long emptyCount = collectionConnector.getCountByQuery(partitioningKey + ":\"\" AND (" + collection1query + ")");
if (emptyCount > 0) partitioning.inc("", (int) emptyCount);
for (String partitioningValue: partitioning) {
String partitioningQuery = partitioningKey + ":\"" + partitioningValue + "\" AND (" + collection1query + ")";
postprocessingActivity = "collecting " + partitioning.get(partitioningValue) + " documents from partition \"" + partitioningValue + "\" (averall " + count + ") from the collection for harvestkey " + harvestkey + ", partitioned by " + partitioningKey;
// start collection of documents
final long start = System.currentTimeMillis();
final int concurrency = Math.max(1, Math.min((int) (MemoryControl.available() / (100L * 1024L * 1024L)), Runtime.getRuntime().availableProcessors()));
//final int concurrency = 1;
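
The concurrency heuristic above sizes the worker pool as one thread per 100 MB of free memory, capped by the number of CPU cores and floored at one. A sketch of the same formula; using Runtime.freeMemory() instead of YaCy's MemoryControl.available() is an assumption:

public class ConcurrencySketch {
    public static int concurrency() {
        long available = Runtime.getRuntime().freeMemory();   // stand-in for MemoryControl.available()
        int byMemory = (int) (available / (100L * 1024L * 1024L));
        int byCores = Runtime.getRuntime().availableProcessors();
        return Math.max(1, Math.min(byMemory, byCores));      // at least 1, at most one per core
    }
}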
@@ -1264,10 +1280,9 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
this.contains(CollectionSchema.references_internal_i) &&
this.contains(CollectionSchema.references_external_i) &&
this.contains(CollectionSchema.references_exthosts_i);
postprocessingActivity = "collecting " + count + " documents from the collection for harvestkey " + harvestkey;
ConcurrentLog.info("CollectionConfiguration", postprocessingActivity);
final BlockingQueue<SolrDocument> docs = collectionConnector.concurrentDocumentsByQuery(
collection1query,
partitioningQuery,
(this.contains(CollectionSchema.http_unique_b) || this.contains(CollectionSchema.www_unique_b)) ?
CollectionSchema.host_subdomain_s.getSolrFieldName() + " asc," + // sort on subdomain to get hosts without subdomain first; that gives an opportunity to set www_unique_b flag to false
CollectionSchema.url_protocol_s.getSolrFieldName() + " asc" // sort on protocol to get http before https; that gives an opportunity to set http_unique_b flag to false
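
The sort order above exists so that the preferred URL variant (host without subdomain, http before https) arrives first; a single pass with a seen-set can then flag every later variant as non-unique. A minimal sketch of that one-pass idea, with illustrative names in place of the http_unique_b/www_unique_b schema fields:

import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class UniqueFlagSketch {
    // docs must be sorted so that the preferred variant of each page comes first
    public static void markUnique(List<String[]> docs) {
        Set<String> seen = new HashSet<>();
        for (String[] doc : docs) {
            String hostAndPath = doc[0];             // e.g. "example.com/page", variant-independent key
            boolean unique = seen.add(hostAndPath);  // true only for the first (preferred) variant
            doc[1] = Boolean.toString(unique);       // stand-in for the http_unique_b / www_unique_b flag
        }
    }
}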
@@ -1297,11 +1312,6 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
CollectionSchema.inboundlinkscount_i.getSolrFieldName(),
CollectionSchema.robots_i.getSolrFieldName()} :
this.allFields());
final AtomicInteger proccount = new AtomicInteger();
final AtomicInteger proccount_referencechange = new AtomicInteger();
final AtomicInteger proccount_citationchange = new AtomicInteger();
final AtomicInteger countcheck = new AtomicInteger(0);
final Collection<String> failids = new ArrayList<String>();
final Thread rewriteThread[] = new Thread[concurrency];
for (int rewrite_start = 0; rewrite_start < concurrency; rewrite_start++) {
rewriteThread[rewrite_start] = new Thread() {
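
This hunk moves the counters to an outer scope and spawns the rewrite workers. The consumer side of the queue set up earlier looks roughly like this: each worker takes documents until it receives the poison sentinel. A sketch with illustrative names:

import java.util.concurrent.BlockingQueue;
import java.util.concurrent.atomic.AtomicInteger;

public class RewriteWorkersSketch {
    public static Thread[] startWorkers(final BlockingQueue<String> docs, final String poison, int concurrency, final AtomicInteger proccount) {
        Thread[] rewriteThread = new Thread[concurrency];
        for (int i = 0; i < concurrency; i++) {
            rewriteThread[i] = new Thread(() -> {
                try {
                    String doc;
                    while ((doc = docs.take()) != poison) {   // sentinel is compared by identity
                        // ... rewrite and re-index the document here ...
                        proccount.incrementAndGet();
                    }
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                }
            });
            rewriteThread[i].start();
        }
        return rewriteThread;
    }
}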
@@ -1403,15 +1413,19 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
}
// wait for termination
for (int rewrite_start = 0; rewrite_start < concurrency; rewrite_start++) rewriteThread[rewrite_start].join();
}
if (failids.size() > 0) {
ConcurrentLog.info("CollectionConfiguration", "cleanup_processing: deleting " + failids.size() + " documents which have permanent execution fails");
collectionConnector.deleteByIds(failids);
}
if (count != countcheck.get()) ConcurrentLog.warn("CollectionConfiguration", "ambiguous collection document count for harvestkey " + harvestkey + ": expected=" + count + ", counted=" + countcheck); // big gap for harvestkey = null
if (count != countcheck.get()) ConcurrentLog.warn("CollectionConfiguration", "ambiguous collection document count for harvestkey " + harvestkey + ": expected=" + count + ", counted=" + countcheck + "; countquery=" + collection1query); // big gap for harvestkey = null
ConcurrentLog.info("CollectionConfiguration", "cleanup_processing: re-calculated " + proccount + " new documents, " +
proccount_referencechange + " reference-count changes, " +
proccount_citationchange + " citation ranking changes.");
} catch (final InterruptedException e2) {
ConcurrentLog.warn("CollectionConfiguration", e2.getMessage(), e2);
} catch (IOException e3) {
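
The final hunk does the end-of-run bookkeeping: join all workers, delete documents whose processing failed permanently, and log a warning when the expected and the actually processed counts disagree. A compact sketch of that termination sequence, with hypothetical names:

import java.util.concurrent.atomic.AtomicInteger;

public class TerminationSketch {
    public static void awaitAndCheck(Thread[] workers, long expected, AtomicInteger countcheck) throws InterruptedException {
        for (Thread w : workers) w.join();   // wait for termination
        if (expected != countcheck.get()) {
            // a mismatch is only logged; postprocessing keeps its partial results
            System.err.println("ambiguous document count: expected=" + expected + ", counted=" + countcheck.get());
        }
    }
}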