mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
More fixes in postprocessing: the complete document queue is now partitioned
(by a facet on a partitioning key) so that each query is smaller
This commit is contained in:
parent
2bc6199408
commit
327e83bfe7
|
@ -269,6 +269,7 @@ public abstract class AbstractSolrConnector implements SolrConnector {
|
|||
public void run() {
|
||||
this.setName("AbstractSolrConnector:concurrentIDsByQuery(" + querystring + ")");
|
||||
int o = offset;
|
||||
try {
|
||||
while (System.currentTimeMillis() < endtime) {
|
||||
try {
|
||||
SolrDocumentList sdl = getDocumentListByQuery(querystring, sort, o, Math.min(maxcount, pagesize_ids), CollectionSchema.id.getSolrFieldName());
|
||||
|
@ -285,10 +286,12 @@ public abstract class AbstractSolrConnector implements SolrConnector {
|
|||
break;
|
||||
}
|
||||
}
|
||||
} catch (Throwable e) {} finally {
|
||||
for (int i = 0; i < concurrency; i++) {
|
||||
try {queue.put(AbstractSolrConnector.POISON_ID);} catch (final InterruptedException e1) {}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
t.start();
|
||||
return queue;
|
||||
|
|
|
@ -300,7 +300,7 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen
|
|||
QueryResponse rsp;
|
||||
int retry = 0;
|
||||
Throwable error = null;
|
||||
while (retry++ < 60) {
|
||||
while (retry++ < 10) {
|
||||
try {
|
||||
if (q != null) Thread.currentThread().setName("solr query: q = " + q + (fq == null ? "" : ", fq = " + fq) + (sort == null ? "" : ", sort = " + sort) + "; retry = " + retry + "; fl = " + fl); // for debugging in Threaddump
|
||||
rsp = this.server.query(params);
|
||||
|
|
|
@ -1252,11 +1252,27 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
|||
// process all documents in collection
|
||||
final Map<String, Long> hostExtentCache = new HashMap<String, Long>(); // a mapping from the host id to the number of documents which contain this host-id
|
||||
final Set<String> uniqueURLs = new ConcurrentHashSet<String>(); // will be used in a concurrent environment
|
||||
try {
|
||||
final Set<String> omitFields = new HashSet<String>();
|
||||
omitFields.add(CollectionSchema.process_sxt.getSolrFieldName());
|
||||
omitFields.add(CollectionSchema.harvestkey_s.getSolrFieldName());
|
||||
final Collection<String> failids = new ArrayList<String>();
|
||||
final AtomicInteger countcheck = new AtomicInteger(0);
|
||||
final AtomicInteger proccount = new AtomicInteger();
|
||||
final AtomicInteger proccount_referencechange = new AtomicInteger();
|
||||
final AtomicInteger proccount_citationchange = new AtomicInteger();
|
||||
try {
|
||||
// partitioning of the index, get a facet for a partitioning key
|
||||
final long count = collectionConnector.getCountByQuery(collection1query);
|
||||
String partitioningKey = CollectionSchema.responsetime_i.getSolrFieldName();
|
||||
Map<String, ReversibleScoreMap<String>> partitioningFacet = collectionConnector.getFacets(collection1query, 100000, partitioningKey);
|
||||
ReversibleScoreMap<String> partitioning = partitioningFacet.get(partitioningKey);
|
||||
long emptyCount = collectionConnector.getCountByQuery(partitioningKey + ":\"\" AND (" + collection1query + ")");
|
||||
if (emptyCount > 0) partitioning.inc("", (int) emptyCount);
|
||||
for (String partitioningValue: partitioning) {
|
||||
String partitioningQuery = partitioningKey + ":\"" + partitioningValue + "\" AND (" + collection1query + ")";
|
||||
postprocessingActivity = "collecting " + partitioning.get(partitioningValue) + " documents from partition \"" + partitioningValue + "\" (averall " + count + ") from the collection for harvestkey " + harvestkey + ", partitioned by " + partitioningKey;
|
||||
|
||||
// start collection of documents
|
||||
final long start = System.currentTimeMillis();
|
||||
final int concurrency = Math.max(1, Math.min((int) (MemoryControl.available() / (100L * 1024L * 1024L)), Runtime.getRuntime().availableProcessors()));
|
||||
//final int concurrency = 1;
|
||||
|
@ -1264,10 +1280,9 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
|||
this.contains(CollectionSchema.references_internal_i) &&
|
||||
this.contains(CollectionSchema.references_external_i) &&
|
||||
this.contains(CollectionSchema.references_exthosts_i);
|
||||
postprocessingActivity = "collecting " + count + " documents from the collection for harvestkey " + harvestkey;
|
||||
ConcurrentLog.info("CollectionConfiguration", postprocessingActivity);
|
||||
final BlockingQueue<SolrDocument> docs = collectionConnector.concurrentDocumentsByQuery(
|
||||
collection1query,
|
||||
partitioningQuery,
|
||||
(this.contains(CollectionSchema.http_unique_b) || this.contains(CollectionSchema.www_unique_b)) ?
|
||||
CollectionSchema.host_subdomain_s.getSolrFieldName() + " asc," + // sort on subdomain to get hosts without subdomain first; that gives an opportunity to set www_unique_b flag to false
|
||||
CollectionSchema.url_protocol_s.getSolrFieldName() + " asc" // sort on protocol to get http before https; that gives an opportunity to set http_unique_b flag to false
|
||||
|
@ -1297,11 +1312,6 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
|||
CollectionSchema.inboundlinkscount_i.getSolrFieldName(),
|
||||
CollectionSchema.robots_i.getSolrFieldName()} :
|
||||
this.allFields());
|
||||
final AtomicInteger proccount = new AtomicInteger();
|
||||
final AtomicInteger proccount_referencechange = new AtomicInteger();
|
||||
final AtomicInteger proccount_citationchange = new AtomicInteger();
|
||||
final AtomicInteger countcheck = new AtomicInteger(0);
|
||||
final Collection<String> failids = new ArrayList<String>();
|
||||
final Thread rewriteThread[] = new Thread[concurrency];
|
||||
for (int rewrite_start = 0; rewrite_start < concurrency; rewrite_start++) {
|
||||
rewriteThread[rewrite_start] = new Thread() {
|
||||
|
@ -1403,15 +1413,19 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
|||
}
|
||||
// wait for termination
|
||||
for (int rewrite_start = 0; rewrite_start < concurrency; rewrite_start++) rewriteThread[rewrite_start].join();
|
||||
}
|
||||
|
||||
if (failids.size() > 0) {
|
||||
ConcurrentLog.info("CollectionConfiguration", "cleanup_processing: deleting " + failids.size() + " documents which have permanent execution fails");
|
||||
collectionConnector.deleteByIds(failids);
|
||||
}
|
||||
if (count != countcheck.get()) ConcurrentLog.warn("CollectionConfiguration", "ambiguous collection document count for harvestkey " + harvestkey + ": expected=" + count + ", counted=" + countcheck); // big gap for harvestkey = null
|
||||
if (count != countcheck.get()) ConcurrentLog.warn("CollectionConfiguration", "ambiguous collection document count for harvestkey " + harvestkey + ": expected=" + count + ", counted=" + countcheck + "; countquery=" + collection1query); // big gap for harvestkey = null
|
||||
ConcurrentLog.info("CollectionConfiguration", "cleanup_processing: re-calculated " + proccount + " new documents, " +
|
||||
proccount_referencechange + " reference-count changes, " +
|
||||
proccount_citationchange + " citation ranking changes.");
|
||||
|
||||
|
||||
|
||||
} catch (final InterruptedException e2) {
|
||||
ConcurrentLog.warn("CollectionConfiguration", e2.getMessage(), e2);
|
||||
} catch (IOException e3) {
|
||||
|
|
Loading…
Reference in New Issue
Block a user