no sorting if http/www unique fields are not demanded (makes query

faster) and some code restructuring
This commit is contained in:
Michael Peter Christen 2014-08-04 12:59:38 +02:00
parent 1609763be5
commit 338f574bdc

View File

@ -982,14 +982,12 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
/**
 * Builds the query string used to select documents of the collection index
 * that have a value in the process_sxt field.
 *
 * The result is the process_sxt field name followed by
 * AbstractSolrConnector.CATCHALL_DTERM (presumably a match-any term suffix;
 * its value is not visible here). If a harvest key is given and the default
 * configuration contains the harvestkey_s field, the query is prefixed with
 * harvestkey_s:"&lt;harvestkey&gt;" AND so that only documents of that harvest
 * are selected.
 *
 * NOTE(review): the harvest key is concatenated into the quoted term without
 * any escaping; verify that callers never pass a key containing a double quote.
 * NOTE(review): this view lists the else-branch of the conditional twice
 * (split over two lines and joined on one line); it looks like the old and new
 * layout of the same expression, not two separate operands.
 *
 * @param segment the segment whose default (collection) configuration is checked for the harvestkey_s field
 * @param harvestkey the harvest key to restrict the query to, or null for no restriction
 * @return the query string
 */
public static final String collection1query(final Segment segment, final String harvestkey) {
// no harvest-key restriction when no key is given or when the schema does not carry the harvestkey_s field
return (harvestkey == null || !segment.fulltext().getDefaultConfiguration().contains(CollectionSchema.harvestkey_s) ?
"" :
CollectionSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ") +
"" : CollectionSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ") +
CollectionSchema.process_sxt.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM;
}
/**
 * Builds the query string used to select documents of the webgraph index
 * that have a value in the process_sxt field.
 *
 * The result is the webgraph process_sxt field name followed by
 * AbstractSolrConnector.CATCHALL_DTERM (presumably a match-any term suffix;
 * its value is not visible here). If a harvest key is given and the webgraph
 * configuration contains the harvestkey_s field, the query is prefixed with
 * harvestkey_s:"&lt;harvestkey&gt;" AND so that only documents of that harvest
 * are selected.
 *
 * NOTE(review): the harvest key is concatenated into the quoted term without
 * any escaping; verify that callers never pass a key containing a double quote.
 * NOTE(review): this view lists the else-branch of the conditional twice
 * (split over two lines and joined on one line); it looks like the old and new
 * layout of the same expression, not two separate operands.
 *
 * @param segment the segment whose webgraph configuration is checked for the harvestkey_s field
 * @param harvestkey the harvest key to restrict the query to, or null for no restriction
 * @return the query string
 */
public static final String webgraphquery(final Segment segment, final String harvestkey) {
// no harvest-key restriction when no key is given or when the webgraph schema does not carry the harvestkey_s field
return (harvestkey == null || !segment.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.harvestkey_s) ?
"" :
WebgraphSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ") +
"" : WebgraphSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ") +
WebgraphSchema.process_sxt.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM;
}
@ -1242,9 +1240,10 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
ConcurrentLog.info("CollectionConfiguration", postprocessingActivity);
BlockingQueue<SolrDocument> docs = collectionConnector.concurrentDocumentsByQuery(
collection1query,
(this.contains(CollectionSchema.http_unique_b) || this.contains(CollectionSchema.www_unique_b)) ?
CollectionSchema.host_subdomain_s.getSolrFieldName() + " asc," + // sort on subdomain to get hosts without subdomain first; that gives an opportunity to set www_unique_b flag to false
CollectionSchema.url_protocol_s.getSolrFieldName() + " asc," + // sort on protocol to get http before https; that gives an opportunity to set http_unique_b flag to false
CollectionSchema.url_chars_i.getSolrFieldName() + " asc",
CollectionSchema.url_protocol_s.getSolrFieldName() + " asc" // sort on protocol to get http before https; that gives an opportunity to set http_unique_b flag to false
: null, // null sort is faster!
0, 100000000, Long.MAX_VALUE, 200, 1);
int countcheck = 0;
Collection<String> failids = new ArrayList<String>();
@ -1376,12 +1375,11 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
String urlhash = ASCII.String(url.hash());
String hostid = url.hosthash();
Conjunction con = new Conjunction();
con.addOperand(new Negation(new Literal(CollectionSchema.id, urlhash)));
con.addOperand(new Literal(CollectionSchema.host_id_s, hostid));
Disjunction dnf = new Disjunction();
uniquecheck: for (CollectionSchema[] checkfields: new CollectionSchema[][]{
CollectionSchema[][] doccheckschema = new CollectionSchema[][]{
{CollectionSchema.exact_signature_l, CollectionSchema.exact_signature_unique_b, CollectionSchema.exact_signature_copycount_i},
{CollectionSchema.fuzzy_signature_l, CollectionSchema.fuzzy_signature_unique_b, CollectionSchema.fuzzy_signature_copycount_i}}) {
{CollectionSchema.fuzzy_signature_l, CollectionSchema.fuzzy_signature_unique_b, CollectionSchema.fuzzy_signature_copycount_i}};
uniquecheck: for (CollectionSchema[] checkfields: doccheckschema) {
CollectionSchema signaturefield = checkfields[0];
CollectionSchema uniquefield = checkfields[1];
CollectionSchema countfield = checkfields[2];
@ -1396,6 +1394,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
}
}
con.addOperand(dnf);
con.addOperand(new Negation(new Literal(CollectionSchema.id, urlhash)));
con.addOperand(new Literal(CollectionSchema.host_id_s, hostid));
String query = con.toString();
SolrDocumentList docsAkk;
try {
@ -1403,11 +1403,9 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
CollectionSchema.id.getSolrFieldName(), CollectionSchema.exact_signature_l.getSolrFieldName(), CollectionSchema.fuzzy_signature_l.getSolrFieldName());
} catch (final IOException e) {
ConcurrentLog.logException(e);
docsAkk = new SolrDocumentList();
docsAkk = new SolrDocumentList();
}
uniquecheck: for (CollectionSchema[] checkfields: new CollectionSchema[][]{
{CollectionSchema.exact_signature_l, CollectionSchema.exact_signature_unique_b, CollectionSchema.exact_signature_copycount_i},
{CollectionSchema.fuzzy_signature_l, CollectionSchema.fuzzy_signature_unique_b, CollectionSchema.fuzzy_signature_copycount_i}}) {
if (docsAkk.getNumFound() > 0) uniquecheck: for (CollectionSchema[] checkfields: doccheckschema) {
CollectionSchema signaturefield = checkfields[0];
CollectionSchema uniquefield = checkfields[1];
CollectionSchema countfield = checkfields[2];
@ -1437,13 +1435,15 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
Integer httpstatus_i = this.contains(CollectionSchema.httpstatus_i) ? (Integer) sid.getFieldValue(CollectionSchema.httpstatus_i.getSolrFieldName()) : null;
String canonical_s = this.contains(CollectionSchema.canonical_s) ? (String) sid.getFieldValue(CollectionSchema.canonical_s.getSolrFieldName()) : null;
Boolean canonical_equal_sku_b = this.contains(CollectionSchema.canonical_equal_sku_b) ? (Boolean) sid.getFieldValue(CollectionSchema.canonical_equal_sku_b.getSolrFieldName()) : null;
CollectionSchema[][] metadatacheckschema = new CollectionSchema[][]{
{CollectionSchema.title, CollectionSchema.title_exact_signature_l, CollectionSchema.title_unique_b},
{CollectionSchema.description_txt, CollectionSchema.description_exact_signature_l, CollectionSchema.description_unique_b}};
if (segment.fulltext().getDefaultConfiguration().contains(CollectionSchema.host_id_s) &&
(robots_i == null || (robots_i.intValue() & (1 << 9)) == 0 /*noindex in http X-ROBOTS*/ && (robots_i.intValue() & (1 << 3)) == 0 /*noindex in html metas*/ ) &&
(canonical_s == null || canonical_s.length() == 0 || (canonical_equal_sku_b != null && canonical_equal_sku_b.booleanValue()) || url.toNormalform(true).equals(canonical_s)) &&
(httpstatus_i == null || httpstatus_i.intValue() == 200)) {
uniquecheck: for (CollectionSchema[] checkfields: new CollectionSchema[][] {
{CollectionSchema.title, CollectionSchema.title_exact_signature_l, CollectionSchema.title_unique_b},
{CollectionSchema.description_txt, CollectionSchema.description_exact_signature_l, CollectionSchema.description_unique_b}}) {
uniquecheck: for (CollectionSchema[] checkfields: metadatacheckschema) {
CollectionSchema checkfield = checkfields[0];
CollectionSchema signaturefield = checkfields[1];
CollectionSchema uniquefield = checkfields[2];