mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
enhanced postprocessing: fixed bugs, enable proper postprocessing also
without the harvestingkey, remove crawl profiles after postprocessing, speed-up for clickdepth computation.
This commit is contained in:
parent
299f51cb7f
commit
74d0256e93
|
@ -244,7 +244,7 @@ public class SchemaConfiguration extends Configuration implements Serializable {
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean contains(SchemaDeclaration field) {
|
public boolean contains(SchemaDeclaration field) {
|
||||||
return this.contains(field.name());
|
return this.contains(field.getSolrFieldName());
|
||||||
}
|
}
|
||||||
|
|
||||||
public void add(final SolrInputDocument doc, final SchemaDeclaration key, final String value) {
|
public void add(final SolrInputDocument doc, final SchemaDeclaration key, final String value) {
|
||||||
|
|
|
@ -555,19 +555,25 @@ public final class CrawlSwitchboard {
|
||||||
return hasDoneSomething;
|
return hasDoneSomething;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public Set<String> getActiveProfiles() {
|
||||||
|
// find all profiles that are candidates for deletion
|
||||||
|
Set<String> profileKeys = new HashSet<String>();
|
||||||
|
for (final byte[] handle: this.getActive()) {
|
||||||
|
CrawlProfile entry;
|
||||||
|
entry = new CrawlProfile(this.getActive(handle));
|
||||||
|
if (!CrawlSwitchboard.DEFAULT_PROFILES.contains(entry.name())) {
|
||||||
|
profileKeys.add(ASCII.String(handle));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return profileKeys;
|
||||||
|
}
|
||||||
|
|
||||||
public Set<String> getFinishesProfiles(CrawlQueues crawlQueues) {
|
public Set<String> getFinishesProfiles(CrawlQueues crawlQueues) {
|
||||||
// clear the counter cache
|
// clear the counter cache
|
||||||
this.profilesActiveCrawlsCounter.clear();
|
this.profilesActiveCrawlsCounter.clear();
|
||||||
|
|
||||||
// find all profiles that are candidates for deletion
|
// find all profiles that are candidates for deletion
|
||||||
Set<String> deletionCandidate = new HashSet<String>();
|
Set<String> deletionCandidate = getActiveProfiles();
|
||||||
for (final byte[] handle: this.getActive()) {
|
|
||||||
CrawlProfile entry;
|
|
||||||
entry = new CrawlProfile(this.getActive(handle));
|
|
||||||
if (!CrawlSwitchboard.DEFAULT_PROFILES.contains(entry.name())) {
|
|
||||||
deletionCandidate.add(ASCII.String(handle));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (deletionCandidate.size() == 0) return new HashSet<String>(0);
|
if (deletionCandidate.size() == 0) return new HashSet<String>(0);
|
||||||
|
|
||||||
// iterate through all the queues and see if one of these handles appear there
|
// iterate through all the queues and see if one of these handles appear there
|
||||||
|
@ -602,6 +608,13 @@ public final class CrawlSwitchboard {
|
||||||
return deletionCandidate;
|
return deletionCandidate;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public boolean allCrawlsFinished(CrawlQueues crawlQueues) {
|
||||||
|
if (!crawlQueues.noticeURL.isEmpty()) return false;
|
||||||
|
// look into the CrawlQueues.worker as well
|
||||||
|
if (switchboard.crawlQueues.activeWorkerEntries().length > 0) return false;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
public void cleanProfiles(Set<String> deletionCandidate) {
|
public void cleanProfiles(Set<String> deletionCandidate) {
|
||||||
// all entries that are left are candidates for deletion; do that now
|
// all entries that are left are candidates for deletion; do that now
|
||||||
for (String h: deletionCandidate) {
|
for (String h: deletionCandidate) {
|
||||||
|
|
|
@ -59,6 +59,7 @@ import java.util.ArrayList;
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
import java.util.Date;
|
import java.util.Date;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
|
import java.util.HashSet;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
import java.util.LinkedHashSet;
|
import java.util.LinkedHashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
@ -2136,27 +2137,6 @@ public final class Switchboard extends serverSwitch {
|
||||||
ResultURLs.clearStack(origin);
|
ResultURLs.clearStack(origin);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// clean up profiles
|
|
||||||
checkInterruption();
|
|
||||||
|
|
||||||
if (!this.crawlJobIsPaused(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL)) {
|
|
||||||
Set<String> deletionCandidates = this.crawler.getFinishesProfiles(this.crawlQueues);
|
|
||||||
int cleanup = deletionCandidates.size();
|
|
||||||
if (cleanup > 0) {
|
|
||||||
// run postprocessing on these profiles
|
|
||||||
postprocessingRunning = true;
|
|
||||||
int proccount = 0;
|
|
||||||
for (String profileHash: deletionCandidates) {
|
|
||||||
proccount += index.fulltext().getDefaultConfiguration().postprocessing(index, profileHash);
|
|
||||||
proccount += index.fulltext().getWebgraphConfiguration().postprocessing(index, profileHash);
|
|
||||||
}
|
|
||||||
postprocessingRunning = false;
|
|
||||||
|
|
||||||
this.crawler.cleanProfiles(deletionCandidates);
|
|
||||||
log.info("cleanup removed " + cleanup + " crawl profiles, post-processed " + proccount + " documents");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// clean up news
|
// clean up news
|
||||||
checkInterruption();
|
checkInterruption();
|
||||||
|
@ -2289,37 +2269,67 @@ public final class Switchboard extends serverSwitch {
|
||||||
JenaTripleStore.saveAll();
|
JenaTripleStore.saveAll();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// clean up profiles
|
||||||
|
checkInterruption();
|
||||||
|
|
||||||
// if no crawl is running and processing is activated:
|
// if no crawl is running and processing is activated:
|
||||||
// execute the (post-) processing steps for all entries that have a process tag assigned
|
// execute the (post-) processing steps for all entries that have a process tag assigned
|
||||||
if (this.crawlQueues.coreCrawlJobSize() == 0) {
|
if (!this.crawlJobIsPaused(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL)) {
|
||||||
if (this.crawlQueues.noticeURL.isEmpty()) {
|
|
||||||
Domains.clear();
|
|
||||||
this.crawlQueues.noticeURL.clear(); // flushes more caches
|
|
||||||
}
|
|
||||||
postprocessingRunning = true;
|
|
||||||
int proccount = 0;
|
int proccount = 0;
|
||||||
proccount += index.fulltext().getDefaultConfiguration().postprocessing(index, null);
|
if (index.fulltext().getDefaultConfiguration().contains(CollectionSchema.harvestkey_s.getSolrFieldName())) {
|
||||||
proccount += index.fulltext().getWebgraphConfiguration().postprocessing(index, null);
|
Set<String> deletionCandidates = this.crawler.getFinishesProfiles(this.crawlQueues);
|
||||||
long idleSearch = System.currentTimeMillis() - this.localSearchLastAccess;
|
int cleanup = deletionCandidates.size();
|
||||||
long idleAdmin = System.currentTimeMillis() - this.adminAuthenticationLastAccess;
|
if (cleanup > 0) {
|
||||||
long deltaOptimize = System.currentTimeMillis() - this.optimizeLastRun;
|
// run postprocessing on these profiles
|
||||||
boolean optimizeRequired = deltaOptimize > 60000 * 60 * 3; // 3 hours
|
postprocessingRunning = true;
|
||||||
int opts = Math.max(1, (int) (index.fulltext().collectionSize() / 5000000));
|
for (String profileHash: deletionCandidates) {
|
||||||
|
proccount += index.fulltext().getDefaultConfiguration().postprocessing(index, profileHash);
|
||||||
log.info("Solr auto-optimization: idleSearch=" + idleSearch + ", idleAdmin=" + idleAdmin + ", deltaOptimize=" + deltaOptimize + ", proccount=" + proccount);
|
proccount += index.fulltext().getWebgraphConfiguration().postprocessing(index, profileHash);
|
||||||
if (idleAdmin > 600000) {
|
}
|
||||||
// only run optimization if the admin is idle (10 minutes)
|
|
||||||
if (proccount > 0) {
|
this.crawler.cleanProfiles(deletionCandidates);
|
||||||
opts++; // have postprocessings will force optimazion with one more Segment which is small an quick
|
log.info("cleanup removed " + cleanup + " crawl profiles, post-processed " + proccount + " documents");
|
||||||
optimizeRequired = true;
|
|
||||||
}
|
}
|
||||||
if (optimizeRequired) {
|
} else {
|
||||||
if (idleSearch < 600000) opts++; // < 10 minutes idle time will cause a optimization with one more Segment which is small an quick
|
if (this.crawler.allCrawlsFinished(this.crawlQueues)) {
|
||||||
log.info("Solr auto-optimization: running solr.optimize(" + opts + ")");
|
// run postprocessing on all profiles
|
||||||
index.fulltext().optimize(opts);
|
postprocessingRunning = true;
|
||||||
this.optimizeLastRun = System.currentTimeMillis();
|
proccount += index.fulltext().getDefaultConfiguration().postprocessing(index, null);
|
||||||
|
proccount += index.fulltext().getWebgraphConfiguration().postprocessing(index, null);
|
||||||
|
|
||||||
|
this.crawler.cleanProfiles(this.crawler.getActiveProfiles());
|
||||||
|
log.info("cleanup post-processed " + proccount + " documents");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (this.crawler.allCrawlsFinished(this.crawlQueues)) {
|
||||||
|
// flush caches
|
||||||
|
Domains.clear();
|
||||||
|
this.crawlQueues.noticeURL.clear();
|
||||||
|
|
||||||
|
// do solr optimization
|
||||||
|
long idleSearch = System.currentTimeMillis() - this.localSearchLastAccess;
|
||||||
|
long idleAdmin = System.currentTimeMillis() - this.adminAuthenticationLastAccess;
|
||||||
|
long deltaOptimize = System.currentTimeMillis() - this.optimizeLastRun;
|
||||||
|
boolean optimizeRequired = deltaOptimize > 60000 * 60 * 3; // 3 hours
|
||||||
|
int opts = Math.max(1, (int) (index.fulltext().collectionSize() / 5000000));
|
||||||
|
|
||||||
|
log.info("Solr auto-optimization: idleSearch=" + idleSearch + ", idleAdmin=" + idleAdmin + ", deltaOptimize=" + deltaOptimize + ", proccount=" + proccount);
|
||||||
|
if (idleAdmin > 600000) {
|
||||||
|
// only run optimization if the admin is idle (10 minutes)
|
||||||
|
if (proccount > 0) {
|
||||||
|
opts++; // have postprocessings will force optimazion with one more Segment which is small an quick
|
||||||
|
optimizeRequired = true;
|
||||||
|
}
|
||||||
|
if (optimizeRequired) {
|
||||||
|
if (idleSearch < 600000) opts++; // < 10 minutes idle time will cause a optimization with one more Segment which is small an quick
|
||||||
|
log.info("Solr auto-optimization: running solr.optimize(" + opts + ")");
|
||||||
|
index.fulltext().optimize(opts);
|
||||||
|
this.optimizeLastRun = System.currentTimeMillis();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
postprocessingRunning = false;
|
postprocessingRunning = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -230,8 +230,8 @@ public class Segment {
|
||||||
final byte[] hosthash = new byte[6]; // the host of the url to be checked
|
final byte[] hosthash = new byte[6]; // the host of the url to be checked
|
||||||
System.arraycopy(searchhash, 6, hosthash, 0, 6);
|
System.arraycopy(searchhash, 6, hosthash, 0, 6);
|
||||||
|
|
||||||
long timeout = System.currentTimeMillis() + 10000;
|
long timeout = System.currentTimeMillis() + 1000;
|
||||||
for (int maxdepth = 0; maxdepth < 10 && System.currentTimeMillis() < timeout; maxdepth++) {
|
mainloop: for (int maxdepth = 0; maxdepth < 6 && System.currentTimeMillis() < timeout; maxdepth++) {
|
||||||
|
|
||||||
RowHandleSet checknext = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 100);
|
RowHandleSet checknext = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 100);
|
||||||
|
|
||||||
|
@ -247,12 +247,12 @@ public class Segment {
|
||||||
if (ref == null) continue nextloop;
|
if (ref == null) continue nextloop;
|
||||||
byte[] u = ref.urlhash();
|
byte[] u = ref.urlhash();
|
||||||
|
|
||||||
// check ignore
|
|
||||||
if (ignore.has(u)) continue nextloop;
|
|
||||||
|
|
||||||
// check if this is from the same host
|
// check if this is from the same host
|
||||||
if (!ByteBuffer.equals(u, 6, hosthash, 0, 6)) continue nextloop;
|
if (!ByteBuffer.equals(u, 6, hosthash, 0, 6)) continue nextloop;
|
||||||
|
|
||||||
|
// check ignore
|
||||||
|
if (ignore.has(u)) continue nextloop;
|
||||||
|
|
||||||
// check if the url is a root url
|
// check if the url is a root url
|
||||||
if (rootCandidates.has(u)) {
|
if (rootCandidates.has(u)) {
|
||||||
return leveldepth + 1;
|
return leveldepth + 1;
|
||||||
|
@ -262,10 +262,10 @@ public class Segment {
|
||||||
try {checknext.put(u);} catch (final SpaceExceededException e) {}
|
try {checknext.put(u);} catch (final SpaceExceededException e) {}
|
||||||
try {ignore.put(u);} catch (final SpaceExceededException e) {}
|
try {ignore.put(u);} catch (final SpaceExceededException e) {}
|
||||||
}
|
}
|
||||||
|
if (System.currentTimeMillis() > timeout) break mainloop;
|
||||||
}
|
}
|
||||||
leveldepth++;
|
leveldepth++;
|
||||||
levelhashes = checknext;
|
levelhashes = checknext;
|
||||||
|
|
||||||
}
|
}
|
||||||
return 999;
|
return 999;
|
||||||
}
|
}
|
||||||
|
|
|
@ -895,10 +895,9 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
||||||
ReversibleScoreMap<String> hostscore = null;
|
ReversibleScoreMap<String> hostscore = null;
|
||||||
try {
|
try {
|
||||||
// collect hosts from index which shall take part in citation computation
|
// collect hosts from index which shall take part in citation computation
|
||||||
hostscore = collectionConnector.getFacets(
|
String query = (harvestkey == null || !segment.fulltext().getDefaultConfiguration().contains(CollectionSchema.harvestkey_s) ? "" : CollectionSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ") +
|
||||||
(harvestkey == null ? "" : CollectionSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ") +
|
CollectionSchema.process_sxt.getSolrFieldName() + ":" + ProcessType.CITATION.toString();
|
||||||
CollectionSchema.process_sxt.getSolrFieldName() + ":" + ProcessType.CITATION.toString(),
|
hostscore = collectionConnector.getFacets(query, 10000000, CollectionSchema.host_s.getSolrFieldName()).get(CollectionSchema.host_s.getSolrFieldName());
|
||||||
10000000, CollectionSchema.host_s.getSolrFieldName()).get(CollectionSchema.host_s.getSolrFieldName());
|
|
||||||
if (hostscore == null) hostscore = new ClusteredScoreMap<String>();
|
if (hostscore == null) hostscore = new ClusteredScoreMap<String>();
|
||||||
|
|
||||||
for (String host: hostscore.keyList(true)) {
|
for (String host: hostscore.keyList(true)) {
|
||||||
|
@ -906,9 +905,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
||||||
// This shall fulfill the following requirement:
|
// This shall fulfill the following requirement:
|
||||||
// If a document A links to B and B contains a 'canonical C', then the citation rank coputation shall consider that A links to C and B does not link to C.
|
// If a document A links to B and B contains a 'canonical C', then the citation rank coputation shall consider that A links to C and B does not link to C.
|
||||||
// To do so, we first must collect all canonical links, find all references to them, get the anchor list of the documents and patch the citation reference of these links
|
// To do so, we first must collect all canonical links, find all references to them, get the anchor list of the documents and patch the citation reference of these links
|
||||||
BlockingQueue<SolrDocument> documents_with_canonical_tag = collectionConnector.concurrentDocumentsByQuery(
|
String patchquery = CollectionSchema.host_s.getSolrFieldName() + ":" + host + " AND " + CollectionSchema.canonical_s.getSolrFieldName() + ":[* TO *]";
|
||||||
CollectionSchema.host_s.getSolrFieldName() + ":" + host + " AND " + CollectionSchema.canonical_s.getSolrFieldName() + ":[* TO *]",
|
BlockingQueue<SolrDocument> documents_with_canonical_tag = collectionConnector.concurrentDocumentsByQuery(patchquery, 0, 10000000, 60000L, 50,
|
||||||
0, 10000000, 60000L, 50,
|
|
||||||
CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName(), CollectionSchema.canonical_s.getSolrFieldName());
|
CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName(), CollectionSchema.canonical_s.getSolrFieldName());
|
||||||
SolrDocument doc_B;
|
SolrDocument doc_B;
|
||||||
try {
|
try {
|
||||||
|
|
|
@ -302,10 +302,9 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
|
||||||
// that means we must search for those entries.
|
// that means we must search for those entries.
|
||||||
connector.commit(true); // make sure that we have latest information that can be found
|
connector.commit(true); // make sure that we have latest information that can be found
|
||||||
//BlockingQueue<SolrDocument> docs = index.fulltext().getSolr().concurrentQuery("*:*", 0, 1000, 60000, 10);
|
//BlockingQueue<SolrDocument> docs = index.fulltext().getSolr().concurrentQuery("*:*", 0, 1000, 60000, 10);
|
||||||
BlockingQueue<SolrDocument> docs = connector.concurrentDocumentsByQuery(
|
String query = (harvestkey == null || !this.contains(WebgraphSchema.harvestkey_s) ? "" : WebgraphSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ") +
|
||||||
(harvestkey == null ? "" : CollectionSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ") +
|
WebgraphSchema.process_sxt.getSolrFieldName() + ":[* TO *]";
|
||||||
WebgraphSchema.process_sxt.getSolrFieldName() + ":[* TO *]",
|
BlockingQueue<SolrDocument> docs = connector.concurrentDocumentsByQuery(query, 0, 100000, 60000, 50);
|
||||||
0, 100000, 60000, 50);
|
|
||||||
|
|
||||||
SolrDocument doc;
|
SolrDocument doc;
|
||||||
String protocol, urlstub, id;
|
String protocol, urlstub, id;
|
||||||
|
@ -341,9 +340,10 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
|
||||||
|
|
||||||
// all processing steps checked, remove the processing tag
|
// all processing steps checked, remove the processing tag
|
||||||
sid.removeField(WebgraphSchema.process_sxt.getSolrFieldName());
|
sid.removeField(WebgraphSchema.process_sxt.getSolrFieldName());
|
||||||
sid.removeField(WebgraphSchema.harvestkey_s.getSolrFieldName());
|
if (this.contains(WebgraphSchema.harvestkey_s)) sid.removeField(WebgraphSchema.harvestkey_s.getSolrFieldName());
|
||||||
|
|
||||||
// send back to index
|
// send back to index
|
||||||
|
connector.deleteById((String) doc.getFieldValue(WebgraphSchema.id.getSolrFieldName()));
|
||||||
connector.add(sid);
|
connector.add(sid);
|
||||||
proccount++;
|
proccount++;
|
||||||
} catch (final Throwable e1) {
|
} catch (final Throwable e1) {
|
||||||
|
|
Loading…
Reference in New Issue
Block a user