fix for smb crawl situation (lost too many urls)

Michael Peter Christen 2012-12-26 19:15:11 +01:00
parent d456f69381
commit 8f3bd0c387
4 changed files with 9 additions and 7 deletions


@@ -68,6 +68,7 @@ public class Balancer {
     private static final String indexSuffix = "A.db";
     private static final int EcoFSBufferSize = 1000;
     private static final int objectIndexBufferSize = 1000;
+    private static final int MAX_DOUBLE_PUSH_CHECK = 100000;
 
     // class variables filled with external values
     private final File cacheStacksPath;
@@ -274,7 +275,7 @@ public class Balancer {
         if (this.double_push_check.has(hash)) return "double occurrence in double_push_check";
         if (this.urlFileIndex.has(hash)) return "double occurrence in urlFileIndex";
-        if (this.double_push_check.size() > 10000 || MemoryControl.shortStatus()) this.double_push_check.clear();
+        if (this.double_push_check.size() > MAX_DOUBLE_PUSH_CHECK || MemoryControl.shortStatus()) this.double_push_check.clear();
         this.double_push_check.put(hash);
         // add to index
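
The two Balancer hunks belong together: the previously hard-coded cap of 10000 on the in-memory double-push check becomes a named constant, MAX_DOUBLE_PUSH_CHECK, raised to 100000. A minimal sketch of the pattern, assuming a plain HashSet in place of YaCy's handle set (BoundedSeenSet and checkAndAdd are illustrative names, not YaCy API):

    import java.util.HashSet;
    import java.util.Set;

    // Sketch only: a seen-set with a hard size cap that is wiped in one go
    // rather than evicted entry by entry. False negatives after a wipe are
    // tolerable here because the persistent urlFileIndex is checked as well.
    public class BoundedSeenSet {
        private static final int MAX_DOUBLE_PUSH_CHECK = 100000;
        private final Set<String> seen = new HashSet<>();

        public synchronized boolean checkAndAdd(final String hash) {
            if (this.seen.contains(hash)) return true; // double occurrence
            if (this.seen.size() > MAX_DOUBLE_PUSH_CHECK) this.seen.clear();
            this.seen.add(hash);
            return false;
        }
    }

Presumably the larger cap keeps the check meaningful across the large URL bursts an smb share produces, instead of the set being wiped every few thousand pushes.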


@@ -465,14 +465,14 @@ public final class CrawlStacker {
                     ((System.currentTimeMillis() - oldDate.getTime()) / 60000 / 60 / 24) + " days ago.");
         } else {
             if (dbocc == null) {
-                return "double in: LURL-DB";
+                return "double in: LURL-DB, oldDate = " + oldDate.toString();
             }
             if (this.log.isInfo()) this.log.logInfo("URL '" + urlstring + "' is double registered in '" + dbocc.toString() + "'. " + "Stack processing time:");
             if (dbocc == HarvestProcess.ERRORS) {
                 final ZURL.Entry errorEntry = this.nextQueue.errorURL.get(url.hash());
-                return "double in: errors (" + errorEntry.anycause() + ")";
+                return "double in: errors (" + errorEntry.anycause() + "), oldDate = " + oldDate.toString();
             }
-            return "double in: " + dbocc.toString();
+            return "double in: " + dbocc.toString() + ", oldDate = " + oldDate.toString();
         }
     }
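
The CrawlStacker change is purely diagnostic: every "double in:" rejection now also reports when the URL was last seen, so the crawler log shows not just that a URL was treated as a duplicate but how old the earlier occurrence is. A sketch of the resulting message format (RejectReason is an illustrative helper, not part of YaCy):

    import java.util.Date;

    // Illustrates the enriched rejection strings produced by the patch above.
    public class RejectReason {
        static String doubleIn(final String store, final Date oldDate) {
            return "double in: " + store + ", oldDate = " + oldDate.toString();
        }

        public static void main(final String[] args) {
            // prints e.g. "double in: LURL-DB, oldDate = Thu Jan 01 01:00:00 CET 1970"
            System.out.println(doubleIn("LURL-DB", new Date(0)));
        }
    }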


@@ -148,9 +148,9 @@ public class CrawlQueues {
         if (this.errorURL.exists(hash)) {
             return HarvestProcess.ERRORS;
         }
-        if (this.noticeURL.existsInStack(hash)) {
-            return HarvestProcess.CRAWLER;
-        }
+        //if (this.noticeURL.existsInStack(hash)) {
+        //    return HarvestProcess.CRAWLER;
+        //} // this is disabled because it prevents proper crawling of smb shares. The cause is unknown.
         for (final Loader worker: this.workers.values()) {
             if (Base64Order.enhancedCoder.equal(worker.request.url().hash(), hash)) {
                 return HarvestProcess.WORKER;
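
The CrawlQueues hunk looks like the core of the fix: a URL that was merely waiting on the notice stack was reported as already in process (HarvestProcess.CRAWLER) and apparently got rejected when stacked again, which swallowed smb URLs; the check is commented out rather than removed because, as the patch comment itself says, the cause is unknown. A simplified sketch of the lookup order that remains, using stand-in types rather than YaCy's (HarvestLookup, errorHashes and workerUrlsByHash are illustrative):

    import java.util.Map;
    import java.util.Set;
    import java.util.concurrent.ConcurrentHashMap;

    // Stand-in types to show the lookup order after the patch: error store
    // first, then the active workers; the notice-stack test in between is gone.
    public class HarvestLookup {
        enum HarvestProcess { ERRORS, CRAWLER, WORKER }

        private final Set<String> errorHashes = ConcurrentHashMap.newKeySet();
        private final Map<String, String> workerUrlsByHash = new ConcurrentHashMap<>();

        HarvestProcess urlExists(final String hash) {
            if (this.errorHashes.contains(hash)) return HarvestProcess.ERRORS;
            // noticeURL.existsInStack(hash) is deliberately no longer consulted
            if (this.workerUrlsByHash.containsKey(hash)) return HarvestProcess.WORKER;
            return null; // unknown: the URL may be stacked again
        }
    }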


@@ -235,6 +235,7 @@ public class RobotsTxt {
     }
 
     public void ensureExist(final MultiProtocolURI theURL, final Set<String> thisAgents, boolean concurrent) {
+        if (theURL.isLocal()) return;
         final String urlHostPort = getHostPort(theURL);
         if (urlHostPort == null) return;
         final BEncodedHeap robotsTable;
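
Finally, RobotsTxt.ensureExist() now returns immediately for local URLs: robots.txt is an HTTP convention, so maintaining robots table entries for smb:// and similar intranet URLs is wasted work at best. A sketch of the guard with java.net.URI as a stand-in for MultiProtocolURI (the scheme test below is a simplification; YaCy's isLocal() also considers intranet addresses):

    import java.net.URI;

    // Simplified guard: skip robots.txt handling for anything that is not a
    // public http(s) URL. smb://, ftp://, file:// never carry a robots.txt.
    public class RobotsGuard {
        static boolean isLocal(final URI url) {
            final String scheme = url.getScheme();
            return !"http".equals(scheme) && !"https".equals(scheme);
        }

        static void ensureExist(final URI url) {
            if (isLocal(url)) return; // the early exit added by this commit
            System.out.println("would maintain a robots.txt entry for " + url.getHost());
        }

        public static void main(final String[] args) {
            ensureExist(URI.create("smb://fileserver/share")); // skipped
            ensureExist(URI.create("http://example.org/"));    // processed
        }
    }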