fix for smb crawl situation (lost too many urls)

Michael Peter Christen 2012-12-26 19:15:11 +01:00
parent d456f69381
commit 8f3bd0c387
4 changed files with 9 additions and 7 deletions

Balancer.java

@@ -68,6 +68,7 @@ public class Balancer {
     private static final String indexSuffix = "A.db";
     private static final int EcoFSBufferSize = 1000;
     private static final int objectIndexBufferSize = 1000;
+    private static final int MAX_DOUBLE_PUSH_CHECK = 100000;
 
     // class variables filled with external values
     private final File cacheStacksPath;
@@ -274,7 +275,7 @@ public class Balancer {
         if (this.double_push_check.has(hash)) return "double occurrence in double_push_check";
         if (this.urlFileIndex.has(hash)) return "double occurrence in urlFileIndex";
-        if (this.double_push_check.size() > 10000 || MemoryControl.shortStatus()) this.double_push_check.clear();
+        if (this.double_push_check.size() > MAX_DOUBLE_PUSH_CHECK || MemoryControl.shortStatus()) this.double_push_check.clear();
         this.double_push_check.put(hash);
 
         // add to index
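
Note: the Balancer hunks replace the hard-coded flush threshold of 10000 with the named constant MAX_DOUBLE_PUSH_CHECK = 100000, so the double_push_check set is flushed ten times less often and de-duplication covers much larger crawls, presumably relevant for smb shares with many files. A minimal sketch of the pattern, not YaCy code; the class and method names below are made up for illustration:

    import java.util.HashSet;
    import java.util.Set;

    // Size-capped "already pushed" set, in the style of Balancer.push():
    // once the cap is exceeded the whole set is cleared, trading a few
    // missed duplicate checks against unbounded memory growth.
    public class CappedSeenSet {
        private static final int MAX_DOUBLE_PUSH_CHECK = 100000;
        private final Set<String> seen = new HashSet<String>();

        // returns true if the hash was recorded before (a double push)
        public synchronized boolean checkAndRecord(final String hash) {
            if (this.seen.contains(hash)) return true;
            if (this.seen.size() > MAX_DOUBLE_PUSH_CHECK) this.seen.clear();
            this.seen.add(hash);
            return false;
        }
    }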

CrawlStacker.java

@@ -465,14 +465,14 @@ public final class CrawlStacker {
                         ((System.currentTimeMillis() - oldDate.getTime()) / 60000 / 60 / 24) + " days ago.");
             } else {
                 if (dbocc == null) {
-                    return "double in: LURL-DB";
+                    return "double in: LURL-DB, oldDate = " + oldDate.toString();
                 }
                 if (this.log.isInfo()) this.log.logInfo("URL '" + urlstring + "' is double registered in '" + dbocc.toString() + "'. " + "Stack processing time:");
                 if (dbocc == HarvestProcess.ERRORS) {
                     final ZURL.Entry errorEntry = this.nextQueue.errorURL.get(url.hash());
-                    return "double in: errors (" + errorEntry.anycause() + ")";
+                    return "double in: errors (" + errorEntry.anycause() + "), oldDate = " + oldDate.toString();
                 }
-                return "double in: " + dbocc.toString();
+                return "double in: " + dbocc.toString() + ", oldDate = " + oldDate.toString();
             }
         }
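
Note: the CrawlStacker changes are purely diagnostic. Every "double in:" rejection reason now carries the date of the earlier occurrence, so one can see how stale the supposed duplicate is. A rejection reason would now read, for example (the date is illustrative):

    double in: LURL-DB, oldDate = Wed Dec 26 19:15:11 CET 2012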

CrawlQueues.java

@@ -148,9 +148,9 @@ public class CrawlQueues {
         if (this.errorURL.exists(hash)) {
             return HarvestProcess.ERRORS;
         }
-        if (this.noticeURL.existsInStack(hash)) {
-            return HarvestProcess.CRAWLER;
-        }
+        //if (this.noticeURL.existsInStack(hash)) {
+        //    return HarvestProcess.CRAWLER;
+        //} // this is disabled because it prevents proper crawling of smb shares. The cause is unknown
         for (final Loader worker: this.workers.values()) {
             if (Base64Order.enhancedCoder.equal(worker.request.url().hash(), hash)) {
                 return HarvestProcess.WORKER;
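
Note: this appears to be the actual fix for the lost URLs named in the commit message. The disabled check made urlExists() report any URL already on the notice-URL stack as HarvestProcess.CRAWLER, which in turn caused the stacker to reject it as a double (see the "double in:" branches above); for smb shares this apparently dropped far too many URLs, for reasons the in-code comment itself leaves open. With the check gone, genuine duplicates are still caught by the double_push_check and urlFileIndex tests in Balancer.push() shown in the first hunk.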

RobotsTxt.java

@@ -235,6 +235,7 @@ public class RobotsTxt {
     }
 
     public void ensureExist(final MultiProtocolURI theURL, final Set<String> thisAgents, boolean concurrent) {
+        if (theURL.isLocal()) return;
         final String urlHostPort = getHostPort(theURL);
         if (urlHostPort == null) return;
         final BEncodedHeap robotsTable;
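
Note: the RobotsTxt guard makes ensureExist() return immediately for local URLs. Local and intranet resources such as smb:// shares serve no robots.txt, so there is nothing to fetch or cache for them; the early return presumably also keeps local host/port keys out of the robots table.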