mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
Fix for SMB crawl situation (too many URLs were lost)
This commit is contained in:
parent
d456f69381
commit
8f3bd0c387
|
@ -68,6 +68,7 @@ public class Balancer {
|
||||||
private static final String indexSuffix = "A.db";
|
private static final String indexSuffix = "A.db";
|
||||||
private static final int EcoFSBufferSize = 1000;
|
private static final int EcoFSBufferSize = 1000;
|
||||||
private static final int objectIndexBufferSize = 1000;
|
private static final int objectIndexBufferSize = 1000;
|
||||||
|
private static final int MAX_DOUBLE_PUSH_CHECK = 100000;
|
||||||
|
|
||||||
// class variables filled with external values
|
// class variables filled with external values
|
||||||
private final File cacheStacksPath;
|
private final File cacheStacksPath;
|
||||||
|
@ -274,7 +275,7 @@ public class Balancer {
|
||||||
if (this.double_push_check.has(hash)) return "double occurrence in double_push_check";
|
if (this.double_push_check.has(hash)) return "double occurrence in double_push_check";
|
||||||
if (this.urlFileIndex.has(hash)) return "double occurrence in urlFileIndex";
|
if (this.urlFileIndex.has(hash)) return "double occurrence in urlFileIndex";
|
||||||
|
|
||||||
if (this.double_push_check.size() > 10000 || MemoryControl.shortStatus()) this.double_push_check.clear();
|
if (this.double_push_check.size() > MAX_DOUBLE_PUSH_CHECK || MemoryControl.shortStatus()) this.double_push_check.clear();
|
||||||
this.double_push_check.put(hash);
|
this.double_push_check.put(hash);
|
||||||
|
|
||||||
// add to index
|
// add to index
|
||||||
|
|
|
@ -465,14 +465,14 @@ public final class CrawlStacker {
|
||||||
((System.currentTimeMillis() - oldDate.getTime()) / 60000 / 60 / 24) + " days ago.");
|
((System.currentTimeMillis() - oldDate.getTime()) / 60000 / 60 / 24) + " days ago.");
|
||||||
} else {
|
} else {
|
||||||
if (dbocc == null) {
|
if (dbocc == null) {
|
||||||
return "double in: LURL-DB";
|
return "double in: LURL-DB, oldDate = " + oldDate.toString();
|
||||||
}
|
}
|
||||||
if (this.log.isInfo()) this.log.logInfo("URL '" + urlstring + "' is double registered in '" + dbocc.toString() + "'. " + "Stack processing time:");
|
if (this.log.isInfo()) this.log.logInfo("URL '" + urlstring + "' is double registered in '" + dbocc.toString() + "'. " + "Stack processing time:");
|
||||||
if (dbocc == HarvestProcess.ERRORS) {
|
if (dbocc == HarvestProcess.ERRORS) {
|
||||||
final ZURL.Entry errorEntry = this.nextQueue.errorURL.get(url.hash());
|
final ZURL.Entry errorEntry = this.nextQueue.errorURL.get(url.hash());
|
||||||
return "double in: errors (" + errorEntry.anycause() + ")";
|
return "double in: errors (" + errorEntry.anycause() + "), oldDate = " + oldDate.toString();
|
||||||
}
|
}
|
||||||
return "double in: " + dbocc.toString();
|
return "double in: " + dbocc.toString() + ", oldDate = " + oldDate.toString();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -148,9 +148,9 @@ public class CrawlQueues {
|
||||||
if (this.errorURL.exists(hash)) {
|
if (this.errorURL.exists(hash)) {
|
||||||
return HarvestProcess.ERRORS;
|
return HarvestProcess.ERRORS;
|
||||||
}
|
}
|
||||||
if (this.noticeURL.existsInStack(hash)) {
|
//if (this.noticeURL.existsInStack(hash)) {
|
||||||
return HarvestProcess.CRAWLER;
|
// return HarvestProcess.CRAWLER;
|
||||||
}
|
//} // this is disabled because it prevents proper crawling of smb shares. The cause is unknown
|
||||||
for (final Loader worker: this.workers.values()) {
|
for (final Loader worker: this.workers.values()) {
|
||||||
if (Base64Order.enhancedCoder.equal(worker.request.url().hash(), hash)) {
|
if (Base64Order.enhancedCoder.equal(worker.request.url().hash(), hash)) {
|
||||||
return HarvestProcess.WORKER;
|
return HarvestProcess.WORKER;
|
||||||
|
|
|
@ -235,6 +235,7 @@ public class RobotsTxt {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void ensureExist(final MultiProtocolURI theURL, final Set<String> thisAgents, boolean concurrent) {
|
public void ensureExist(final MultiProtocolURI theURL, final Set<String> thisAgents, boolean concurrent) {
|
||||||
|
if (theURL.isLocal()) return;
|
||||||
final String urlHostPort = getHostPort(theURL);
|
final String urlHostPort = getHostPort(theURL);
|
||||||
if (urlHostPort == null) return;
|
if (urlHostPort == null) return;
|
||||||
final BEncodedHeap robotsTable;
|
final BEncodedHeap robotsTable;
|
||||||
|
|
Loading…
Reference in New Issue
Block a user