mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
- added more logging to balancer
- changed balancer logic slightly

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6350 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
parent
3c4064932c
commit
2e6bdce086
|
@ -55,6 +55,7 @@ public class Balancer {
|
|||
private long minimumLocalDelta;
|
||||
private long minimumGlobalDelta;
|
||||
private long lastDomainStackFill;
|
||||
private int domStackInitSize;
|
||||
|
||||
public Balancer(
|
||||
final File cachePath,
|
||||
|
@ -69,6 +70,7 @@ public class Balancer {
|
|||
this.delayed = new TreeMap<Long, String>();
|
||||
this.minimumLocalDelta = minimumLocalDelta;
|
||||
this.minimumGlobalDelta = minimumGlobalDelta;
|
||||
this.domStackInitSize = Integer.MAX_VALUE;
|
||||
|
||||
// create a stack for newly entered entries
|
||||
if (!(cachePath.exists())) cachePath.mkdir(); // make the path
|
||||
|
@ -272,7 +274,6 @@ public class Balancer {
|
|||
}
|
||||
|
||||
private String nextFromDelayed() {
|
||||
if (this.delayed.size() == 0) return null;
|
||||
if (this.delayed.size() == 0) return null;
|
||||
Long first = this.delayed.firstKey();
|
||||
if (first.longValue() < System.currentTimeMillis()) {
|
||||
|
@ -281,6 +282,12 @@ public class Balancer {
|
|||
return null;
|
||||
}
|
||||
|
||||
private String anyFromDelayed() {
|
||||
if (this.delayed.size() == 0) return null;
|
||||
Long first = this.delayed.firstKey();
|
||||
return this.delayed.remove(first);
|
||||
}
|
||||
|
||||
/**
|
||||
* get the next entry in this crawl queue in such a way that the domain access time delta is maximized
|
||||
* and always above the given minimum delay time. An additional delay time is computed using the robots.txt
|
||||
|
@ -304,7 +311,14 @@ public class Balancer {
|
|||
filltop(delay, -2000, false);
|
||||
filltop(delay, -1000, false);
|
||||
filltop(delay, -500, false);
|
||||
filltop(delay, 0, true);
|
||||
filltop(delay, 0, true);
|
||||
filltop(delay, 500, true);
|
||||
filltop(delay, 1000, true);
|
||||
filltop(delay, 2000, true);
|
||||
filltop(delay, 3000, true);
|
||||
filltop(delay, 4000, true);
|
||||
filltop(delay, 6000, true);
|
||||
filltop(delay, Long.MAX_VALUE, true);
|
||||
|
||||
long sleeptime = 0;
|
||||
Request crawlEntry = null;
|
||||
|
@ -318,6 +332,9 @@ public class Balancer {
|
|||
nexthash = top.remove();
|
||||
//System.out.println("*** top.remove()=" + nexthash);
|
||||
}
|
||||
if (nexthash == null) {
|
||||
nexthash = anyFromDelayed();
|
||||
}
|
||||
|
||||
// check minimumDelta and if necessary force a sleep
|
||||
//final int s = urlFileIndex.size();
|
||||
|
@ -358,10 +375,9 @@ public class Balancer {
|
|||
assert nexthash.equals(new String(rowEntry.getPrimaryKeyBytes())) : "result = " + nexthash + ", rowEntry.getPrimaryKeyBytes() = " + new String(rowEntry.getPrimaryKeyBytes());
|
||||
assert nexthash.equals(crawlEntry.url().hash()) : "result = " + nexthash + ", crawlEntry.url().hash() = " + crawlEntry.url().hash();
|
||||
|
||||
if (this.domainStacks.size() <= 1) break;
|
||||
if (failhash != null && failhash.equals(nexthash)) break; // prevent endless loops
|
||||
|
||||
if (delay && sleeptime > 0) {
|
||||
if (delay && sleeptime > 0 && this.domStackInitSize > 1) {
|
||||
//System.out.println("*** putback: nexthash=" + nexthash + ", failhash="+failhash);
|
||||
// put that thing back to omit a delay here
|
||||
if (!delayed.values().contains(nexthash)) {
|
||||
|
@ -383,7 +399,7 @@ public class Balancer {
|
|||
// in the best case, this should never happen if the balancer works properly
|
||||
// this is only a protection against the worst case, where the crawler could
|
||||
// behave in a DoS-manner
|
||||
Log.logInfo("BALANCER", "forcing crawl-delay of " + sleeptime + " milliseconds for " + crawlEntry.url().getHost() + ((sleeptime > Math.max(minimumLocalDelta, minimumGlobalDelta)) ? " (forced latency)" : ""));
|
||||
Log.logInfo("BALANCER", "forcing crawl-delay of " + sleeptime + " milliseconds for " + crawlEntry.url().getHost() + ": " + Latency.waitingRemainingExplain(crawlEntry.url(), minimumLocalDelta, minimumGlobalDelta) + ", top.size() = " + top.size() + ", delayed.size() = " + delayed.size() + ", domainStacks.size() = " + domainStacks.size() + ", domainStacksInitSize = " + this.domStackInitSize);
|
||||
long loops = sleeptime / 3000;
|
||||
long rest = sleeptime % 3000;
|
||||
if (loops < 2) {
|
||||
|
@ -395,7 +411,6 @@ public class Balancer {
|
|||
Log.logInfo("BALANCER", "waiting for " + crawlEntry.url().getHost() + ": " + ((loops - i) * 3) + " seconds remaining...");
|
||||
try {synchronized(this) { this.wait(3000); }} catch (final InterruptedException e) {}
|
||||
}
|
||||
if (sleeptime > 3000 && this.domainStacks.size() > 1) this.domainStacks.remove(crawlEntry.url().hash().substring(6));
|
||||
}
|
||||
Latency.update(crawlEntry.url().hash().substring(6), crawlEntry.url().getHost());
|
||||
return crawlEntry;
|
||||
|
@ -420,6 +435,8 @@ public class Balancer {
|
|||
String besthash = null;
|
||||
while (i.hasNext()) {
|
||||
entry = i.next();
|
||||
|
||||
// clean up empty entries
|
||||
if (entry.getValue().size() == 0) {
|
||||
i.remove();
|
||||
continue;
|
||||
|
@ -450,7 +467,7 @@ public class Balancer {
|
|||
}
|
||||
|
||||
private void fillDomainStacks(int maxdomstacksize) throws IOException {
|
||||
if (this.domainStacks.size() > 0 && System.currentTimeMillis() - lastDomainStackFill < 200000L) return;
|
||||
if (this.domainStacks.size() > 0 && System.currentTimeMillis() - lastDomainStackFill < 120000L) return;
|
||||
this.domainStacks.clear();
|
||||
//synchronized (this.delayed) { delayed.clear(); }
|
||||
this.lastDomainStackFill = System.currentTimeMillis();
|
||||
|
@ -459,6 +476,8 @@ public class Balancer {
|
|||
pushHashToDomainStacks(new String(i.next()), 50);
|
||||
if (this.domainStacks.size() > maxdomstacksize) break;
|
||||
}
|
||||
Log.logInfo("BALANCER", "re-fill of domain stacks; fileIndex.size() = " + this.urlFileIndex.size() + ", domainStacks.size = " + domainStacks.size() + ", collection time = " + (System.currentTimeMillis() - this.lastDomainStackFill) + " ms");
|
||||
this.domStackInitSize = this.domainStacks.size();
|
||||
}
|
||||
|
||||
public ArrayList<Request> top(int count) {
|
||||
|
|
|
@ -212,7 +212,7 @@ public class CrawlQueues {
|
|||
|
||||
String queueCheck = crawlIsPossible(NoticedURL.STACK_TYPE_CORE, "Core");
|
||||
if (queueCheck != null) {
|
||||
if (log.isFine()) log.logFine("omitting de-queue/local: " + queueCheck);
|
||||
if (log.isFinest()) log.logFine("omitting de-queue/local: " + queueCheck);
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
|
@ -111,7 +111,7 @@ public class Latency {
|
|||
public static long waitingRemainingGuessed(String hosthash, final long minimumLocalDelta, final long minimumGlobalDelta) {
|
||||
assert hosthash.length() == 12 || hosthash.length() == 6;
|
||||
Host host = Latency.host((hosthash.length() == 6) ? hosthash : hosthash.substring(6));
|
||||
if (host == null) return 0;
|
||||
if (host == null) return Long.MIN_VALUE;
|
||||
|
||||
// the time since last access to the domain is the basis of the remaining calculation
|
||||
final long timeSinceLastAccess = System.currentTimeMillis() - host.lastacc();
|
||||
|
@ -150,14 +150,14 @@ public class Latency {
|
|||
*/
|
||||
public static long waitingRemaining(yacyURL url, final long minimumLocalDelta, final long minimumGlobalDelta) {
|
||||
|
||||
// find the minimum waiting time based on the network domain (local or global)
|
||||
final boolean local = url.isLocal();
|
||||
long waiting = (local) ? minimumLocalDelta : minimumGlobalDelta;
|
||||
|
||||
// first check if the domain was _ever_ accessed before
|
||||
String hosthash = url.hash().substring(6);
|
||||
Host host = host(hosthash);
|
||||
if (host == null) return 0; // no delay if host is new
|
||||
if (host == null) return Long.MIN_VALUE; // no delay if host is new
|
||||
|
||||
// find the minimum waiting time based on the network domain (local or global)
|
||||
final boolean local = url.isLocal();
|
||||
long waiting = (local) ? minimumLocalDelta : minimumGlobalDelta;
|
||||
|
||||
// the time since last access to the domain is the basis of the remaining calculation
|
||||
final long timeSinceLastAccess = (host == null) ? 0 : System.currentTimeMillis() - host.lastacc();
|
||||
|
@ -187,6 +187,45 @@ public class Latency {
|
|||
return Math.max(0, waiting - timeSinceLastAccess);
|
||||
}
|
||||
|
||||
|
||||
public static String waitingRemainingExplain(yacyURL url, final long minimumLocalDelta, final long minimumGlobalDelta) {
|
||||
|
||||
// first check if the domain was _ever_ accessed before
|
||||
String hosthash = url.hash().substring(6);
|
||||
Host host = host(hosthash);
|
||||
if (host == null) return "host " + hosthash + "/" + url.getHost() + " never accessed before -> 0"; // no delay if host is new
|
||||
|
||||
StringBuilder s = new StringBuilder(50);
|
||||
|
||||
// find the minimum waiting time based on the network domain (local or global)
|
||||
final boolean local = url.isLocal();
|
||||
long waiting = (local) ? minimumLocalDelta : minimumGlobalDelta;
|
||||
s.append("minimumDelta = ").append(waiting);
|
||||
|
||||
// the time since last access to the domain is the basis of the remaining calculation
|
||||
final long timeSinceLastAccess = (host == null) ? 0 : System.currentTimeMillis() - host.lastacc();
|
||||
s.append(", timeSinceLastAccess = ").append(timeSinceLastAccess);
|
||||
|
||||
// for CGI accesses, we double the minimum time
|
||||
// mostly there is a database access in the background
|
||||
// which creates a lot of unwanted IO on target site
|
||||
if (url.isCGI()) s.append(", isCGI = true -> double");
|
||||
|
||||
// if we have accessed the domain many times, get slower (the flux factor)
|
||||
if (!local && host != null) s.append(", flux = ").append(host.flux(waiting));
|
||||
|
||||
// find the delay as given by robots.txt on target site
|
||||
long robotsDelay = (local) ? 0 : Switchboard.getSwitchboard().robots.crawlDelayMillis(url);
|
||||
s.append(", robots.delay = ").append(robotsDelay);
|
||||
|
||||
// use the access latency as rule how fast we can access the server
|
||||
// this applies also to localhost, but differently, because it is not necessary to
|
||||
// consider so many external accesses
|
||||
if (host != null) s.append(", host.average = ").append(host.average());
|
||||
|
||||
return s.toString();
|
||||
}
|
||||
|
||||
public static final class Host {
|
||||
private long timeacc;
|
||||
private long lastacc;
|
||||
|
|
Loading…
Reference in New Issue
Block a user