better balancing and duetime-computation also for no-delay intranet

hosts
This commit is contained in:
Michael Peter Christen 2012-10-30 11:28:49 +01:00
parent c326aa8f67
commit 0833937c1c
7 changed files with 39 additions and 62 deletions

View File

@ -287,8 +287,8 @@ public class PerformanceQueues_p {
}
if ((post != null) && (post.containsKey("minimumDeltaSubmit"))) {
final long minimumLocalDelta = post.getLong("minimumLocalDelta", sb.crawlQueues.noticeURL.getMinimumLocalDelta());
final long minimumGlobalDelta = post.getLong("minimumGlobalDelta", sb.crawlQueues.noticeURL.getMinimumGlobalDelta());
final int minimumLocalDelta = post.getInt("minimumLocalDelta", sb.crawlQueues.noticeURL.getMinimumLocalDelta());
final int minimumGlobalDelta = post.getInt("minimumGlobalDelta", sb.crawlQueues.noticeURL.getMinimumGlobalDelta());
sb.setConfig("minimumLocalDelta", minimumLocalDelta);
sb.setConfig("minimumGlobalDelta", minimumGlobalDelta);
sb.crawlQueues.noticeURL.setMinimumDelta(minimumLocalDelta, minimumGlobalDelta);

View File

@ -70,8 +70,8 @@ public class Balancer {
// class variables filled with external values
private final File cacheStacksPath;
private long minimumLocalDelta;
private long minimumGlobalDelta;
private int minimumLocalDelta;
private int minimumGlobalDelta;
private final Set<String> myAgentIDs;
private BufferedObjectIndex urlFileIndex;
@ -86,8 +86,8 @@ public class Balancer {
public Balancer(
final File cachePath,
final String stackname,
final long minimumLocalDelta,
final long minimumGlobalDelta,
final int minimumLocalDelta,
final int minimumGlobalDelta,
final Set<String> myAgentIDs,
final boolean useTailCache,
final boolean exceed134217727) {
@ -118,15 +118,15 @@ public class Balancer {
Log.logInfo("Balancer", "opened balancer file with " + this.urlFileIndex.size() + " entries from " + f.toString());
}
public long getMinimumLocalDelta() {
public int getMinimumLocalDelta() {
return this.minimumLocalDelta;
}
public long getMinimumGlobalDelta() {
public int getMinimumGlobalDelta() {
return this.minimumGlobalDelta;
}
public void setMinimumDelta(final long minimumLocalDelta, final long minimumGlobalDelta) {
public void setMinimumDelta(final int minimumLocalDelta, final int minimumGlobalDelta) {
this.minimumLocalDelta = minimumLocalDelta;
this.minimumGlobalDelta = minimumGlobalDelta;
}
@ -289,7 +289,7 @@ public class Balancer {
Map<String, Integer[]> map = new TreeMap<String, Integer[]>(); // we use a tree map to get a stable ordering
for (Map.Entry<String, HandleSet> entry: this.domainStacks.entrySet()) {
int size = entry.getValue().size();
int delta = (int) Latency.waitingRemainingGuessed(entry.getKey(), this.minimumLocalDelta, this.minimumGlobalDelta);
int delta = Latency.waitingRemainingGuessed(entry.getKey(), this.minimumLocalDelta, this.minimumGlobalDelta);
map.put(entry.getKey(), new Integer[]{size, delta});
}
return map;
@ -297,7 +297,7 @@ public class Balancer {
/**
* Get the minimum sleep time for a given url. The result can also be negative to reflect the time since the last access
* The time can be as low as Long.MIN_VALUE to show that there should not be any limitation at all.
* The time can be as low as Integer.MIN_VALUE to show that there should not be any limitation at all.
* @param robots
* @param profileEntry
* @param crawlURL
@ -616,7 +616,7 @@ public class Balancer {
break;
}
count++;
if (!this.domainStacks.isEmpty() && count > 120 * this.domainStacks.size()) break;
if (this.domainStacks.size() >= 100 || (!this.domainStacks.isEmpty() && count > 600 * this.domainStacks.size())) break;
}
Log.logInfo("BALANCER", "re-fill of domain stacks; fileIndex.size() = " + this.urlFileIndex.size() + ", domainStacks.size = " + this.domainStacks.size() + ", collection time = " + (System.currentTimeMillis() - this.lastDomainStackFill) + " ms");
this.domStackInitSize = this.domainStacks.size();

View File

@ -97,8 +97,8 @@ public class Latency {
* @param thisAgents
* @return the waiting time in milliseconds; 0 if not known; -1 if host gives us special rights
*/
public static long waitingRobots(final MultiProtocolURI url, final RobotsTxt robots, final Set<String> thisAgents) {
long robotsDelay = 0;
public static int waitingRobots(final MultiProtocolURI url, final RobotsTxt robots, final Set<String> thisAgents) {
int robotsDelay = 0;
RobotsTxtEntry robotsEntry = robots.getEntry(url, thisAgents);
robotsDelay = (robotsEntry == null) ? 0 : robotsEntry.getCrawlDelayMillis();
if (robotsEntry != null && robotsDelay == 0 && robotsEntry.getAgentName() != null) return -1; // no limits if granted exclusively for this peer
@ -115,7 +115,7 @@ public class Latency {
* @return the remaining waiting time in milliseconds. The return value may be negative
* which expresses how long the time is over the minimum waiting time.
*/
public static long waitingRemainingGuessed(final String hostname, final long minimumLocalDelta, final long minimumGlobalDelta) {
public static int waitingRemainingGuessed(final String hostname, final int minimumLocalDelta, final int minimumGlobalDelta) {
if (hostname == null) return Integer.MIN_VALUE;
// first check if the domain was _ever_ accessed before
@ -123,15 +123,13 @@ public class Latency {
if (host == null) return Integer.MIN_VALUE; // no delay if host is new; use Integer because there is a cast to int somewhere
// find the minimum waiting time based on the network domain (local or global)
final boolean local = Domains.isLocal(hostname, null);
if (local) return minimumLocalDelta;
long waiting = minimumGlobalDelta;
int waiting = (Domains.isLocal(hostname, null)) ? minimumLocalDelta : minimumGlobalDelta;
// if we have accessed the domain many times, get slower (the flux factor)
waiting += host.flux(waiting);
// the time since last access to the domain is the basis of the remaining calculation
final long timeSinceLastAccess = System.currentTimeMillis() - host.lastacc();
final int timeSinceLastAccess = (int) (System.currentTimeMillis() - host.lastacc());
// use the access latency as rule how fast we can access the server
// this applies also to localhost, but differently, because it is not necessary to
@ -153,16 +151,14 @@ public class Latency {
* @param minimumGlobalDelta
* @return the remaining waiting time in milliseconds. can be negative to reflect the due-time after a possible next loading time
*/
public static long waitingRemaining(final MultiProtocolURI url, final RobotsTxt robots, final Set<String> thisAgents, final long minimumLocalDelta, final long minimumGlobalDelta) {
public static int waitingRemaining(final MultiProtocolURI url, final RobotsTxt robots, final Set<String> thisAgents, final int minimumLocalDelta, final int minimumGlobalDelta) {
// first check if the domain was _ever_ accessed before
final Host host = host(url);
if (host == null) return Integer.MIN_VALUE; // no delay if host is new; use Integer because there is a cast to int somewhere
// find the minimum waiting time based on the network domain (local or global)
final boolean local = url.isLocal();
if (local) return minimumLocalDelta;
long waiting = minimumGlobalDelta;
int waiting = (url.isLocal()) ? minimumLocalDelta : minimumGlobalDelta;
// for CGI accesses, we double the minimum time
// mostly there is a database access in the background
@ -178,10 +174,10 @@ public class Latency {
waiting = Math.max(waiting, host.average() * 2);
// the time since last access to the domain is the basis of the remaining calculation
final long timeSinceLastAccess = System.currentTimeMillis() - host.lastacc();
final int timeSinceLastAccess = (int) (System.currentTimeMillis() - host.lastacc());
// find the delay as given by robots.txt on target site
long robotsDelay = waitingRobots(url, robots, thisAgents);
int robotsDelay = waitingRobots(url, robots, thisAgents);
if (robotsDelay < 0) return -timeSinceLastAccess; // no limits if granted exclusively for this peer
waiting = Math.max(waiting, robotsDelay);
@ -189,18 +185,16 @@ public class Latency {
}
public static String waitingRemainingExplain(final MultiProtocolURI url, final RobotsTxt robots, final Set<String> thisAgents, final long minimumLocalDelta, final long minimumGlobalDelta) {
public static String waitingRemainingExplain(final MultiProtocolURI url, final RobotsTxt robots, final Set<String> thisAgents, final int minimumLocalDelta, final int minimumGlobalDelta) {
// first check if the domain was _ever_ accessed before
final Host host = host(url);
if (host == null) return "host " + host + " never accessed before -> Long.MIN_VALUE"; // no delay if host is new
if (host == null) return "host " + host + " never accessed before -> Integer.MIN_VALUE"; // no delay if host is new
final StringBuilder s = new StringBuilder(50);
// find the minimum waiting time based on the network domain (local or global)
final boolean local = url.isLocal();
if (local) return "local host -> minimum local: " + minimumLocalDelta;
long waiting = minimumGlobalDelta;
int waiting = (url.isLocal()) ? minimumLocalDelta : minimumGlobalDelta;
s.append("minimumDelta = ").append(waiting);
// for CGI accesses, we double the minimum time
@ -209,7 +203,7 @@ public class Latency {
if (url.isCGI()) { waiting = waiting * 2; s.append(", isCGI = true -> double"); }
// if we have accessed the domain many times, get slower (the flux factor)
long flux = host.flux(waiting);
int flux = host.flux(waiting);
waiting += flux;
s.append(", flux = ").append(flux);
@ -220,7 +214,7 @@ public class Latency {
waiting = Math.max(waiting, host.average() * 2);
// find the delay as given by robots.txt on target site
long robotsDelay = waitingRobots(url, robots, thisAgents);
int robotsDelay = waitingRobots(url, robots, thisAgents);
if (robotsDelay < 0) return "no waiting for exclusive granted peer"; // no limits if granted exclusively for this peer
waiting = Math.max(waiting, robotsDelay);
@ -273,7 +267,7 @@ public class Latency {
public long robotsDelay() {
return this.robotsMinDelay;
}
public long flux(final long range) {
public int flux(final int range) {
return this.count >= 10000 ? range * Math.min(5000, this.count) / 10000 : range / (10000 - this.count);
}
}

View File

@ -51,8 +51,8 @@ public class NoticedURL {
LOCAL, GLOBAL, OVERHANG, REMOTE, NOLOAD;
}
private static final long minimumLocalDeltaInit = 10; // the minimum time difference between access of the same local domain
public static final long minimumGlobalDeltaInit = 500; // the minimum time difference between access of the same global domain
private static final int minimumLocalDeltaInit = 10; // the minimum time difference between access of the same local domain
public static final int minimumGlobalDeltaInit = 500; // the minimum time difference between access of the same global domain
private Balancer coreStack; // links found by crawling to depth-1
private Balancer limitStack; // links found by crawling at target depth
@ -72,15 +72,15 @@ public class NoticedURL {
this.noloadStack = new Balancer(cachePath, "urlNoticeNoLoadStack", minimumLocalDeltaInit, minimumGlobalDeltaInit, myAgentIDs, useTailCache, exceed134217727);
}
public long getMinimumLocalDelta() {
public int getMinimumLocalDelta() {
return this.coreStack.getMinimumLocalDelta();
}
public long getMinimumGlobalDelta() {
public int getMinimumGlobalDelta() {
return this.coreStack.getMinimumGlobalDelta();
}
public void setMinimumDelta(final long minimumLocalDelta, final long minimumGlobalDelta) {
public void setMinimumDelta(final int minimumLocalDelta, final int minimumGlobalDelta) {
this.coreStack.setMinimumDelta(minimumLocalDelta, minimumGlobalDelta);
this.limitStack.setMinimumDelta(minimumLocalDelta, minimumGlobalDelta);
this.remoteStack.setMinimumDelta(minimumLocalDelta, minimumGlobalDelta);

View File

@ -205,14 +205,14 @@ public class RobotsTxtEntry {
return null;
}
public long getCrawlDelayMillis() {
public int getCrawlDelayMillis() {
if (this.mem.containsKey(CRAWL_DELAY_MILLIS)) try {
return ByteArray.parseDecimal(this.mem.get(CRAWL_DELAY_MILLIS));
return (int) ByteArray.parseDecimal(this.mem.get(CRAWL_DELAY_MILLIS));
} catch (final NumberFormatException e) {
return 0;
}
if (this.mem.containsKey(CRAWL_DELAY)) try {
return 1000 * ByteArray.parseDecimal(this.mem.get(CRAWL_DELAY));
return 1000 * ((int) ByteArray.parseDecimal(this.mem.get(CRAWL_DELAY)));
} catch (final NumberFormatException e) {
return 0;
}

View File

@ -727,8 +727,8 @@ public final class Switchboard extends serverSwitch
OAIListFriendsLoader.init(this.loader, oaiFriends);
this.crawlQueues = new CrawlQueues(this, this.queuesRoot);
this.crawlQueues.noticeURL.setMinimumDelta(
getConfigLong("minimumLocalDelta", this.crawlQueues.noticeURL.getMinimumLocalDelta()),
getConfigLong("minimumGlobalDelta", this.crawlQueues.noticeURL.getMinimumGlobalDelta()));
getConfigInt("minimumLocalDelta", this.crawlQueues.noticeURL.getMinimumLocalDelta()),
getConfigInt("minimumGlobalDelta", this.crawlQueues.noticeURL.getMinimumGlobalDelta()));
/*
* Creating sync objects and loading status for the crawl jobs

View File

@ -92,11 +92,7 @@ public class serverObjects extends HashMap<String, String> implements Cloneable
super();
}
public serverObjects(final int initialCapacity) {
super(initialCapacity);
}
public serverObjects(final Map<String, String> input) {
protected serverObjects(final Map<String, String> input) {
super(input);
}
@ -219,10 +215,6 @@ public class serverObjects extends HashMap<String, String> implements Cloneable
return put(key, toJSON(value));
}
public String putJSON(final String key, final StringBuilder value) {
return put(key, toJSON(value.toString()));
}
public static String toJSON(String value) {
// value = value.replaceAll("\\", "\\\\");
value = patternDoublequote.matcher(value).replaceAll("'");
@ -235,10 +227,6 @@ public class serverObjects extends HashMap<String, String> implements Cloneable
return value;
}
public String putJSON(final String key, final byte[] value) {
return putJSON(key, UTF8.String(value));
}
/**
* Add a String to the map. The content of the String is escaped to be usable in HTML output.
* @param key key name as String.
@ -386,11 +374,6 @@ public class serverObjects extends HashMap<String, String> implements Cloneable
return s.equals("true") || s.equals("on") || s.equals("1");
}
public boolean hasValue(final String key) {
final String s = super.get(key);
return (s != null && !s.isEmpty());
}
// returns a set of all values whose key matches the keyMapper
public String[] getAll(final String keyMapper) {
// the keyMapper may contain regular expressions as defined in String.matches