mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
better balancing and due-time computation also for no-delay intranet
hosts
This commit is contained in:
parent
c326aa8f67
commit
0833937c1c
|
@ -287,8 +287,8 @@ public class PerformanceQueues_p {
|
|||
}
|
||||
|
||||
if ((post != null) && (post.containsKey("minimumDeltaSubmit"))) {
|
||||
final long minimumLocalDelta = post.getLong("minimumLocalDelta", sb.crawlQueues.noticeURL.getMinimumLocalDelta());
|
||||
final long minimumGlobalDelta = post.getLong("minimumGlobalDelta", sb.crawlQueues.noticeURL.getMinimumGlobalDelta());
|
||||
final int minimumLocalDelta = post.getInt("minimumLocalDelta", sb.crawlQueues.noticeURL.getMinimumLocalDelta());
|
||||
final int minimumGlobalDelta = post.getInt("minimumGlobalDelta", sb.crawlQueues.noticeURL.getMinimumGlobalDelta());
|
||||
sb.setConfig("minimumLocalDelta", minimumLocalDelta);
|
||||
sb.setConfig("minimumGlobalDelta", minimumGlobalDelta);
|
||||
sb.crawlQueues.noticeURL.setMinimumDelta(minimumLocalDelta, minimumGlobalDelta);
|
||||
|
|
|
@ -70,8 +70,8 @@ public class Balancer {
|
|||
|
||||
// class variables filled with external values
|
||||
private final File cacheStacksPath;
|
||||
private long minimumLocalDelta;
|
||||
private long minimumGlobalDelta;
|
||||
private int minimumLocalDelta;
|
||||
private int minimumGlobalDelta;
|
||||
private final Set<String> myAgentIDs;
|
||||
private BufferedObjectIndex urlFileIndex;
|
||||
|
||||
|
@ -86,8 +86,8 @@ public class Balancer {
|
|||
public Balancer(
|
||||
final File cachePath,
|
||||
final String stackname,
|
||||
final long minimumLocalDelta,
|
||||
final long minimumGlobalDelta,
|
||||
final int minimumLocalDelta,
|
||||
final int minimumGlobalDelta,
|
||||
final Set<String> myAgentIDs,
|
||||
final boolean useTailCache,
|
||||
final boolean exceed134217727) {
|
||||
|
@ -118,15 +118,15 @@ public class Balancer {
|
|||
Log.logInfo("Balancer", "opened balancer file with " + this.urlFileIndex.size() + " entries from " + f.toString());
|
||||
}
|
||||
|
||||
public long getMinimumLocalDelta() {
|
||||
public int getMinimumLocalDelta() {
|
||||
return this.minimumLocalDelta;
|
||||
}
|
||||
|
||||
public long getMinimumGlobalDelta() {
|
||||
public int getMinimumGlobalDelta() {
|
||||
return this.minimumGlobalDelta;
|
||||
}
|
||||
|
||||
public void setMinimumDelta(final long minimumLocalDelta, final long minimumGlobalDelta) {
|
||||
public void setMinimumDelta(final int minimumLocalDelta, final int minimumGlobalDelta) {
|
||||
this.minimumLocalDelta = minimumLocalDelta;
|
||||
this.minimumGlobalDelta = minimumGlobalDelta;
|
||||
}
|
||||
|
@ -289,7 +289,7 @@ public class Balancer {
|
|||
Map<String, Integer[]> map = new TreeMap<String, Integer[]>(); // we use a tree map to get a stable ordering
|
||||
for (Map.Entry<String, HandleSet> entry: this.domainStacks.entrySet()) {
|
||||
int size = entry.getValue().size();
|
||||
int delta = (int) Latency.waitingRemainingGuessed(entry.getKey(), this.minimumLocalDelta, this.minimumGlobalDelta);
|
||||
int delta = Latency.waitingRemainingGuessed(entry.getKey(), this.minimumLocalDelta, this.minimumGlobalDelta);
|
||||
map.put(entry.getKey(), new Integer[]{size, delta});
|
||||
}
|
||||
return map;
|
||||
|
@ -297,7 +297,7 @@ public class Balancer {
|
|||
|
||||
/**
|
||||
* Get the minimum sleep time for a given url. The result can also be negative to reflect the time since the last access
|
||||
* The time can be as low as Long.MIN_VALUE to show that there should not be any limitation at all.
|
||||
* The time can be as low as Integer.MIN_VALUE to show that there should not be any limitation at all.
|
||||
* @param robots
|
||||
* @param profileEntry
|
||||
* @param crawlURL
|
||||
|
@ -616,7 +616,7 @@ public class Balancer {
|
|||
break;
|
||||
}
|
||||
count++;
|
||||
if (!this.domainStacks.isEmpty() && count > 120 * this.domainStacks.size()) break;
|
||||
if (this.domainStacks.size() >= 100 || (!this.domainStacks.isEmpty() && count > 600 * this.domainStacks.size())) break;
|
||||
}
|
||||
Log.logInfo("BALANCER", "re-fill of domain stacks; fileIndex.size() = " + this.urlFileIndex.size() + ", domainStacks.size = " + this.domainStacks.size() + ", collection time = " + (System.currentTimeMillis() - this.lastDomainStackFill) + " ms");
|
||||
this.domStackInitSize = this.domainStacks.size();
|
||||
|
|
|
@ -97,8 +97,8 @@ public class Latency {
|
|||
* @param thisAgents
|
||||
* @return the waiting time in milliseconds; 0 if not known; -1 if host gives us special rights
|
||||
*/
|
||||
public static long waitingRobots(final MultiProtocolURI url, final RobotsTxt robots, final Set<String> thisAgents) {
|
||||
long robotsDelay = 0;
|
||||
public static int waitingRobots(final MultiProtocolURI url, final RobotsTxt robots, final Set<String> thisAgents) {
|
||||
int robotsDelay = 0;
|
||||
RobotsTxtEntry robotsEntry = robots.getEntry(url, thisAgents);
|
||||
robotsDelay = (robotsEntry == null) ? 0 : robotsEntry.getCrawlDelayMillis();
|
||||
if (robotsEntry != null && robotsDelay == 0 && robotsEntry.getAgentName() != null) return -1; // no limits if granted exclusively for this peer
|
||||
|
@ -115,7 +115,7 @@ public class Latency {
|
|||
* @return the remaining waiting time in milliseconds. The return value may be negative
|
||||
* which expresses how long the time is over the minimum waiting time.
|
||||
*/
|
||||
public static long waitingRemainingGuessed(final String hostname, final long minimumLocalDelta, final long minimumGlobalDelta) {
|
||||
public static int waitingRemainingGuessed(final String hostname, final int minimumLocalDelta, final int minimumGlobalDelta) {
|
||||
if (hostname == null) return Integer.MIN_VALUE;
|
||||
|
||||
// first check if the domain was _ever_ accessed before
|
||||
|
@ -123,15 +123,13 @@ public class Latency {
|
|||
if (host == null) return Integer.MIN_VALUE; // no delay if host is new; use Integer because there is a cast to int somewhere
|
||||
|
||||
// find the minimum waiting time based on the network domain (local or global)
|
||||
final boolean local = Domains.isLocal(hostname, null);
|
||||
if (local) return minimumLocalDelta;
|
||||
long waiting = minimumGlobalDelta;
|
||||
|
||||
int waiting = (Domains.isLocal(hostname, null)) ? minimumLocalDelta : minimumGlobalDelta;
|
||||
|
||||
// if we have accessed the domain many times, get slower (the flux factor)
|
||||
waiting += host.flux(waiting);
|
||||
|
||||
// the time since last access to the domain is the basis of the remaining calculation
|
||||
final long timeSinceLastAccess = System.currentTimeMillis() - host.lastacc();
|
||||
final int timeSinceLastAccess = (int) (System.currentTimeMillis() - host.lastacc());
|
||||
|
||||
// use the access latency as rule how fast we can access the server
|
||||
// this applies also to localhost, but differently, because it is not necessary to
|
||||
|
@ -153,16 +151,14 @@ public class Latency {
|
|||
* @param minimumGlobalDelta
|
||||
* @return the remaining waiting time in milliseconds. can be negative to reflect the due-time after a possible next loading time
|
||||
*/
|
||||
public static long waitingRemaining(final MultiProtocolURI url, final RobotsTxt robots, final Set<String> thisAgents, final long minimumLocalDelta, final long minimumGlobalDelta) {
|
||||
public static int waitingRemaining(final MultiProtocolURI url, final RobotsTxt robots, final Set<String> thisAgents, final int minimumLocalDelta, final int minimumGlobalDelta) {
|
||||
|
||||
// first check if the domain was _ever_ accessed before
|
||||
final Host host = host(url);
|
||||
if (host == null) return Integer.MIN_VALUE; // no delay if host is new; use Integer because there is a cast to int somewhere
|
||||
|
||||
// find the minimum waiting time based on the network domain (local or global)
|
||||
final boolean local = url.isLocal();
|
||||
if (local) return minimumLocalDelta;
|
||||
long waiting = minimumGlobalDelta;
|
||||
int waiting = (url.isLocal()) ? minimumLocalDelta : minimumGlobalDelta;
|
||||
|
||||
// for CGI accesses, we double the minimum time
|
||||
// mostly there is a database access in the background
|
||||
|
@ -178,10 +174,10 @@ public class Latency {
|
|||
waiting = Math.max(waiting, host.average() * 2);
|
||||
|
||||
// the time since last access to the domain is the basis of the remaining calculation
|
||||
final long timeSinceLastAccess = System.currentTimeMillis() - host.lastacc();
|
||||
final int timeSinceLastAccess = (int) (System.currentTimeMillis() - host.lastacc());
|
||||
|
||||
// find the delay as given by robots.txt on target site
|
||||
long robotsDelay = waitingRobots(url, robots, thisAgents);
|
||||
int robotsDelay = waitingRobots(url, robots, thisAgents);
|
||||
if (robotsDelay < 0) return -timeSinceLastAccess; // no limits if granted exclusively for this peer
|
||||
|
||||
waiting = Math.max(waiting, robotsDelay);
|
||||
|
@ -189,18 +185,16 @@ public class Latency {
|
|||
}
|
||||
|
||||
|
||||
public static String waitingRemainingExplain(final MultiProtocolURI url, final RobotsTxt robots, final Set<String> thisAgents, final long minimumLocalDelta, final long minimumGlobalDelta) {
|
||||
public static String waitingRemainingExplain(final MultiProtocolURI url, final RobotsTxt robots, final Set<String> thisAgents, final int minimumLocalDelta, final int minimumGlobalDelta) {
|
||||
|
||||
// first check if the domain was _ever_ accessed before
|
||||
final Host host = host(url);
|
||||
if (host == null) return "host " + host + " never accessed before -> Long.MIN_VALUE"; // no delay if host is new
|
||||
if (host == null) return "host " + host + " never accessed before -> Integer.MIN_VALUE"; // no delay if host is new
|
||||
|
||||
final StringBuilder s = new StringBuilder(50);
|
||||
|
||||
// find the minimum waiting time based on the network domain (local or global)
|
||||
final boolean local = url.isLocal();
|
||||
if (local) return "local host -> minimum local: " + minimumLocalDelta;
|
||||
long waiting = minimumGlobalDelta;
|
||||
int waiting = (url.isLocal()) ? minimumLocalDelta : minimumGlobalDelta;
|
||||
s.append("minimumDelta = ").append(waiting);
|
||||
|
||||
// for CGI accesses, we double the minimum time
|
||||
|
@ -209,7 +203,7 @@ public class Latency {
|
|||
if (url.isCGI()) { waiting = waiting * 2; s.append(", isCGI = true -> double"); }
|
||||
|
||||
// if we have accessed the domain many times, get slower (the flux factor)
|
||||
long flux = host.flux(waiting);
|
||||
int flux = host.flux(waiting);
|
||||
waiting += flux;
|
||||
s.append(", flux = ").append(flux);
|
||||
|
||||
|
@ -220,7 +214,7 @@ public class Latency {
|
|||
waiting = Math.max(waiting, host.average() * 2);
|
||||
|
||||
// find the delay as given by robots.txt on target site
|
||||
long robotsDelay = waitingRobots(url, robots, thisAgents);
|
||||
int robotsDelay = waitingRobots(url, robots, thisAgents);
|
||||
if (robotsDelay < 0) return "no waiting for exclusive granted peer"; // no limits if granted exclusively for this peer
|
||||
|
||||
waiting = Math.max(waiting, robotsDelay);
|
||||
|
@ -273,7 +267,7 @@ public class Latency {
|
|||
public long robotsDelay() {
|
||||
return this.robotsMinDelay;
|
||||
}
|
||||
public long flux(final long range) {
|
||||
public int flux(final int range) {
|
||||
return this.count >= 10000 ? range * Math.min(5000, this.count) / 10000 : range / (10000 - this.count);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -51,8 +51,8 @@ public class NoticedURL {
|
|||
LOCAL, GLOBAL, OVERHANG, REMOTE, NOLOAD;
|
||||
}
|
||||
|
||||
private static final long minimumLocalDeltaInit = 10; // the minimum time difference between access of the same local domain
|
||||
public static final long minimumGlobalDeltaInit = 500; // the minimum time difference between access of the same global domain
|
||||
private static final int minimumLocalDeltaInit = 10; // the minimum time difference between access of the same local domain
|
||||
public static final int minimumGlobalDeltaInit = 500; // the minimum time difference between access of the same global domain
|
||||
|
||||
private Balancer coreStack; // links found by crawling to depth-1
|
||||
private Balancer limitStack; // links found by crawling at target depth
|
||||
|
@ -72,15 +72,15 @@ public class NoticedURL {
|
|||
this.noloadStack = new Balancer(cachePath, "urlNoticeNoLoadStack", minimumLocalDeltaInit, minimumGlobalDeltaInit, myAgentIDs, useTailCache, exceed134217727);
|
||||
}
|
||||
|
||||
public long getMinimumLocalDelta() {
|
||||
public int getMinimumLocalDelta() {
|
||||
return this.coreStack.getMinimumLocalDelta();
|
||||
}
|
||||
|
||||
public long getMinimumGlobalDelta() {
|
||||
public int getMinimumGlobalDelta() {
|
||||
return this.coreStack.getMinimumGlobalDelta();
|
||||
}
|
||||
|
||||
public void setMinimumDelta(final long minimumLocalDelta, final long minimumGlobalDelta) {
|
||||
public void setMinimumDelta(final int minimumLocalDelta, final int minimumGlobalDelta) {
|
||||
this.coreStack.setMinimumDelta(minimumLocalDelta, minimumGlobalDelta);
|
||||
this.limitStack.setMinimumDelta(minimumLocalDelta, minimumGlobalDelta);
|
||||
this.remoteStack.setMinimumDelta(minimumLocalDelta, minimumGlobalDelta);
|
||||
|
|
|
@ -205,14 +205,14 @@ public class RobotsTxtEntry {
|
|||
return null;
|
||||
}
|
||||
|
||||
public long getCrawlDelayMillis() {
|
||||
public int getCrawlDelayMillis() {
|
||||
if (this.mem.containsKey(CRAWL_DELAY_MILLIS)) try {
|
||||
return ByteArray.parseDecimal(this.mem.get(CRAWL_DELAY_MILLIS));
|
||||
return (int) ByteArray.parseDecimal(this.mem.get(CRAWL_DELAY_MILLIS));
|
||||
} catch (final NumberFormatException e) {
|
||||
return 0;
|
||||
}
|
||||
if (this.mem.containsKey(CRAWL_DELAY)) try {
|
||||
return 1000 * ByteArray.parseDecimal(this.mem.get(CRAWL_DELAY));
|
||||
return 1000 * ((int) ByteArray.parseDecimal(this.mem.get(CRAWL_DELAY)));
|
||||
} catch (final NumberFormatException e) {
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -727,8 +727,8 @@ public final class Switchboard extends serverSwitch
|
|||
OAIListFriendsLoader.init(this.loader, oaiFriends);
|
||||
this.crawlQueues = new CrawlQueues(this, this.queuesRoot);
|
||||
this.crawlQueues.noticeURL.setMinimumDelta(
|
||||
getConfigLong("minimumLocalDelta", this.crawlQueues.noticeURL.getMinimumLocalDelta()),
|
||||
getConfigLong("minimumGlobalDelta", this.crawlQueues.noticeURL.getMinimumGlobalDelta()));
|
||||
getConfigInt("minimumLocalDelta", this.crawlQueues.noticeURL.getMinimumLocalDelta()),
|
||||
getConfigInt("minimumGlobalDelta", this.crawlQueues.noticeURL.getMinimumGlobalDelta()));
|
||||
|
||||
/*
|
||||
* Creating sync objects and loading status for the crawl jobs
|
||||
|
|
|
@ -92,11 +92,7 @@ public class serverObjects extends HashMap<String, String> implements Cloneable
|
|||
super();
|
||||
}
|
||||
|
||||
public serverObjects(final int initialCapacity) {
|
||||
super(initialCapacity);
|
||||
}
|
||||
|
||||
public serverObjects(final Map<String, String> input) {
|
||||
protected serverObjects(final Map<String, String> input) {
|
||||
super(input);
|
||||
}
|
||||
|
||||
|
@ -219,10 +215,6 @@ public class serverObjects extends HashMap<String, String> implements Cloneable
|
|||
return put(key, toJSON(value));
|
||||
}
|
||||
|
||||
public String putJSON(final String key, final StringBuilder value) {
|
||||
return put(key, toJSON(value.toString()));
|
||||
}
|
||||
|
||||
public static String toJSON(String value) {
|
||||
// value = value.replaceAll("\\", "\\\\");
|
||||
value = patternDoublequote.matcher(value).replaceAll("'");
|
||||
|
@ -235,10 +227,6 @@ public class serverObjects extends HashMap<String, String> implements Cloneable
|
|||
return value;
|
||||
}
|
||||
|
||||
public String putJSON(final String key, final byte[] value) {
|
||||
return putJSON(key, UTF8.String(value));
|
||||
}
|
||||
|
||||
/**
|
||||
* Add a String to the map. The content of the String is escaped to be usable in HTML output.
|
||||
* @param key key name as String.
|
||||
|
@ -386,11 +374,6 @@ public class serverObjects extends HashMap<String, String> implements Cloneable
|
|||
return s.equals("true") || s.equals("on") || s.equals("1");
|
||||
}
|
||||
|
||||
public boolean hasValue(final String key) {
|
||||
final String s = super.get(key);
|
||||
return (s != null && !s.isEmpty());
|
||||
}
|
||||
|
||||
// returns a set of all values whose key matches the keyMapper
|
||||
public String[] getAll(final String keyMapper) {
|
||||
// the keyMapper may contain regular expressions as defined in String.matches
|
||||
|
|
Loading…
Reference in New Issue
Block a user