yacy_search_server/source/net/yacy/crawler/data/Latency.java
Michael Peter Christen da86f150ab - added a new Crawler Balancer: HostBalancer and HostQueues:
This organizes all urls to be loaded in separate queues for each host.
Each host separates the crawl depth into its own queue. The primary
rule for URLs taken from any queue is that the crawl depth is minimal.
This produces a crawl depth which is identical to the clickdepth.
Furthermore, the crawl is able to create a much better balancing over
all hosts which is fair to all hosts that are in the queue.
This process will create a very large number of files for wide crawls in
the QUEUES folder: a directory for each host and, inside each directory,
a file for each crawl depth. A crawl with maxdepth = 4 can create tens
of thousands of files. To be able to use that many file readers, it
was necessary to implement a new index data structure which opens the
file only when an access is wanted (OnDemandOpenFileIndex). Such
on-demand file readers prevent the number of open file handles from
exceeding the system limit, which is usually about 10,000 open
files. Some parts of YaCy had to be adapted to handle the crawl depth
number correctly. The logging and the IndexCreateQueues servlet had to
be adapted to show the crawl queues differently, because the host name
is attached to the port on the host to differentiate between http,
https, and ftp services.
2014-04-16 21:34:28 +02:00
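
The minimal-depth selection rule can be sketched roughly as follows; the HostQueueSketch class, its TreeMap layout and its method names are illustrative assumptions only, not the actual HostBalancer/HostQueues code:

import java.util.ArrayDeque;
import java.util.Deque;
import java.util.Map;
import java.util.TreeMap;

class HostQueueSketch {
    // depth -> FIFO of URLs waiting at that depth; TreeMap keeps depths sorted
    private final TreeMap<Integer, Deque<String>> depthQueues = new TreeMap<>();

    void push(final int depth, final String url) {
        this.depthQueues.computeIfAbsent(depth, d -> new ArrayDeque<String>()).add(url);
    }

    // primary selection rule: always serve the smallest crawl depth first
    String pop() {
        final Map.Entry<Integer, Deque<String>> e = this.depthQueues.firstEntry();
        if (e == null) return null;
        final String url = e.getValue().poll();
        if (e.getValue().isEmpty()) this.depthQueues.remove(e.getKey());
        return url;
    }
}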


// Latency.java
// ------------
// (C) 2009 by Michael Peter Christen; mc@yacy.net
// first published 19.03.2009 on http://yacy.net
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.crawler.data;
import java.util.Iterator;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.crawler.robots.RobotsTxt;
import net.yacy.crawler.robots.RobotsTxtEntry;
import net.yacy.kelondro.util.MemoryControl;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
public class Latency {
// the map is a mapping from host hashes to latency records for that host
private static final int mapMaxSize = 1000;
private static final ConcurrentHashMap<String, Host> map = new ConcurrentHashMap<String, Host>();
/**
* update the latency entry after a host was selected for queueing into the loader
* @param url
* @param robotsCrawlDelay the crawl-delay given by the robots.txt; 0 if none exists
*/
public static void updateAfterSelection(final DigestURL url, final long robotsCrawlDelay) {
final String host = url.getHost();
if (host == null) return;
String hosthash = url.hosthash();
Host h = map.get(hosthash);
if (h == null) {
h = new Host(host, Switchboard.getSwitchboard().getConfigInt("crawler.defaultAverageLatency", 500), robotsCrawlDelay);
if (map.size() > mapMaxSize || MemoryControl.shortStatus()) map.clear();
map.put(hosthash, h);
}
}
/**
* update the latency entry before a host is accessed
* @param url
*/
public static void updateBeforeLoad(final DigestURL url) {
final String host = url.getHost();
if (host == null) return;
String hosthash = url.hosthash();
Host h = map.get(hosthash);
if (h == null) {
h = new Host(host, 500, 0);
if (map.size() > mapMaxSize || MemoryControl.shortStatus()) map.clear();
map.put(hosthash, h);
} else {
h.update();
}
}
/**
* update the latency entry after a host was accessed to load a file
* @param url
* @param time the time to load the file in milliseconds
*/
public static void updateAfterLoad(final DigestURL url, final long time) {
final String host = url.getHost();
if (host == null) return;
String hosthash = url.hosthash();
Host h = map.get(hosthash);
if (h == null) {
h = new Host(host, time, 0);
if (map.size() > mapMaxSize || MemoryControl.shortStatus()) map.clear();
map.put(hosthash, h);
} else {
h.update(time);
}
}
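// Typical call sequence (hypothetical call sites, sketched from the javadocs
// above): the crawler calls updateAfterSelection() when a URL is queued,
// updateBeforeLoad() right before the request, and updateAfterLoad() with the
// measured load time, e.g.
//   long t0 = System.currentTimeMillis();
//   // ... load the document ...
//   Latency.updateAfterLoad(url, System.currentTimeMillis() - t0);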
private static Host host(final DigestURL url) {
final String host = url.getHost();
if (host == null) return null;
return map.get(url.hosthash());
}
public static Iterator<Map.Entry<String, Host>> iterator() {
return map.entrySet().iterator();
}
/**
* Return the waiting time demanded by the robots.txt file of the target host.
* A special case: if the remote host assigns this crawler a dedicated crawl-delay of 0,
* -1 is returned.
* @param url
* @param robots
* @param agent
* @return the waiting time in milliseconds; 0 if not known; -1 if host gives us special rights
*/
public static int waitingRobots(final MultiProtocolURL url, final RobotsTxt robots, final ClientIdentification.Agent agent) {
final RobotsTxtEntry robotsEntry = robots.getEntry(url, agent);
final int robotsDelay = (robotsEntry == null) ? 0 : robotsEntry.getCrawlDelayMillis();
if (robotsEntry != null && robotsDelay == 0 && robotsEntry.getAgentName() != null) return -1; // no limits if granted exclusively for this peer
return robotsDelay;
}
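// Example: a "Crawl-delay: 2" in robots.txt yields 2000 here (assuming
// getCrawlDelayMillis converts the seconds value of robots.txt to
// milliseconds), while an entry that names this agent explicitly with a
// crawl-delay of 0 yields -1, i.e. no limit for this peer.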
private static int waitingRobots(final String hostport, final RobotsTxt robots, final ClientIdentification.Agent agent, final boolean fetchOnlineIfNotAvailableOrNotFresh) {
final RobotsTxtEntry robotsEntry = robots.getEntry(hostport, agent, fetchOnlineIfNotAvailableOrNotFresh);
final int robotsDelay = (robotsEntry == null) ? 0 : robotsEntry.getCrawlDelayMillis();
if (robotsEntry != null && robotsDelay == 0 && robotsEntry.getAgentName() != null) return -1; // no limits if granted exclusively for this peer
return robotsDelay;
}
/**
* guess a minimum waiting time
* The time is not exact: if the domain has not yet been checked against the robots.txt crawl-delay value, the result is too low
* @param hostname
* @param hosthash
* @param robots
* @param agent
* @return the remaining waiting time in milliseconds. The return value may be negative,
* which expresses by how much the minimum waiting time has already been exceeded.
*/
public static int waitingRemainingGuessed(final String hostname, final String hosthash, final RobotsTxt robots, final ClientIdentification.Agent agent) {
// first check if the domain was _ever_ accessed before
final Host host = map.get(hosthash);
if (host == null) return Integer.MIN_VALUE; // no delay if host is new; use Integer because there is a cast to int somewhere
// find the minimum waiting time based on the network domain (local or global)
int waiting = agent.minimumDelta;
// if we have accessed the domain many times, get slower (the flux factor)
waiting += host.flux(waiting);
// use the access latency as a rule for how fast we can access the server
// this applies also to localhost, but differently, because it is not necessary to
// consider so many external accesses
waiting = Math.max(waiting, (int) (host.average() * Switchboard.getSwitchboard().getConfigFloat(SwitchboardConstants.CRAWLER_LATENCY_FACTOR, 0.5f)));
// if the number of entries for the url's host in the loading queue exceeds MaxSameHostInQueue, increase the waiting time
if (Switchboard.getSwitchboard().crawlQueues.hostcount(hostname) > Switchboard.getSwitchboard().getConfigInt(SwitchboardConstants.CRAWLER_MAX_SAME_HOST_IN_QUEUE, 20)) waiting += 3000;
// the time since last access to the domain is the basis of the remaining calculation
final int timeSinceLastAccess = (int) (System.currentTimeMillis() - host.lastacc());
// find the delay as given by robots.txt on target site
if (robots != null) {
int robotsDelay = waitingRobots(hostname + ":80", robots, agent, false);
if (robotsDelay < 0) return -timeSinceLastAccess; // no limits if granted exclusively for this peer
waiting = Math.max(waiting, robotsDelay);
}
return Math.min(60000, waiting) - timeSinceLastAccess;
}
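// Worked example with made-up numbers: minimumDelta = 500, a fresh host so
// flux is ~0, host.average() = 1200 with latency factor 0.5 gives
// waiting = max(500, 600) = 600; with the last access 2000 ms ago the method
// returns 600 - 2000 = -1400, i.e. the host may be accessed again.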
/**
* calculates how long to wait until the domain can be accessed again
* this follows from:
* - given minimum access times
* - whether the url is a CGI url or not
* - the number of times that the domain was accessed (flux factor)
* - the response latency of the domain
* - and a given minimum access time as given in robots.txt
* @param url
* @param robots
* @param agent
* @return the remaining waiting time in milliseconds. May be negative to express how long ago the next possible loading time passed
*/
public static int waitingRemaining(final DigestURL url, final RobotsTxt robots, final ClientIdentification.Agent agent) {
// first check if the domain was _ever_ accessed before
final Host host = host(url);
if (host == null) return Integer.MIN_VALUE; // no delay if host is new; use Integer because there is a cast to int somewhere
// find the minimum waiting time based on the network domain (local or global)
boolean local = url.isLocal();
int waiting = agent.minimumDelta;
// if we have accessed the domain many times, get slower (the flux factor)
if (!local) waiting += host.flux(waiting);
// use the access latency as a rule for how fast we can access the server
waiting = Math.max(waiting, (int) (host.average() * Switchboard.getSwitchboard().getConfigFloat(SwitchboardConstants.CRAWLER_LATENCY_FACTOR, 0.5f)));
// if the number of entries for the url's host in the loading queue exceeds MaxSameHostInQueue, increase the waiting time
if (Switchboard.getSwitchboard().crawlQueues.hostcount(url.getHost()) > Switchboard.getSwitchboard().getConfigInt(SwitchboardConstants.CRAWLER_MAX_SAME_HOST_IN_QUEUE, 20)) waiting += 3000;
// the time since last access to the domain is the basis of the remaining calculation
final int timeSinceLastAccess = (int) (System.currentTimeMillis() - host.lastacc());
// find the delay as given by robots.txt on target site
int robotsDelay = waitingRobots(url, robots, agent);
if (robotsDelay < 0) return -timeSinceLastAccess; // no limits if granted exclusively for this peer
waiting = Math.max(waiting, robotsDelay);
return Math.min(60000, waiting) - timeSinceLastAccess;
}
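// In short: waiting = max(minimumDelta + flux, host.average() * latencyFactor),
// plus 3000 ms if too many URLs of the same host are queued; then
// waiting = max(waiting, robotsDelay), and the result is
// min(60000, waiting) - timeSinceLastAccess.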
public static String waitingRemainingExplain(final DigestURL url, final RobotsTxt robots, final ClientIdentification.Agent agent) {
// first check if the domain was _ever_ accessed before
final Host host = host(url);
if (host == null) return "host " + host + " never accessed before -> Integer.MIN_VALUE"; // no delay if host is new
// find the minimum waiting time based on the network domain (local or global)
boolean local = url.isLocal();
final StringBuilder s = new StringBuilder(50);
int waiting = agent.minimumDelta;
s.append("minimumDelta = ").append(waiting);
// if we have accessed the domain many times, get slower (the flux factor)
if (!local) {
int flux = host.flux(waiting);
waiting += flux;
s.append(", flux = ").append(flux);
}
// use the access latency as a rule for how fast we can access the server
// this applies also to localhost, but differently, because it is not necessary to
// consider so many external accesses
s.append(", host.average = ").append(host.average());
waiting = Math.max(waiting, (int) (host.average() * Switchboard.getSwitchboard().getConfigFloat(SwitchboardConstants.CRAWLER_LATENCY_FACTOR, 0.5f)));
// if the number of entries for the url's host in the loading queue exceeds MaxSameHostInQueue, increase the waiting time
int hostcount = Switchboard.getSwitchboard().crawlQueues.hostcount(url.getHost());
if (hostcount > Switchboard.getSwitchboard().getConfigInt(SwitchboardConstants.CRAWLER_MAX_SAME_HOST_IN_QUEUE, 20)) {
s.append(", hostcount = ").append(hostcount);
waiting += 3000; // same surcharge as in waitingRemaining()
}
// find the delay as given by robots.txt on target site
int robotsDelay = waitingRobots(url, robots, agent);
if (robotsDelay < 0) return "no waiting for exclusive granted peer"; // no limits if granted exclusively for this peer
waiting = Math.max(waiting, robotsDelay);
s.append(", robots.delay = ").append(robotsDelay);
// the time since last access to the domain is the basis of the remaining calculation
final long timeSinceLastAccess = System.currentTimeMillis() - host.lastacc();
s.append(", ((waitig = ").append(waiting);
s.append(") - (timeSinceLastAccess = ").append(timeSinceLastAccess).append(")) = ");
s.append(waiting - timeSinceLastAccess);
return s.toString();
}
/**
* Get the minimum sleep time for a given url. The result can also be negative to reflect the time since the last access.
* The time can be as low as Integer.MIN_VALUE to show that there should not be any limitation at all.
* @param robots
* @param profileEntry
* @param crawlURL
* @return the sleep time in milliseconds; may be negative for no sleep time
*/
public static long getDomainSleepTime(final RobotsTxt robots, final CrawlProfile profileEntry, final DigestURL crawlURL) {
if (profileEntry == null) return 0;
long sleeptime = (
profileEntry.cacheStrategy() == CacheStrategy.CACHEONLY ||
(profileEntry.cacheStrategy() == CacheStrategy.IFEXIST && Cache.has(crawlURL.hash()))
) ? Integer.MIN_VALUE : waitingRemaining(crawlURL, robots, profileEntry.getAgent()); // this uses the robots.txt database and may cause a loading of robots.txt from the server
return sleeptime;
}
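// Hypothetical call-site sketch: a fetcher thread would sleep for the returned
// time before loading, e.g.
//   long sleep = Latency.getDomainSleepTime(robots, profile, url);
//   if (sleep > 0) try { Thread.sleep(sleep); } catch (final InterruptedException e) {}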
/**
* load a robots.txt to get the robots time.
* ATTENTION: this method may cause a robots.txt to be loaded from the web, which can delay execution considerably.
* It shall therefore not be called in synchronized environments.
* @param robots
* @param crawlURL
* @param agent
* @return the robots.txt crawl-delay in milliseconds; 0 if no delay is given or if this peer has exclusive rights
*/
public static long getRobotsTime(final RobotsTxt robots, final DigestURL crawlURL, ClientIdentification.Agent agent) {
long sleeptime = waitingRobots(crawlURL, robots, agent); // this uses the robots.txt database and may cause a loading of robots.txt from the server
return sleeptime < 0 ? 0 : sleeptime;
}
public static final class Host {
private final AtomicLong timeacc;
private final AtomicLong lastacc;
private final AtomicInteger count;
private final String host;
private final long robotsMinDelay;
private Host(final String host, final long time, long robotsMinDelay) {
this.host = host;
this.timeacc = new AtomicLong(time);
this.count = new AtomicInteger(1);
this.lastacc = new AtomicLong(System.currentTimeMillis());
this.robotsMinDelay = robotsMinDelay;
}
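// The pair (timeacc, count) implements a running mean: average() returns
// timeacc / count. Once count exceeds 100, update(time) collapses the pair
// to its current mean so that newer measurements regain weight quickly;
// individual samples are capped at 30000 ms.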
private void update(final long time) {
if (this.count.get() > 100) {
synchronized(this) {
// faster adaptation to new values
this.timeacc.set(this.timeacc.get() / this.count.get());
this.count.set(1);
}
}
this.lastacc.set(System.currentTimeMillis());
this.timeacc.addAndGet(Math.min(30000, time));
this.count.incrementAndGet();
}
private void update() {
this.lastacc.set(System.currentTimeMillis());
}
public int count() {
return this.count.get();
}
public int average() {
return (int) (this.timeacc.get() / this.count.get());
}
public long lastacc() {
return this.lastacc.get();
}
public String host() {
return this.host;
}
public long robotsDelay() {
return this.robotsMinDelay;
}
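// flux() adds a surcharge that grows with the access count: below 10000
// accesses it is range / (10000 - count), i.e. nearly zero for a fresh host
// (count = 1 gives range / 9999) and approaching the full range near 10000;
// at 10000 accesses and beyond it is range * min(5000, count) / 10000,
// which evaluates to range / 2.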
public int flux(final int range) {
return this.count.get() >= 10000 ? range * Math.min(5000, this.count.get()) / 10000 : range / (10000 - this.count.get());
}
}
}