2009-03-20 12:21:32 +01:00
// Latency.java
// ------------
// (C) 2009 by Michael Peter Christen; mc@yacy.net
// first published 19.03.2009 on http://yacy.net
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
2012-09-21 15:48:16 +02:00
package net.yacy.crawler.data ;
2009-03-20 12:21:32 +01:00
2011-04-04 01:39:45 +02:00
import java.io.IOException ;
2009-03-20 12:21:32 +01:00
import java.util.Iterator ;
import java.util.Map ;
2011-04-04 01:39:45 +02:00
import java.util.Set ;
2009-03-20 12:21:32 +01:00
import java.util.concurrent.ConcurrentHashMap ;
2010-05-26 02:01:16 +02:00
import net.yacy.cora.document.MultiProtocolURI ;
2010-08-23 14:32:02 +02:00
import net.yacy.cora.protocol.Domains ;
2012-09-21 15:48:16 +02:00
import net.yacy.crawler.robots.RobotsTxt ;
import net.yacy.crawler.robots.RobotsTxtEntry ;
2011-03-09 17:32:34 +01:00
import net.yacy.kelondro.util.MemoryControl ;
2009-10-11 02:12:19 +02:00
2009-03-20 12:21:32 +01:00
public class Latency {
2012-05-15 12:24:54 +02:00
private final static int DEFAULT_AVERAGE = 300 ;
2010-05-26 02:01:16 +02:00
// the map is a mapping from host names to host configurations
2009-03-20 12:21:32 +01:00
private static final ConcurrentHashMap < String , Host > map = new ConcurrentHashMap < String , Host > ( ) ;
2011-09-30 10:26:31 +02:00
public static void update ( final MultiProtocolURI url , final long time ) {
final String host = url . getHost ( ) ;
2010-05-30 13:17:38 +02:00
if ( host = = null ) return ;
Host h = map . get ( host ) ;
2009-03-20 12:21:32 +01:00
if ( h = = null ) {
2010-05-30 13:17:38 +02:00
h = new Host ( host , time ) ;
2011-03-09 17:32:34 +01:00
if ( map . size ( ) > 1000 | | MemoryControl . shortStatus ( ) ) map . clear ( ) ;
2010-05-30 13:17:38 +02:00
map . put ( host , h ) ;
2009-03-20 12:21:32 +01:00
} else {
h . update ( time ) ;
}
}
2011-09-30 10:26:31 +02:00
public static void update ( final MultiProtocolURI url ) {
final String host = url . getHost ( ) ;
2010-05-30 13:17:38 +02:00
if ( host = = null ) return ;
Host h = map . get ( host ) ;
2009-06-06 10:46:59 +02:00
if ( h = = null ) {
2012-05-15 12:24:54 +02:00
h = new Host ( host , DEFAULT_AVERAGE ) ;
2011-03-09 17:32:34 +01:00
if ( map . size ( ) > 1000 | | MemoryControl . shortStatus ( ) ) map . clear ( ) ;
2010-05-30 13:17:38 +02:00
map . put ( host , h ) ;
2009-06-06 10:46:59 +02:00
} else {
h . update ( ) ;
}
}
2011-09-30 10:26:31 +02:00
public static void slowdown ( final MultiProtocolURI url ) {
final String host = url . getHost ( ) ;
2010-05-30 13:17:38 +02:00
if ( host = = null ) return ;
Host h = map . get ( host ) ;
2009-04-01 15:21:47 +02:00
if ( h = = null ) {
2012-05-15 12:24:54 +02:00
h = new Host ( host , DEFAULT_AVERAGE ) ;
2011-03-09 17:32:34 +01:00
if ( map . size ( ) > 1000 | | MemoryControl . shortStatus ( ) ) map . clear ( ) ;
2010-05-30 13:17:38 +02:00
map . put ( host , h ) ;
2009-04-01 15:21:47 +02:00
} else {
h . slowdown ( ) ;
}
}
2011-09-30 10:26:31 +02:00
public static Host host ( final MultiProtocolURI url ) {
final String host = url . getHost ( ) ;
2010-05-30 13:17:38 +02:00
if ( host = = null ) return null ;
return map . get ( host ) ;
2009-03-20 12:21:32 +01:00
}
2011-09-30 10:26:31 +02:00
public static int average ( final MultiProtocolURI url ) {
final String host = url . getHost ( ) ;
2010-05-30 13:17:38 +02:00
if ( host = = null ) return 0 ;
2011-09-30 10:26:31 +02:00
final Host h = map . get ( host ) ;
2010-05-30 13:17:38 +02:00
if ( h = = null ) return 0 ;
2009-03-20 12:21:32 +01:00
return h . average ( ) ;
}
2011-09-30 10:26:31 +02:00
2009-03-20 12:21:32 +01:00
/**
 * Gives access to all latency records in the table.
 *
 * @return an iterator over the (host name, record) entries of the internal map
 */
public static Iterator<Map.Entry<String, Host>> iterator() {
    final Set<Map.Entry<String, Host>> entries = map.entrySet();
    return entries.iterator();
}
2011-09-30 10:26:31 +02:00
2009-03-20 12:21:32 +01:00
/ * *
* calculate the time since the last access of the domain as referenced by the url hash
* @param urlhash
* @return a time in milliseconds since last access of the domain or Long . MAX_VALUE if the domain was not accessed before
* /
2011-09-30 10:26:31 +02:00
public static long lastAccessDelta ( final MultiProtocolURI url ) {
2010-05-26 02:01:16 +02:00
final Latency . Host host = Latency . host ( url ) ;
2009-03-20 12:21:32 +01:00
if ( host = = null ) return Long . MAX_VALUE ; // never accessed
return System . currentTimeMillis ( ) - host . lastacc ( ) ;
}
2011-09-30 10:26:31 +02:00
2009-03-20 12:21:32 +01:00
/ * *
* guess a minimum waiting time
* the time is not correct , because if the domain was not checked yet by the robots . txt delay value , it is too low
* also the ' isCGI ' property is missing , because the full text of the domain is unknown here
2010-05-30 12:28:42 +02:00
* @param hostname
2009-03-20 12:21:32 +01:00
* @param minimumLocalDelta
* @param minimumGlobalDelta
2009-06-06 10:46:59 +02:00
* @return the remaining waiting time in milliseconds . The return value may be negative
* which expresses how long the time is over the minimum waiting time .
2009-03-20 12:21:32 +01:00
* /
2011-09-30 10:26:31 +02:00
public static long waitingRemainingGuessed ( final String hostname , final long minimumLocalDelta , final long minimumGlobalDelta ) {
2012-05-15 12:24:54 +02:00
if ( hostname = = null ) return Long . MIN_VALUE ;
2011-09-30 10:26:31 +02:00
2012-05-15 12:24:54 +02:00
// first check if the domain was _ever_ accessed before
final Host host = map . get ( hostname ) ;
if ( host = = null ) return Long . MIN_VALUE ; // no delay if host is new
2011-09-30 10:26:31 +02:00
2009-06-06 03:56:31 +02:00
// find the minimum waiting time based on the network domain (local or global)
2011-09-30 10:26:31 +02:00
final boolean local = Domains . isLocal ( hostname , null ) ;
2012-05-15 12:24:54 +02:00
if ( local ) return minimumLocalDelta ;
long waiting = minimumGlobalDelta ;
2011-09-30 10:26:31 +02:00
2009-06-06 03:56:31 +02:00
// if we have accessed the domain many times, get slower (the flux factor)
2012-05-15 12:24:54 +02:00
waiting + = host . flux ( waiting ) ;
2011-09-30 10:26:31 +02:00
2009-06-06 03:56:31 +02:00
// use the access latency as rule how fast we can access the server
// this applies also to localhost, but differently, because it is not necessary to
// consider so many external accesses
2012-05-15 12:24:54 +02:00
waiting = Math . max ( waiting , host . average ( ) * 2 ) ;
2011-09-30 10:26:31 +02:00
2012-05-15 12:24:54 +02:00
// the time since last access to the domain is the basis of the remaining calculation
final long timeSinceLastAccess = System . currentTimeMillis ( ) - host . lastacc ( ) ;
return Math . max ( 0 , Math . min ( 60000 , waiting ) - timeSinceLastAccess ) ;
2009-03-20 12:21:32 +01:00
}
2011-09-30 10:26:31 +02:00
2009-03-20 12:21:32 +01:00
/ * *
* calculates how long should be waited until the domain can be accessed again
* this follows from :
* - given minimum access times
* - the fact that an url is a CGI url or not
* - the times that the domain was accessed ( flux factor )
* - the response latency of the domain
* - and a given minimum access time as given in robots . txt
* @param minimumLocalDelta
* @param minimumGlobalDelta
* @return the remaining waiting time in milliseconds
* /
2012-04-21 13:47:48 +02:00
public static long waitingRemaining ( final MultiProtocolURI url , final RobotsTxt robots , final Set < String > thisAgents , final long minimumLocalDelta , final long minimumGlobalDelta ) {
2009-07-15 16:15:51 +02:00
2009-03-20 12:21:32 +01:00
// first check if the domain was _ever_ accessed before
2011-09-30 10:26:31 +02:00
final Host host = host ( url ) ;
2009-09-28 00:35:22 +02:00
if ( host = = null ) return Long . MIN_VALUE ; // no delay if host is new
2011-09-30 10:26:31 +02:00
2009-09-28 00:35:22 +02:00
// find the minimum waiting time based on the network domain (local or global)
final boolean local = url . isLocal ( ) ;
2010-03-22 10:12:18 +01:00
if ( local ) return minimumLocalDelta ;
2012-05-15 12:24:54 +02:00
long waiting = minimumGlobalDelta ;
2011-09-30 10:26:31 +02:00
2009-03-20 12:21:32 +01:00
// for CGI accesses, we double the minimum time
// mostly there is a database access in the background
// which creates a lot of unwanted IO on target site
if ( url . isCGI ( ) ) waiting = waiting * 2 ;
// if we have accessed the domain many times, get slower (the flux factor)
2012-05-15 12:24:54 +02:00
waiting + = host . flux ( waiting ) ;
2011-09-30 10:26:31 +02:00
2009-03-20 12:21:32 +01:00
// use the access latency as rule how fast we can access the server
// this applies also to localhost, but differently, because it is not necessary to
// consider so many external accesses
2012-05-15 12:24:54 +02:00
waiting = Math . max ( waiting , host . average ( ) * 2 ) ;
2011-09-30 10:26:31 +02:00
2012-05-15 12:24:54 +02:00
// find the delay as given by robots.txt on target site
long robotsDelay = 0 ;
RobotsTxtEntry robotsEntry ;
try {
robotsEntry = robots . getEntry ( url , thisAgents ) ;
} catch ( final IOException e ) {
robotsEntry = null ;
}
robotsDelay = ( robotsEntry = = null ) ? 0 : robotsEntry . getCrawlDelayMillis ( ) ;
if ( robotsEntry ! = null & & robotsDelay = = 0 & & robotsEntry . getAgentName ( ) ! = null ) return 0 ; // no limits if granted exclusively for this peer
2011-09-30 10:26:31 +02:00
2012-05-15 12:24:54 +02:00
waiting = Math . max ( waiting , robotsDelay ) ;
// the time since last access to the domain is the basis of the remaining calculation
final long timeSinceLastAccess = System . currentTimeMillis ( ) - host . lastacc ( ) ;
return Math . max ( 0 , Math . min ( 60000 , waiting ) - timeSinceLastAccess ) ;
2009-03-20 12:21:32 +01:00
}
2011-09-30 10:26:31 +02:00
2012-04-21 13:47:48 +02:00
public static String waitingRemainingExplain ( final MultiProtocolURI url , final RobotsTxt robots , final Set < String > thisAgents , final long minimumLocalDelta , final long minimumGlobalDelta ) {
2011-09-30 10:26:31 +02:00
2009-09-28 00:35:22 +02:00
// first check if the domain was _ever_ accessed before
2011-09-30 10:26:31 +02:00
final Host host = host ( url ) ;
2012-05-15 12:24:54 +02:00
if ( host = = null ) return " host " + host + " never accessed before -> Long.MIN_VALUE " ; // no delay if host is new
2011-09-30 10:26:31 +02:00
final StringBuilder s = new StringBuilder ( 50 ) ;
2009-09-28 00:35:22 +02:00
// find the minimum waiting time based on the network domain (local or global)
final boolean local = url . isLocal ( ) ;
2012-05-15 12:24:54 +02:00
if ( local ) return " local host -> minimum local: " + minimumLocalDelta ;
long waiting = minimumGlobalDelta ;
2009-09-28 00:35:22 +02:00
s . append ( " minimumDelta = " ) . append ( waiting ) ;
2011-09-30 10:26:31 +02:00
2009-09-28 00:35:22 +02:00
// for CGI accesses, we double the minimum time
// mostly there is a database access in the background
// which creates a lot of unwanted IO on target site
2012-05-15 12:24:54 +02:00
if ( url . isCGI ( ) ) { waiting = waiting * 2 ; s . append ( " , isCGI = true -> double " ) ; }
2009-09-28 00:35:22 +02:00
// if we have accessed the domain many times, get slower (the flux factor)
2012-05-15 12:24:54 +02:00
long flux = host . flux ( waiting ) ;
waiting + = flux ;
s . append ( " , flux = " ) . append ( flux ) ;
// use the access latency as rule how fast we can access the server
// this applies also to localhost, but differently, because it is not necessary to
// consider so many external accesses
s . append ( " , host.average = " ) . append ( host . average ( ) ) ;
waiting = Math . max ( waiting , host . average ( ) * 2 ) ;
2011-09-30 10:26:31 +02:00
2009-09-28 00:35:22 +02:00
// find the delay as given by robots.txt on target site
2011-04-04 01:39:45 +02:00
long robotsDelay = 0 ;
2012-05-15 12:24:54 +02:00
RobotsTxtEntry robotsEntry ;
try {
robotsEntry = robots . getEntry ( url , thisAgents ) ;
} catch ( final IOException e ) {
robotsEntry = null ;
2011-04-04 01:39:45 +02:00
}
2012-05-15 12:24:54 +02:00
robotsDelay = ( robotsEntry = = null ) ? 0 : robotsEntry . getCrawlDelayMillis ( ) ;
if ( robotsEntry ! = null & & robotsDelay = = 0 & & robotsEntry . getAgentName ( ) ! = null ) return " no waiting for exclusive granted peer " ; // no limits if granted exclusively for this peer
2011-09-30 10:26:31 +02:00
2012-05-15 12:24:54 +02:00
waiting = Math . max ( waiting , robotsDelay ) ;
s . append ( " , robots.delay = " ) . append ( robotsDelay ) ;
2011-09-30 10:26:31 +02:00
2012-05-15 12:24:54 +02:00
// the time since last access to the domain is the basis of the remaining calculation
final long timeSinceLastAccess = System . currentTimeMillis ( ) - host . lastacc ( ) ;
s . append ( " , ((waitig = " ) . append ( waiting ) ;
s . append ( " ) - (timeSinceLastAccess = " ) . append ( timeSinceLastAccess ) . append ( " )) = " ) ;
s . append ( waiting - timeSinceLastAccess ) ;
2009-09-28 00:35:22 +02:00
return s . toString ( ) ;
}
2011-09-30 10:26:31 +02:00
2009-03-20 12:21:32 +01:00
public static final class Host {
private long timeacc ;
private long lastacc ;
private int count ;
2010-01-11 00:09:48 +01:00
private final String host ;
2009-03-20 12:21:32 +01:00
private long robotsMinDelay ;
2011-09-30 10:26:31 +02:00
public Host ( final String host , final long time ) {
2009-03-20 12:21:32 +01:00
this . host = host ;
this . timeacc = time ;
this . count = 1 ;
this . lastacc = System . currentTimeMillis ( ) ;
this . robotsMinDelay = 0 ;
}
2011-09-30 10:26:31 +02:00
public void update ( final long time ) {
2009-03-20 12:21:32 +01:00
this . lastacc = System . currentTimeMillis ( ) ;
2009-07-25 23:38:57 +02:00
this . timeacc + = Math . min ( 30000 , time ) ;
2009-03-20 12:21:32 +01:00
this . count + + ;
}
2009-06-06 10:46:59 +02:00
public void update ( ) {
this . lastacc = System . currentTimeMillis ( ) ;
}
2009-04-01 15:21:47 +02:00
public void slowdown ( ) {
this . lastacc = System . currentTimeMillis ( ) ;
2009-04-01 22:13:57 +02:00
this . timeacc = Math . min ( 60000 , average ( ) * 2 ) ;
2009-04-01 15:21:47 +02:00
this . count = 1 ;
}
2009-03-20 12:21:32 +01:00
public int count ( ) {
return this . count ;
}
public int average ( ) {
return ( int ) ( this . timeacc / this . count ) ;
}
public long lastacc ( ) {
return this . lastacc ;
}
public String host ( ) {
return this . host ;
}
2011-09-30 10:26:31 +02:00
public void robotsDelay ( final long ur ) {
2009-03-20 12:21:32 +01:00
this . robotsMinDelay = ur ;
}
public long robotsDelay ( ) {
return this . robotsMinDelay ;
}
2011-09-30 10:26:31 +02:00
public long flux ( final long range ) {
return this . count > = 1000 ? range * Math . min ( 5000 , this . count ) / 1000 : range / ( 1000 - this . count ) ;
2009-03-20 12:21:32 +01:00
}
}
2011-09-30 10:26:31 +02:00
2009-03-20 12:21:32 +01:00
}