2009-03-20 12:21:32 +01:00
// Latency.java
// ------------
// (C) 2009 by Michael Peter Christen; mc@yacy.net
// first published 19.03.2009 on http://yacy.net
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.crawler ;
2011-04-04 01:39:45 +02:00
import java.io.IOException ;
2009-03-20 12:21:32 +01:00
import java.util.Iterator ;
import java.util.Map ;
2011-04-04 01:39:45 +02:00
import java.util.Set ;
2009-03-20 12:21:32 +01:00
import java.util.concurrent.ConcurrentHashMap ;
2010-05-26 02:01:16 +02:00
import net.yacy.cora.document.MultiProtocolURI ;
2010-08-23 14:32:02 +02:00
import net.yacy.cora.protocol.Domains ;
2011-03-09 17:32:34 +01:00
import net.yacy.kelondro.util.MemoryControl ;
2011-09-25 18:59:06 +02:00
import net.yacy.search.Switchboard ;
2009-10-11 02:12:19 +02:00
2009-03-20 12:21:32 +01:00
public class Latency {
2010-05-26 02:01:16 +02:00
// the map is a mapping from host names to host configurations
2009-03-20 12:21:32 +01:00
// shared by all static methods of this class; the update and slowdown methods empty it
// completely once it holds more than 1000 hosts or MemoryControl reports a short status
private static final ConcurrentHashMap < String , Host > map = new ConcurrentHashMap < String , Host > ( ) ;
2010-05-26 02:01:16 +02:00
public static void update ( MultiProtocolURI url , long time ) {
2010-05-30 13:17:38 +02:00
String host = url . getHost ( ) ;
if ( host = = null ) return ;
Host h = map . get ( host ) ;
2009-03-20 12:21:32 +01:00
if ( h = = null ) {
2010-05-30 13:17:38 +02:00
h = new Host ( host , time ) ;
2011-03-09 17:32:34 +01:00
if ( map . size ( ) > 1000 | | MemoryControl . shortStatus ( ) ) map . clear ( ) ;
2010-05-30 13:17:38 +02:00
map . put ( host , h ) ;
2009-03-20 12:21:32 +01:00
} else {
h . update ( time ) ;
}
}
2010-05-26 02:01:16 +02:00
public static void update ( MultiProtocolURI url ) {
2010-05-30 13:17:38 +02:00
String host = url . getHost ( ) ;
if ( host = = null ) return ;
Host h = map . get ( host ) ;
2009-06-06 10:46:59 +02:00
if ( h = = null ) {
2010-05-30 13:17:38 +02:00
h = new Host ( host , 3000 ) ;
2011-03-09 17:32:34 +01:00
if ( map . size ( ) > 1000 | | MemoryControl . shortStatus ( ) ) map . clear ( ) ;
2010-05-30 13:17:38 +02:00
map . put ( host , h ) ;
2009-06-06 10:46:59 +02:00
} else {
h . update ( ) ;
}
}
2010-05-26 02:01:16 +02:00
public static void slowdown ( MultiProtocolURI url ) {
2010-05-30 13:17:38 +02:00
String host = url . getHost ( ) ;
if ( host = = null ) return ;
Host h = map . get ( host ) ;
2009-04-01 15:21:47 +02:00
if ( h = = null ) {
2010-05-30 13:17:38 +02:00
h = new Host ( host , 3000 ) ;
2011-03-09 17:32:34 +01:00
if ( map . size ( ) > 1000 | | MemoryControl . shortStatus ( ) ) map . clear ( ) ;
2010-05-30 13:17:38 +02:00
map . put ( host , h ) ;
2009-04-01 15:21:47 +02:00
} else {
h . slowdown ( ) ;
}
}
2010-05-26 02:01:16 +02:00
public static Host host ( MultiProtocolURI url ) {
2010-05-30 13:17:38 +02:00
String host = url . getHost ( ) ;
if ( host = = null ) return null ;
return map . get ( host ) ;
2009-03-20 12:21:32 +01:00
}
2010-05-26 02:01:16 +02:00
public static int average ( MultiProtocolURI url ) {
2010-05-30 13:17:38 +02:00
String host = url . getHost ( ) ;
if ( host = = null ) return 0 ;
Host h = map . get ( host ) ;
if ( h = = null ) return 0 ;
2009-03-20 12:21:32 +01:00
return h . average ( ) ;
}
/**
 * Gives access to all host entries of the table.
 *
 * @return an iterator over the (host name, host entry) pairs of the underlying map
 */
public static Iterator<Map.Entry<String, Host>> iterator() {
    final Set<Map.Entry<String, Host>> entries = map.entrySet();
    return entries.iterator();
}
/ * *
* calculate the time since the last access of the domain as referenced by the url hash
* @param urlhash
* @return a time in milliseconds since last access of the domain or Long . MAX_VALUE if the domain was not accessed before
* /
2010-05-26 02:01:16 +02:00
public static long lastAccessDelta ( MultiProtocolURI url ) {
final Latency . Host host = Latency . host ( url ) ;
2009-03-20 12:21:32 +01:00
if ( host = = null ) return Long . MAX_VALUE ; // never accessed
return System . currentTimeMillis ( ) - host . lastacc ( ) ;
}
/ * *
* guess a minimum waiting time
* the time is not correct , because if the domain was not checked yet by the robots . txt delay value , it is too low
* also the ' isCGI ' property is missing , because the full text of the domain is unknown here
2010-05-30 12:28:42 +02:00
* @param hostname
2009-03-20 12:21:32 +01:00
* @param minimumLocalDelta
* @param minimumGlobalDelta
2009-06-06 10:46:59 +02:00
* @return the remaining waiting time in milliseconds . The return value may be negative
* which expresses how long the time is over the minimum waiting time .
2009-03-20 12:21:32 +01:00
* /
2010-05-26 02:01:16 +02:00
public static long waitingRemainingGuessed ( String hostname , final long minimumLocalDelta , final long minimumGlobalDelta ) {
2010-05-30 13:17:38 +02:00
if ( hostname = = null ) return 0 ;
2010-05-26 02:01:16 +02:00
Host host = map . get ( hostname ) ;
2010-05-30 13:17:38 +02:00
if ( host = = null ) return 0 ;
2009-06-06 03:56:31 +02:00
// the time since last access to the domain is the basis of the remaining calculation
final long timeSinceLastAccess = System . currentTimeMillis ( ) - host . lastacc ( ) ;
// find the minimum waiting time based on the network domain (local or global)
2010-07-18 22:14:20 +02:00
final boolean local = Domains . isLocal ( hostname ) ;
2009-06-06 03:56:31 +02:00
long waiting = ( local ) ? minimumLocalDelta : minimumGlobalDelta ;
2009-03-20 12:21:32 +01:00
2009-06-06 03:56:31 +02:00
// if we have accessed the domain many times, get slower (the flux factor)
if ( ! local ) waiting + = host . flux ( waiting ) ;
// use the access latency as rule how fast we can access the server
// this applies also to localhost, but differently, because it is not necessary to
// consider so many external accesses
waiting = Math . max ( waiting , ( local ) ? host . average ( ) / 2 : host . average ( ) * 2 ) ;
// prevent that that a robots file can stop our indexer completely
waiting = Math . min ( 60000 , waiting ) ;
// return time that is remaining
//System.out.println("Latency: " + (waiting - timeSinceLastAccess));
2009-06-06 10:46:59 +02:00
return waiting - timeSinceLastAccess ;
2009-03-20 12:21:32 +01:00
}
/ * *
* calculates how long should be waited until the domain can be accessed again
* this follows from :
* - given minimum access times
* - the fact that an url is a CGI url or not
* - the times that the domain was accessed ( flux factor )
* - the response latency of the domain
* - and a given minimum access time as given in robots . txt
* @param minimumLocalDelta
* @param minimumGlobalDelta
* @return the remaining waiting time in milliseconds
* /
2011-04-04 01:39:45 +02:00
public static long waitingRemaining ( MultiProtocolURI url , final Set < String > thisAgents , final long minimumLocalDelta , final long minimumGlobalDelta ) {
2009-07-15 16:15:51 +02:00
2009-03-20 12:21:32 +01:00
// first check if the domain was _ever_ accessed before
2010-05-26 02:01:16 +02:00
Host host = host ( url ) ;
2009-09-28 00:35:22 +02:00
if ( host = = null ) return Long . MIN_VALUE ; // no delay if host is new
// find the minimum waiting time based on the network domain (local or global)
final boolean local = url . isLocal ( ) ;
2010-03-22 10:12:18 +01:00
if ( local ) return minimumLocalDelta ;
2009-09-28 00:35:22 +02:00
long waiting = ( local ) ? minimumLocalDelta : minimumGlobalDelta ;
2009-03-20 12:21:32 +01:00
// the time since last access to the domain is the basis of the remaining calculation
2011-04-04 01:39:45 +02:00
final long timeSinceLastAccess = System . currentTimeMillis ( ) - host . lastacc ( ) ;
2009-03-20 12:21:32 +01:00
// for CGI accesses, we double the minimum time
// mostly there is a database access in the background
// which creates a lot of unwanted IO on target site
if ( url . isCGI ( ) ) waiting = waiting * 2 ;
// if we have accessed the domain many times, get slower (the flux factor)
2009-07-15 16:15:51 +02:00
if ( ! local & & host ! = null ) waiting + = host . flux ( waiting ) ;
2009-03-20 12:21:32 +01:00
// find the delay as given by robots.txt on target site
2011-04-04 01:39:45 +02:00
long robotsDelay = 0 ;
if ( ! local ) {
2011-05-02 16:05:51 +02:00
RobotsTxtEntry robotsEntry ;
2011-04-04 01:39:45 +02:00
try {
robotsEntry = Switchboard . getSwitchboard ( ) . robots . getEntry ( url , thisAgents ) ;
} catch ( IOException e ) {
robotsEntry = null ;
}
robotsDelay = ( robotsEntry = = null ) ? 0 : robotsEntry . getCrawlDelayMillis ( ) ;
2011-04-04 11:47:18 +02:00
if ( robotsEntry ! = null & & robotsDelay = = 0 & & robotsEntry . getAgentName ( ) ! = null ) return 0 ; // no limits if granted exclusively for this peer
2011-04-04 01:39:45 +02:00
}
2009-03-20 12:21:32 +01:00
waiting = Math . max ( waiting , robotsDelay ) ;
// use the access latency as rule how fast we can access the server
// this applies also to localhost, but differently, because it is not necessary to
// consider so many external accesses
2011-04-04 01:39:45 +02:00
waiting = Math . max ( waiting , ( local ) ? host . average ( ) / 2 : host . average ( ) * 2 ) ;
2009-03-20 12:21:32 +01:00
// prevent that that a robots file can stop our indexer completely
waiting = Math . min ( 60000 , waiting ) ;
// return time that is remaining
2009-03-20 15:54:37 +01:00
//System.out.println("Latency: " + (waiting - timeSinceLastAccess));
2009-03-20 12:21:32 +01:00
return Math . max ( 0 , waiting - timeSinceLastAccess ) ;
}
2009-09-28 00:35:22 +02:00
2011-04-04 01:39:45 +02:00
public static String waitingRemainingExplain ( MultiProtocolURI url , final Set < String > thisAgents , final long minimumLocalDelta , final long minimumGlobalDelta ) {
2009-09-28 00:35:22 +02:00
// first check if the domain was _ever_ accessed before
2010-05-26 02:01:16 +02:00
Host host = host ( url ) ;
2010-05-30 13:17:38 +02:00
if ( host = = null ) return " host " + host + " never accessed before -> 0 " ; // no delay if host is new
2009-09-28 00:35:22 +02:00
StringBuilder s = new StringBuilder ( 50 ) ;
// find the minimum waiting time based on the network domain (local or global)
final boolean local = url . isLocal ( ) ;
long waiting = ( local ) ? minimumLocalDelta : minimumGlobalDelta ;
s . append ( " minimumDelta = " ) . append ( waiting ) ;
// the time since last access to the domain is the basis of the remaining calculation
final long timeSinceLastAccess = ( host = = null ) ? 0 : System . currentTimeMillis ( ) - host . lastacc ( ) ;
s . append ( " , timeSinceLastAccess = " ) . append ( timeSinceLastAccess ) ;
// for CGI accesses, we double the minimum time
// mostly there is a database access in the background
// which creates a lot of unwanted IO on target site
if ( url . isCGI ( ) ) s . append ( " , isCGI = true -> double " ) ;
// if we have accessed the domain many times, get slower (the flux factor)
if ( ! local & & host ! = null ) s . append ( " , flux = " ) . append ( host . flux ( waiting ) ) ;
// find the delay as given by robots.txt on target site
2011-04-04 01:39:45 +02:00
long robotsDelay = 0 ;
if ( ! local ) {
2011-05-02 16:05:51 +02:00
RobotsTxtEntry robotsEntry ;
2011-04-04 01:39:45 +02:00
try {
robotsEntry = Switchboard . getSwitchboard ( ) . robots . getEntry ( url , thisAgents ) ;
} catch ( IOException e ) {
robotsEntry = null ;
}
robotsDelay = ( robotsEntry = = null ) ? 0 : robotsEntry . getCrawlDelayMillis ( ) ;
2011-04-04 14:20:20 +02:00
if ( robotsEntry ! = null & & robotsDelay = = 0 & & robotsEntry . getAgentName ( ) ! = null ) return " no waiting for exclusive granted peer " ; // no limits if granted exclusively for this peer
2011-04-04 01:39:45 +02:00
}
2009-09-28 00:35:22 +02:00
s . append ( " , robots.delay = " ) . append ( robotsDelay ) ;
// use the access latency as rule how fast we can access the server
// this applies also to localhost, but differently, because it is not necessary to
// consider so many external accesses
if ( host ! = null ) s . append ( " , host.average = " ) . append ( host . average ( ) ) ;
return s . toString ( ) ;
}
2009-03-20 12:21:32 +01:00
public static final class Host {
private long timeacc ;
private long lastacc ;
private int count ;
2010-01-11 00:09:48 +01:00
private final String host ;
2009-03-20 12:21:32 +01:00
private long robotsMinDelay ;
public Host ( String host , long time ) {
this . host = host ;
this . timeacc = time ;
this . count = 1 ;
this . lastacc = System . currentTimeMillis ( ) ;
this . robotsMinDelay = 0 ;
}
public void update ( long time ) {
this . lastacc = System . currentTimeMillis ( ) ;
2009-07-25 23:38:57 +02:00
this . timeacc + = Math . min ( 30000 , time ) ;
2009-03-20 12:21:32 +01:00
this . count + + ;
}
2009-06-06 10:46:59 +02:00
public void update ( ) {
this . lastacc = System . currentTimeMillis ( ) ;
}
2009-04-01 15:21:47 +02:00
public void slowdown ( ) {
this . lastacc = System . currentTimeMillis ( ) ;
2009-04-01 22:13:57 +02:00
this . timeacc = Math . min ( 60000 , average ( ) * 2 ) ;
2009-04-01 15:21:47 +02:00
this . count = 1 ;
}
2009-03-20 12:21:32 +01:00
public int count ( ) {
return this . count ;
}
public int average ( ) {
return ( int ) ( this . timeacc / this . count ) ;
}
public long lastacc ( ) {
return this . lastacc ;
}
public String host ( ) {
return this . host ;
}
public void robotsDelay ( long ur ) {
this . robotsMinDelay = ur ;
}
public long robotsDelay ( ) {
return this . robotsMinDelay ;
}
public long flux ( long range ) {
return count > = 1000 ? range * Math . min ( 5000 , count ) / 1000 : range / ( 1000 - count ) ;
}
}
}