2005-09-25 03:09:21 +02:00
// plasmaCrawlBalancer.java
// -----------------------
// part of YaCy
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2005
// created: 24.09.2005
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this software or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follows this copyright notice here, but changes must not be
// done inside the copyright notice above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
package de.anomic.plasma;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Map;
import java.util.TreeMap;

import de.anomic.kelondro.kelondroAbstractRecords;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroEcoTable;
import de.anomic.kelondro.kelondroIndex;
import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroStack;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacySeedDB;
public class plasmaCrawlBalancer {
2007-03-16 14:25:56 +01:00
2008-01-17 22:48:08 +01:00
private static final String stackSuffix = " 9.stack " ;
private static final String indexSuffix = " 9.db " ;
private static final int EcoFSBufferSize = 200 ;
2005-09-25 03:09:21 +02:00
2007-02-21 17:23:31 +01:00
// a shared domainAccess map for all balancers
2008-01-07 23:36:48 +01:00
private static final Map < String , domaccess > domainAccess = Collections . synchronizedMap ( new HashMap < String , domaccess > ( ) ) ;
2007-02-21 17:23:31 +01:00
// definition of payload for fileStack
2007-03-16 14:25:56 +01:00
private static final kelondroRow stackrow = new kelondroRow ( " byte[] urlhash- " + yacySeedDB . commonHashLength , kelondroBase64Order . enhancedCoder , 0 ) ;
2007-02-21 17:23:31 +01:00
// class variables
2008-01-07 23:36:48 +01:00
private ArrayList < String > urlRAMStack ; // a list that is flushed first
private kelondroStack urlFileStack ; // a file with url hashes
private kelondroIndex urlFileIndex ;
private HashMap < String , LinkedList < String > > domainStacks ; // a map from domain name part to Lists with url hashs
private File cacheStacksPath ;
private String stackname ;
private boolean top ; // to alternate between top and bottom of the file stack
2008-01-19 02:50:24 +01:00
private boolean fullram ;
2008-01-07 23:36:48 +01:00
public static class domaccess {
long time ;
int count ;
public domaccess ( ) {
this . time = System . currentTimeMillis ( ) ;
this . count = 0 ;
}
public void update ( ) {
this . time = System . currentTimeMillis ( ) ;
this . count + + ;
}
public long time ( ) {
return this . time ;
}
public int count ( ) {
return this . count ;
}
}
2005-09-25 03:09:21 +02:00
2008-01-19 02:50:24 +01:00
public plasmaCrawlBalancer ( File cachePath , String stackname , boolean fullram ) {
2007-03-16 14:25:56 +01:00
this . cacheStacksPath = cachePath ;
this . stackname = stackname ;
File stackFile = new File ( cachePath , stackname + stackSuffix ) ;
2007-09-28 03:21:31 +02:00
this . urlFileStack = kelondroStack . open ( stackFile , stackrow ) ;
2008-01-07 23:36:48 +01:00
this . domainStacks = new HashMap < String , LinkedList < String > > ( ) ;
this . urlRAMStack = new ArrayList < String > ( ) ;
2007-09-28 03:21:31 +02:00
this . top = true ;
2008-01-19 02:50:24 +01:00
this . fullram = fullram ;
2007-03-16 14:25:56 +01:00
// create a stack for newly entered entries
if ( ! ( cachePath . exists ( ) ) ) cachePath . mkdir ( ) ; // make the path
openFileIndex ( ) ;
2005-09-25 03:09:21 +02:00
}
2007-02-21 17:23:31 +01:00
public synchronized void close ( ) {
2007-08-03 14:21:46 +02:00
while ( domainStacksNotEmpty ( ) ) flushOnceDomStacks ( 0 , true ) ; // flush to ram, because the ram flush is optimized
2007-02-21 17:23:31 +01:00
try { flushAllRamStack ( ) ; } catch ( IOException e ) { }
2007-03-16 14:25:56 +01:00
if ( urlFileIndex ! = null ) {
urlFileIndex . close ( ) ;
urlFileIndex = null ;
}
if ( urlFileStack ! = null ) {
urlFileStack . close ( ) ;
urlFileStack = null ;
}
2005-09-25 03:09:21 +02:00
}
2007-02-21 17:23:31 +01:00
public void finalize ( ) {
2007-03-16 14:25:56 +01:00
if ( urlFileStack ! = null ) close ( ) ;
2007-02-21 17:23:31 +01:00
}
public synchronized void clear ( ) {
2007-03-16 14:25:56 +01:00
urlFileStack = kelondroStack . reset ( urlFileStack ) ;
2007-02-21 17:23:31 +01:00
domainStacks . clear ( ) ;
2007-03-16 14:25:56 +01:00
urlRAMStack . clear ( ) ;
resetFileIndex ( ) ;
2007-02-21 17:23:31 +01:00
}
2007-03-16 14:25:56 +01:00
private void openFileIndex ( ) {
cacheStacksPath . mkdirs ( ) ;
2008-01-20 22:42:35 +01:00
urlFileIndex = new kelondroEcoTable ( new File ( cacheStacksPath , stackname + indexSuffix ) , plasmaCrawlEntry . rowdef , ( fullram ) ? kelondroEcoTable . tailCacheUsageAuto : kelondroEcoTable . tailCacheDenyUsage , EcoFSBufferSize , 0 ) ;
2007-02-21 17:23:31 +01:00
}
2007-03-16 14:25:56 +01:00
private void resetFileIndex ( ) {
if ( urlFileIndex ! = null ) {
urlFileIndex . close ( ) ;
urlFileIndex = null ;
2008-01-17 22:48:08 +01:00
new File ( cacheStacksPath , stackname + indexSuffix ) . delete ( ) ;
2007-03-16 14:25:56 +01:00
}
openFileIndex ( ) ;
2005-10-05 02:52:55 +02:00
}
2007-03-16 14:25:56 +01:00
public synchronized plasmaCrawlEntry get ( String urlhash ) throws IOException {
kelondroRow . Entry entry = urlFileIndex . get ( urlhash . getBytes ( ) ) ;
if ( entry = = null ) return null ;
return new plasmaCrawlEntry ( entry ) ;
}
2007-09-28 03:21:31 +02:00
public synchronized int removeAllByProfileHandle ( String profileHandle ) throws IOException {
// removes all entries with a specific profile hash.
// this may last some time
// returns number of deletions
// first find a list of url hashes that shall be deleted
2008-01-07 23:36:48 +01:00
Iterator < kelondroRow . Entry > i = urlFileIndex . rows ( true , null ) ;
ArrayList < String > urlHashes = new ArrayList < String > ( ) ;
2007-09-28 03:21:31 +02:00
kelondroRow . Entry rowEntry ;
plasmaCrawlEntry crawlEntry ;
while ( i . hasNext ( ) ) {
rowEntry = ( kelondroRow . Entry ) i . next ( ) ;
crawlEntry = new plasmaCrawlEntry ( rowEntry ) ;
if ( crawlEntry . profileHandle ( ) . equals ( profileHandle ) ) {
urlHashes . add ( crawlEntry . url ( ) . hash ( ) ) ;
}
}
// then delete all these urls from the queues and the file index
2008-01-07 23:36:48 +01:00
Iterator < String > j = urlHashes . iterator ( ) ;
while ( j . hasNext ( ) ) this . remove ( j . next ( ) ) ;
2007-09-28 03:21:31 +02:00
return urlHashes . size ( ) ;
}
2007-03-16 14:25:56 +01:00
public synchronized plasmaCrawlEntry remove ( String urlhash ) throws IOException {
// this method is only here, because so many import/export methods need it
// and it was implemented in the previous architecture
2008-01-07 23:36:48 +01:00
// however, usage is not recommended
2007-04-17 17:15:47 +02:00
int s = urlFileIndex . size ( ) ;
2007-10-22 17:26:47 +02:00
kelondroRow . Entry entry = urlFileIndex . remove ( urlhash . getBytes ( ) , false ) ;
2007-03-16 14:25:56 +01:00
if ( entry = = null ) return null ;
2007-04-17 17:15:47 +02:00
assert urlFileIndex . size ( ) + 1 = = s : " urlFileIndex.size() = " + urlFileIndex . size ( ) + " , s = " + s ;
2007-03-16 14:25:56 +01:00
// now delete that thing also from the queues
2007-04-17 17:15:47 +02:00
2007-03-16 14:25:56 +01:00
// iterate through the RAM stack
2008-01-07 23:36:48 +01:00
Iterator < String > i = urlRAMStack . iterator ( ) ;
2007-03-16 14:25:56 +01:00
String h ;
while ( i . hasNext ( ) ) {
h = ( String ) i . next ( ) ;
if ( h . equals ( urlhash ) ) {
i . remove ( ) ;
2007-04-17 17:15:47 +02:00
return new plasmaCrawlEntry ( entry ) ;
2007-03-16 14:25:56 +01:00
}
}
2007-04-17 17:15:47 +02:00
// iterate through the file stack
// in general this is a bad idea. But this can only be avoided by avoidance of this method
2008-01-07 23:36:48 +01:00
Iterator < kelondroRow . Entry > j = urlFileStack . stackIterator ( true ) ;
while ( j . hasNext ( ) ) {
h = new String ( j . next ( ) . getColBytes ( 0 ) ) ;
2007-04-17 17:15:47 +02:00
if ( h . equals ( urlhash ) ) {
2008-01-07 23:36:48 +01:00
j . remove ( ) ;
2007-04-17 17:15:47 +02:00
return new plasmaCrawlEntry ( entry ) ;
}
}
2007-03-16 14:25:56 +01:00
2007-08-03 13:44:58 +02:00
if ( kelondroAbstractRecords . debugmode ) {
2007-04-17 17:15:47 +02:00
serverLog . logWarning ( " PLASMA BALANCER " , " remove: not found urlhash " + urlhash + " in " + stackname ) ;
}
2007-03-16 14:25:56 +01:00
return new plasmaCrawlEntry ( entry ) ;
}
2007-04-17 17:15:47 +02:00
public synchronized boolean has ( String urlhash ) {
2007-03-16 14:25:56 +01:00
try {
return urlFileIndex . has ( urlhash . getBytes ( ) ) ;
} catch ( IOException e ) {
e . printStackTrace ( ) ;
return false ;
}
2005-09-25 03:09:21 +02:00
}
2007-08-03 14:21:46 +02:00
public boolean notEmpty ( ) {
// alternative method to the property size() > 0
// this is better because it may avoid synchronized access to domain stack summarization
return urlRAMStack . size ( ) > 0 | | urlFileStack . size ( ) > 0 | | domainStacksNotEmpty ( ) ;
}
2007-06-29 00:06:33 +02:00
public int size ( ) {
2007-03-16 14:25:56 +01:00
int componentsize = urlFileStack . size ( ) + urlRAMStack . size ( ) + sizeDomainStacks ( ) ;
2007-04-19 15:37:02 +02:00
if ( componentsize ! = urlFileIndex . size ( ) ) {
// here is urlIndexFile.size() always smaller. why?
2007-08-03 13:44:58 +02:00
if ( kelondroAbstractRecords . debugmode ) {
2007-04-19 15:37:02 +02:00
serverLog . logWarning ( " PLASMA BALANCER " , " size operation wrong in " + stackname + " - componentsize = " + componentsize + " , urlFileIndex.size() = " + urlFileIndex . size ( ) ) ;
}
if ( ( componentsize = = 0 ) & & ( urlFileIndex . size ( ) > 0 ) ) {
resetFileIndex ( ) ;
}
}
2007-02-21 17:23:31 +01:00
return componentsize ;
2005-09-25 03:09:21 +02:00
}
2007-08-03 14:21:46 +02:00
private boolean domainStacksNotEmpty ( ) {
if ( domainStacks = = null ) return false ;
synchronized ( domainStacks ) {
2008-01-07 23:36:48 +01:00
Iterator < LinkedList < String > > i = domainStacks . values ( ) . iterator ( ) ;
2007-08-03 14:21:46 +02:00
while ( i . hasNext ( ) ) {
2008-01-07 23:36:48 +01:00
if ( i . next ( ) . size ( ) > 0 ) return true ;
2007-08-03 14:21:46 +02:00
}
}
return false ;
}
2005-09-25 03:09:21 +02:00
private int sizeDomainStacks ( ) {
2006-07-13 03:31:00 +02:00
if ( domainStacks = = null ) return 0 ;
2005-09-25 03:09:21 +02:00
int sum = 0 ;
2007-08-03 13:44:58 +02:00
synchronized ( domainStacks ) {
2008-01-07 23:36:48 +01:00
Iterator < LinkedList < String > > i = domainStacks . values ( ) . iterator ( ) ;
while ( i . hasNext ( ) ) sum + = i . next ( ) . size ( ) ;
2007-08-03 13:44:58 +02:00
}
2005-09-25 03:09:21 +02:00
return sum ;
}
2007-03-16 17:54:54 +01:00
private void flushOnceDomStacks ( int minimumleft , boolean ram ) {
// takes one entry from every domain stack and puts it on the ram or file stack
// the minimumleft value is a limit for the number of entries that should be left
2007-02-21 17:23:31 +01:00
if ( domainStacks . size ( ) = = 0 ) return ;
2007-08-03 14:21:46 +02:00
synchronized ( domainStacks ) {
2008-01-07 23:36:48 +01:00
Iterator < Map . Entry < String , LinkedList < String > > > i = domainStacks . entrySet ( ) . iterator ( ) ;
Map . Entry < String , LinkedList < String > > entry ;
LinkedList < String > list ;
2007-08-03 14:21:46 +02:00
while ( i . hasNext ( ) ) {
2008-01-07 23:36:48 +01:00
entry = i . next ( ) ;
list = entry . getValue ( ) ;
2007-08-03 14:21:46 +02:00
if ( list . size ( ) > minimumleft ) {
if ( ram ) {
urlRAMStack . add ( list . removeFirst ( ) ) ;
} else try {
urlFileStack . push ( urlFileStack . row ( ) . newEntry ( new byte [ ] [ ] { ( ( String ) list . removeFirst ( ) ) . getBytes ( ) } ) ) ;
} catch ( IOException e ) {
e . printStackTrace ( ) ;
}
2007-02-20 09:35:51 +01:00
}
2007-08-03 14:21:46 +02:00
if ( list . size ( ) = = 0 ) i . remove ( ) ;
2005-09-25 03:09:21 +02:00
}
}
}
2007-02-21 17:23:31 +01:00
private void flushAllRamStack ( ) throws IOException {
// this flushes only the ramStack to the fileStack, but does not flush the domainStacks
2007-03-16 14:25:56 +01:00
for ( int i = 0 ; i < urlRAMStack . size ( ) / 2 ; i + + ) {
urlFileStack . push ( urlFileStack . row ( ) . newEntry ( new byte [ ] [ ] { ( ( String ) urlRAMStack . get ( i ) ) . getBytes ( ) } ) ) ;
urlFileStack . push ( urlFileStack . row ( ) . newEntry ( new byte [ ] [ ] { ( ( String ) urlRAMStack . get ( urlRAMStack . size ( ) - i - 1 ) ) . getBytes ( ) } ) ) ;
2007-02-21 17:23:31 +01:00
}
2007-03-16 14:25:56 +01:00
if ( urlRAMStack . size ( ) % 2 = = 1 )
urlFileStack . push ( urlFileStack . row ( ) . newEntry ( new byte [ ] [ ] { ( ( String ) urlRAMStack . get ( urlRAMStack . size ( ) / 2 ) ) . getBytes ( ) } ) ) ;
2005-09-25 03:09:21 +02:00
}
2007-03-16 14:25:56 +01:00
public synchronized void push ( plasmaCrawlEntry entry ) throws IOException {
assert entry ! = null ;
2007-09-05 11:01:35 +02:00
if ( urlFileIndex . has ( entry . url ( ) . hash ( ) . getBytes ( ) ) ) {
serverLog . logWarning ( " PLASMA BALANCER " , " double-check has failed for urlhash " + entry . url ( ) . hash ( ) + " in " + stackname + " - fixed " ) ;
2007-02-21 17:23:31 +01:00
return ;
}
2007-03-16 14:25:56 +01:00
// extend domain stack
2007-09-05 11:01:35 +02:00
String dom = entry . url ( ) . hash ( ) . substring ( 6 ) ;
2008-01-07 23:36:48 +01:00
LinkedList < String > domainList = domainStacks . get ( dom ) ;
2007-02-21 17:23:31 +01:00
if ( domainList = = null ) {
// create new list
2008-01-07 23:36:48 +01:00
domainList = new LinkedList < String > ( ) ;
2007-08-03 14:21:46 +02:00
synchronized ( domainStacks ) {
2007-09-05 11:01:35 +02:00
domainList . add ( entry . url ( ) . hash ( ) ) ;
2007-08-03 14:21:46 +02:00
domainStacks . put ( dom , domainList ) ;
}
2007-02-21 17:23:31 +01:00
} else {
// extend existent domain list
2007-09-05 11:01:35 +02:00
domainList . addLast ( entry . url ( ) . hash ( ) ) ;
2005-09-25 03:09:21 +02:00
}
2007-02-26 12:54:43 +01:00
// add to index
2007-03-16 14:25:56 +01:00
urlFileIndex . put ( entry . toRow ( ) ) ;
2007-02-26 12:54:43 +01:00
2005-09-25 03:09:21 +02:00
// check size of domainStacks and flush
2007-02-21 17:23:31 +01:00
if ( ( domainStacks . size ( ) > 20 ) | | ( sizeDomainStacks ( ) > 1000 ) ) {
2007-03-16 17:54:54 +01:00
flushOnceDomStacks ( 1 , urlRAMStack . size ( ) < 100 ) ; // when the ram stack is small, flush it there
2005-09-25 03:09:21 +02:00
}
}
2007-08-16 01:18:12 +02:00
public synchronized plasmaCrawlEntry pop ( long minimumLocalDelta , long minimumGlobalDelta , long maximumAge ) throws IOException {
2007-02-09 11:32:58 +01:00
// returns an url-hash from the stack and ensures minimum delta times
2007-02-21 17:23:31 +01:00
// we have 3 sources to choose from: the ramStack, the domainStacks and the fileStack
String result = null ; // the result
// 1st: check ramStack
2007-03-16 14:25:56 +01:00
if ( urlRAMStack . size ( ) > 0 ) {
result = ( String ) urlRAMStack . remove ( 0 ) ;
2007-02-21 17:23:31 +01:00
}
// 2nd-a: check domainStacks for latest arrivals
2007-08-03 14:21:46 +02:00
if ( ( result = = null ) & & ( domainStacks . size ( ) > 0 ) ) synchronized ( domainStacks ) {
2007-02-21 17:23:31 +01:00
// we select specific domains that have not been used for a long time
// i.e. 60 seconds. Latest arrivals that have not yet been crawled
// fit also in that scheme
2008-01-07 23:36:48 +01:00
Iterator < Map . Entry < String , LinkedList < String > > > i = domainStacks . entrySet ( ) . iterator ( ) ;
Map . Entry < String , LinkedList < String > > entry ;
2007-02-21 17:23:31 +01:00
String domhash ;
long delta , maxdelta = 0 ;
String maxhash = null ;
2008-01-07 23:36:48 +01:00
LinkedList < String > domlist ;
2007-02-21 17:23:31 +01:00
while ( i . hasNext ( ) ) {
2008-01-07 23:36:48 +01:00
entry = i . next ( ) ;
2007-02-21 17:23:31 +01:00
domhash = ( String ) entry . getKey ( ) ;
delta = lastAccessDelta ( domhash ) ;
if ( delta = = Integer . MAX_VALUE ) {
// a brand new domain - we take it
2008-01-07 23:36:48 +01:00
domlist = entry . getValue ( ) ;
2007-02-21 17:23:31 +01:00
result = ( String ) domlist . removeFirst ( ) ;
if ( domlist . size ( ) = = 0 ) i . remove ( ) ;
break ;
}
if ( delta > maxdelta ) {
maxdelta = delta ;
maxhash = domhash ;
}
}
if ( maxdelta > maximumAge ) {
// success - we found an entry from a domain that has not been used for a long time
2008-01-07 23:36:48 +01:00
domlist = domainStacks . get ( maxhash ) ;
2007-02-21 17:23:31 +01:00
result = ( String ) domlist . removeFirst ( ) ;
if ( domlist . size ( ) = = 0 ) domainStacks . remove ( maxhash ) ;
}
}
// 2nd-b: check domainStacks for best match between stack size and retrieval time
2007-08-03 14:21:46 +02:00
if ( ( result = = null ) & & ( domainStacks . size ( ) > 0 ) ) synchronized ( domainStacks ) {
2007-02-21 17:23:31 +01:00
// we order all domains by the number of entries per domain
// then we iterate through these domains in descending entry order
// and that that one, that has a delta > minimumDelta
2008-01-07 23:36:48 +01:00
Iterator < Map . Entry < String , LinkedList < String > > > i = domainStacks . entrySet ( ) . iterator ( ) ;
Map . Entry < String , LinkedList < String > > entry ;
2007-02-21 17:23:31 +01:00
String domhash ;
2008-01-07 23:36:48 +01:00
LinkedList < String > domlist ;
TreeMap < Integer , String > hitlist = new TreeMap < Integer , String > ( ) ;
2007-02-21 17:23:31 +01:00
int count = 0 ;
// first collect information about sizes of the domain lists
while ( i . hasNext ( ) ) {
2008-01-07 23:36:48 +01:00
entry = i . next ( ) ;
domhash = entry . getKey ( ) ;
domlist = entry . getValue ( ) ;
2007-02-21 17:23:31 +01:00
hitlist . put ( new Integer ( domlist . size ( ) * 100 + count + + ) , domhash ) ;
}
2007-02-09 11:32:58 +01:00
2007-02-21 17:23:31 +01:00
// now iterate in descending order an fetch that one,
// that is acceptable by the minimumDelta constraint
long delta ;
String maxhash = null ;
while ( hitlist . size ( ) > 0 ) {
domhash = ( String ) hitlist . remove ( hitlist . lastKey ( ) ) ;
if ( maxhash = = null ) maxhash = domhash ; // remember first entry
delta = lastAccessDelta ( domhash ) ;
2007-08-16 01:18:12 +02:00
if ( delta > minimumGlobalDelta ) {
2008-01-07 23:36:48 +01:00
domlist = domainStacks . get ( domhash ) ;
2007-02-21 17:23:31 +01:00
result = ( String ) domlist . removeFirst ( ) ;
if ( domlist . size ( ) = = 0 ) domainStacks . remove ( domhash ) ;
break ;
}
2007-02-09 10:48:23 +01:00
}
2007-02-09 11:32:58 +01:00
2007-02-21 17:23:31 +01:00
// if we did yet not choose any entry, we simply take that one with the most entries
if ( ( result = = null ) & & ( maxhash ! = null ) ) {
2008-01-07 23:36:48 +01:00
domlist = domainStacks . get ( maxhash ) ;
2007-02-21 17:23:31 +01:00
result = ( String ) domlist . removeFirst ( ) ;
if ( domlist . size ( ) = = 0 ) domainStacks . remove ( maxhash ) ;
2006-07-13 03:31:00 +02:00
}
}
2005-09-25 03:09:21 +02:00
2007-02-21 17:23:31 +01:00
// 3rd: take entry from file
2007-03-16 14:25:56 +01:00
if ( ( result = = null ) & & ( urlFileStack . size ( ) > 0 ) ) {
2007-03-16 17:54:54 +01:00
kelondroRow . Entry nextentry = ( top ) ? urlFileStack . top ( ) : urlFileStack . bot ( ) ;
if ( nextentry = = null ) {
2007-03-07 10:08:13 +01:00
// emergency case: this means that something with the stack organization is wrong
// the file appears to be broken. We kill the file.
2007-03-16 14:25:56 +01:00
kelondroStack . reset ( urlFileStack ) ;
2007-03-07 10:08:13 +01:00
serverLog . logSevere ( " PLASMA BALANCER " , " get() failed to fetch entry from file stack. reset stack file. " ) ;
} else {
2007-03-16 17:54:54 +01:00
String nexthash = new String ( nextentry . getColBytes ( 0 ) ) ;
2007-02-21 17:23:31 +01:00
// check if the time after retrieval of last hash from same
// domain is not shorter than the minimumDelta
2007-03-16 17:54:54 +01:00
long delta = lastAccessDelta ( nexthash ) ;
2007-08-16 01:18:12 +02:00
if ( delta > minimumGlobalDelta ) {
2007-03-16 17:54:54 +01:00
// the entry is fine
result = new String ( ( top ) ? urlFileStack . pop ( ) . getColBytes ( 0 ) : urlFileStack . pot ( ) . getColBytes ( 0 ) ) ;
2007-02-21 17:23:31 +01:00
} else {
2007-03-16 17:54:54 +01:00
// try other entry
result = new String ( ( top ) ? urlFileStack . pot ( ) . getColBytes ( 0 ) : urlFileStack . pop ( ) . getColBytes ( 0 ) ) ;
2007-02-21 17:23:31 +01:00
delta = lastAccessDelta ( result ) ;
}
}
2007-03-16 17:54:54 +01:00
top = ! top ; // alternate top/bottom
2005-09-25 03:09:21 +02:00
}
2007-02-21 17:23:31 +01:00
// check case where we did not found anything
if ( result = = null ) {
2007-03-16 14:25:56 +01:00
serverLog . logSevere ( " PLASMA BALANCER " , " get() was not able to find a valid urlhash - total size = " + size ( ) + " , fileStack.size() = " + urlFileStack . size ( ) + " , ramStack.size() = " + urlRAMStack . size ( ) + " , domainStacks.size() = " + domainStacks . size ( ) ) ;
2007-02-21 17:23:31 +01:00
return null ;
2005-09-25 03:09:21 +02:00
}
2007-02-21 17:23:31 +01:00
// finally: check minimumDelta and if necessary force a sleep
long delta = lastAccessDelta ( result ) ;
2007-04-30 02:39:53 +02:00
assert delta > = 0 : " delta = " + delta ;
2007-06-06 09:53:56 +02:00
int s = urlFileIndex . size ( ) ;
2007-10-22 17:26:47 +02:00
kelondroRow . Entry rowEntry = urlFileIndex . remove ( result . getBytes ( ) , false ) ;
2007-06-06 09:53:56 +02:00
assert urlFileIndex . size ( ) + 1 = = s : " urlFileIndex.size() = " + urlFileIndex . size ( ) + " , s = " + s + " , result = " + result ;
if ( rowEntry = = null ) {
serverLog . logSevere ( " PLASMA BALANCER " , " get() found a valid urlhash, but failed to fetch the corresponding url entry - total size = " + size ( ) + " , fileStack.size() = " + urlFileStack . size ( ) + " , ramStack.size() = " + urlRAMStack . size ( ) + " , domainStacks.size() = " + domainStacks . size ( ) ) ;
return null ;
}
plasmaCrawlEntry crawlEntry = new plasmaCrawlEntry ( rowEntry ) ;
2007-09-05 11:01:35 +02:00
long minimumDelta = ( crawlEntry . url ( ) . isLocal ( ) ) ? minimumLocalDelta : minimumGlobalDelta ;
2007-06-06 09:53:56 +02:00
plasmaCrawlRobotsTxt . Entry robotsEntry = plasmaSwitchboard . robots . getEntry ( crawlEntry . url ( ) . getHost ( ) ) ;
Integer hostDelay = ( robotsEntry = = null ) ? null : robotsEntry . getCrawlDelay ( ) ;
long genericDelta = ( ( robotsEntry = = null ) | | ( hostDelay = = null ) ) ? minimumDelta : Math . max ( minimumDelta , hostDelay . intValue ( ) * 1000 ) ;
genericDelta = Math . min ( 10000 , genericDelta ) ; // prevent that ta robots file can stop our indexer completely
if ( delta < genericDelta ) {
2007-02-21 17:23:31 +01:00
// force a busy waiting here
// in best case, this should never happen if the balancer works propertly
// this is only to protect against the worst case, where the crawler could
// behave in a DoS-manner
2007-06-06 09:53:56 +02:00
long sleeptime = genericDelta - delta ;
2007-02-21 17:23:31 +01:00
try { synchronized ( this ) { this . wait ( sleeptime ) ; } } catch ( InterruptedException e ) { }
2005-09-25 03:09:21 +02:00
}
2007-02-21 17:23:31 +01:00
// update statistical data
2008-01-07 23:36:48 +01:00
domaccess lastAccess = domainAccess . get ( result . substring ( 6 ) ) ;
if ( lastAccess = = null ) lastAccess = new domaccess ( ) ; else lastAccess . update ( ) ;
domainAccess . put ( result . substring ( 6 ) , lastAccess ) ;
2007-06-06 09:53:56 +02:00
return crawlEntry ;
2007-02-21 17:23:31 +01:00
}
private long lastAccessDelta ( String hash ) {
assert hash ! = null ;
2008-01-07 23:36:48 +01:00
domaccess lastAccess = domainAccess . get ( ( hash . length ( ) > 6 ) ? hash . substring ( 6 ) : hash ) ;
2007-02-21 17:23:31 +01:00
if ( lastAccess = = null ) return Long . MAX_VALUE ; // never accessed
2008-01-07 23:36:48 +01:00
return System . currentTimeMillis ( ) - lastAccess . time ( ) ;
2007-02-21 17:23:31 +01:00
}
2007-03-16 14:25:56 +01:00
public synchronized plasmaCrawlEntry top ( int dist ) throws IOException {
2007-03-16 16:16:26 +01:00
// if we need to flush anything, then flush the domain stack first,
// to avoid that new urls get hidden by old entries from the file stack
2007-09-09 19:31:29 +02:00
if ( urlRAMStack = = null ) return null ;
2007-08-03 14:21:46 +02:00
while ( ( domainStacksNotEmpty ( ) ) & & ( urlRAMStack . size ( ) < = dist ) ) {
2007-03-16 16:16:26 +01:00
// flush only that much as we need to display
2007-03-16 17:54:54 +01:00
flushOnceDomStacks ( 0 , true ) ;
2007-03-16 16:16:26 +01:00
}
2007-09-09 19:31:29 +02:00
while ( ( urlFileStack ! = null ) & & ( urlRAMStack . size ( ) < = dist ) & & ( urlFileStack . size ( ) > 0 ) ) {
2007-03-16 16:16:26 +01:00
// flush some entries from disc to ram stack
2007-02-21 17:23:31 +01:00
try {
2007-08-27 00:06:58 +02:00
kelondroRow . Entry t = urlFileStack . pop ( ) ;
if ( t = = null ) break ;
urlRAMStack . add ( new String ( t . getColBytes ( 0 ) ) ) ;
2007-03-16 16:16:26 +01:00
} catch ( IOException e ) {
break ;
}
2005-09-25 03:09:21 +02:00
}
2007-03-16 14:25:56 +01:00
if ( dist > = urlRAMStack . size ( ) ) return null ;
String urlhash = ( String ) urlRAMStack . get ( dist ) ;
kelondroRow . Entry entry = urlFileIndex . get ( urlhash . getBytes ( ) ) ;
2007-03-16 16:16:26 +01:00
if ( entry = = null ) {
2007-08-03 13:44:58 +02:00
if ( kelondroAbstractRecords . debugmode ) serverLog . logWarning ( " PLASMA BALANCER " , " no entry in index for urlhash " + urlhash ) ;
2007-03-16 16:16:26 +01:00
return null ;
}
2007-03-16 14:25:56 +01:00
return new plasmaCrawlEntry ( entry ) ;
}
2008-01-07 23:36:48 +01:00
public synchronized Iterator < plasmaCrawlEntry > iterator ( ) throws IOException {
2007-03-16 14:25:56 +01:00
return new EntryIterator ( ) ;
}
2008-01-07 23:36:48 +01:00
private class EntryIterator implements Iterator < plasmaCrawlEntry > {
2007-03-16 14:25:56 +01:00
2008-01-07 23:36:48 +01:00
private Iterator < kelondroRow . Entry > rowIterator ;
2007-03-16 14:25:56 +01:00
public EntryIterator ( ) throws IOException {
rowIterator = urlFileIndex . rows ( true , null ) ;
}
public boolean hasNext ( ) {
return ( rowIterator = = null ) ? false : rowIterator . hasNext ( ) ;
}
2008-01-07 23:36:48 +01:00
public plasmaCrawlEntry next ( ) {
2007-03-16 14:25:56 +01:00
kelondroRow . Entry entry = ( kelondroRow . Entry ) rowIterator . next ( ) ;
try {
return ( entry = = null ) ? null : new plasmaCrawlEntry ( entry ) ;
} catch ( IOException e ) {
rowIterator = null ;
return null ;
}
}
public void remove ( ) {
if ( rowIterator ! = null ) rowIterator . remove ( ) ;
}
2005-09-25 03:09:21 +02:00
}
}