2005-10-10 01:11:17 +02:00
// plasmaCrawlStacker.java
2005-10-09 17:59:09 +02:00
// -----------------------
// part of YaCy
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2005
//
// This file was contributed by Martin Thelian
2007-10-29 02:43:20 +01:00
// ([MC] removed all multithreading and thread pools, this is not necessary here; complete renovation 2007)
2005-10-09 17:59:09 +02:00
//
2005-11-04 14:41:51 +01:00
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
2005-10-09 17:59:09 +02:00
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this software or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follow this copyright notice here, but changes must not be
// done inside the copyright notice above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
2005-10-05 12:45:33 +02:00
package de.anomic.plasma ;
import java.io.File ;
import java.io.IOException ;
2007-10-29 02:43:20 +01:00
import java.net.UnknownHostException ;
import java.util.ArrayList ;
2005-10-05 12:45:33 +02:00
import java.util.Date ;
import java.util.Iterator ;
import java.util.LinkedList ;
2006-11-08 17:17:47 +01:00
import de.anomic.index.indexURLEntry ;
2006-10-26 15:50:50 +02:00
import de.anomic.kelondro.kelondroCache ;
2008-01-17 22:48:08 +01:00
import de.anomic.kelondro.kelondroEcoTable ;
2005-11-07 13:19:05 +01:00
import de.anomic.kelondro.kelondroException ;
2006-08-25 00:21:22 +02:00
import de.anomic.kelondro.kelondroIndex ;
2006-06-02 14:45:57 +02:00
import de.anomic.kelondro.kelondroRow ;
2006-10-26 15:50:50 +02:00
import de.anomic.kelondro.kelondroRowSet ;
2005-10-05 12:45:33 +02:00
import de.anomic.kelondro.kelondroTree ;
2006-08-12 16:28:14 +02:00
import de.anomic.plasma.urlPattern.plasmaURLPattern ;
2007-07-24 02:46:17 +02:00
import de.anomic.server.serverDomains ;
2005-10-05 12:45:33 +02:00
import de.anomic.server.logging.serverLog ;
import de.anomic.yacy.yacyCore ;
2007-09-05 11:01:35 +02:00
import de.anomic.yacy.yacyURL ;
2005-10-05 12:45:33 +02:00
2007-10-29 02:43:20 +01:00
public final class plasmaCrawlStacker extends Thread {
2005-10-05 12:45:33 +02:00
2008-01-17 22:48:08 +01:00
private static final int EcoFSBufferSize = 20 ;
private static String stackfile = " urlNoticeStacker9.db " ;
2006-10-11 02:46:45 +02:00
// keys for different database types
public static final int QUEUE_DB_TYPE_RAM = 0 ;
public static final int QUEUE_DB_TYPE_TREE = 1 ;
2008-01-18 18:14:02 +01:00
public static final int QUEUE_DB_TYPE_ECO = 2 ;
2006-10-11 02:46:45 +02:00
2005-10-09 18:11:41 +02:00
final serverLog log = new serverLog ( " STACKCRAWL " ) ;
2007-10-29 02:43:20 +01:00
private plasmaSwitchboard sb ;
2008-01-17 13:12:52 +01:00
private final LinkedList < String > urlEntryHashCache ;
2007-10-29 02:43:20 +01:00
private kelondroIndex urlEntryCache ;
private File cacheStacksPath ;
private int dbtype ;
2007-10-31 12:32:40 +01:00
private boolean prequeue ;
2007-11-01 01:57:32 +01:00
private long dnsHit , dnsMiss ;
private int alternateCount ;
2007-10-31 12:32:40 +01:00
2007-10-29 02:43:20 +01:00
// objects for the prefetch task
2008-01-17 13:12:52 +01:00
private ArrayList < String > dnsfetchHosts = new ArrayList < String > ( ) ;
2005-10-05 12:45:33 +02:00
2008-02-19 10:14:07 +01:00
public plasmaCrawlStacker ( plasmaSwitchboard sb , File dbPath , int dbtype , boolean prequeue ) {
2005-10-05 12:45:33 +02:00
this . sb = sb ;
2007-10-31 12:32:40 +01:00
this . prequeue = prequeue ;
2007-11-01 01:57:32 +01:00
this . dnsHit = 0 ;
this . dnsMiss = 0 ;
this . alternateCount = 0 ;
2005-10-05 12:45:33 +02:00
2007-10-29 02:43:20 +01:00
// init the message list
2008-01-17 13:12:52 +01:00
this . urlEntryHashCache = new LinkedList < String > ( ) ;
2005-10-09 18:11:41 +02:00
2007-10-29 02:43:20 +01:00
// create a stack for newly entered entries
this . cacheStacksPath = dbPath ;
this . dbtype = dbtype ;
2005-11-15 13:46:22 +01:00
2007-10-29 02:43:20 +01:00
openDB ( ) ;
try {
// loop through the list and fill the messageList with url hashs
2008-01-17 13:12:52 +01:00
Iterator < kelondroRow . Entry > rows = this . urlEntryCache . rows ( true , null ) ;
2007-10-29 02:43:20 +01:00
kelondroRow . Entry entry ;
while ( rows . hasNext ( ) ) {
entry = ( kelondroRow . Entry ) rows . next ( ) ;
if ( entry = = null ) {
System . out . println ( " ERROR! null element found " ) ;
continue ;
}
this . urlEntryHashCache . add ( entry . getColString ( 0 , null ) ) ;
}
} catch ( kelondroException e ) {
/* if we have an error, we start with a fresh database */
plasmaCrawlStacker . this . log . logSevere ( " Unable to initialize crawl stacker queue, kelondroException: " + e . getMessage ( ) + " . Reseting DB. \ n " , e ) ;
2005-11-15 13:46:22 +01:00
2007-10-29 02:43:20 +01:00
// deleting old db and creating a new db
try { this . urlEntryCache . close ( ) ; } catch ( Exception ex ) { }
deleteDB ( ) ;
openDB ( ) ;
} catch ( IOException e ) {
/* if we have an error, we start with a fresh database */
plasmaCrawlStacker . this . log . logSevere ( " Unable to initialize crawl stacker queue, IOException: " + e . getMessage ( ) + " . Reseting DB. \ n " , e ) ;
2005-11-15 13:46:22 +01:00
2007-10-29 02:43:20 +01:00
// deleting old db and creating a new db
try { this . urlEntryCache . close ( ) ; } catch ( Exception ex ) { }
deleteDB ( ) ;
openDB ( ) ;
}
this . log . logInfo ( size ( ) + " entries in the stackCrawl queue. " ) ;
this . start ( ) ; // start the prefetcher thread
this . log . logInfo ( " STACKCRAWL thread initialized. " ) ;
}
2005-11-15 13:46:22 +01:00
2007-10-29 02:43:20 +01:00
public void run ( ) {
String nextHost ;
try {
while ( ! Thread . currentThread ( ) . isInterrupted ( ) ) { // action loop
if ( dnsfetchHosts . size ( ) = = 0 ) synchronized ( this ) { wait ( ) ; }
synchronized ( dnsfetchHosts ) {
nextHost = ( String ) dnsfetchHosts . remove ( dnsfetchHosts . size ( ) - 1 ) ;
}
try {
serverDomains . dnsResolve ( nextHost ) ;
} catch ( Exception e ) { }
}
} catch ( InterruptedException e ) { }
}
2007-11-01 01:57:32 +01:00
public boolean prefetchHost ( String host ) {
// returns true when the host was known in the dns cache.
// If not, the host is stacked on the fetch stack and false is returned
2007-10-29 02:43:20 +01:00
try {
serverDomains . dnsResolveFromCache ( host ) ;
2007-11-01 01:57:32 +01:00
return true ;
2007-10-29 02:43:20 +01:00
} catch ( UnknownHostException e ) {
synchronized ( this ) {
dnsfetchHosts . add ( host ) ;
notifyAll ( ) ;
}
2007-11-01 01:57:32 +01:00
return false ;
2007-10-29 02:43:20 +01:00
}
2005-10-05 12:45:33 +02:00
}
2007-10-29 02:43:20 +01:00
public void terminateDNSPrefetcher ( ) {
synchronized ( this ) {
interrupt ( ) ;
}
2006-10-17 23:01:35 +02:00
}
2005-10-09 18:40:44 +02:00
public void close ( ) {
2007-10-31 12:32:40 +01:00
if ( this . dbtype = = QUEUE_DB_TYPE_RAM ) {
2007-10-29 02:43:20 +01:00
this . log . logFine ( " Shutdown. Flushing remaining " + size ( ) + " crawl stacker job entries. please wait. " ) ;
while ( size ( ) > 0 ) {
if ( ! job ( ) ) break ;
}
2005-10-09 18:40:44 +02:00
}
2007-10-29 02:43:20 +01:00
terminateDNSPrefetcher ( ) ;
2005-10-09 18:40:44 +02:00
2007-06-26 16:37:10 +02:00
this . log . logFine ( " Shutdown. Closing stackCrawl queue. " ) ;
2007-10-29 02:43:20 +01:00
// closing the db
this . urlEntryCache . close ( ) ;
// clearing the hash list
this . urlEntryHashCache . clear ( ) ;
2005-10-05 12:45:33 +02:00
}
2007-10-29 02:43:20 +01:00
public boolean job ( ) {
plasmaCrawlEntry entry ;
2005-10-09 17:59:09 +02:00
try {
2007-10-29 02:43:20 +01:00
entry = dequeueEntry ( ) ;
} catch ( IOException e ) {
e . printStackTrace ( ) ;
return false ;
}
if ( entry = = null ) return false ;
try {
String rejectReason = sb . crawlStacker . stackCrawl ( entry ) ;
// if the url was rejected we store it into the error URL db
if ( rejectReason ! = null ) {
plasmaCrawlZURL . Entry ee = sb . crawlQueues . errorURL . newEntry ( entry , yacyCore . seedDB . mySeed ( ) . hash , null , 0 , rejectReason ) ;
ee . store ( ) ;
sb . crawlQueues . errorURL . push ( ee ) ;
2006-11-30 00:09:56 +01:00
}
2006-01-29 09:54:19 +01:00
} catch ( Exception e ) {
2007-10-29 02:43:20 +01:00
plasmaCrawlStacker . this . log . logWarning ( " Error while processing stackCrawl entry. \ n " + " Entry: " + entry . toString ( ) + " Error: " + e . toString ( ) , e ) ;
return false ;
2005-10-05 12:45:33 +02:00
}
2007-10-29 02:43:20 +01:00
return true ;
2005-10-05 12:45:33 +02:00
}
2007-10-29 02:43:20 +01:00
public void enqueueEntry (
2007-09-05 11:01:35 +02:00
yacyURL nexturl ,
2007-03-16 14:25:56 +01:00
String referrerhash ,
2005-10-05 12:45:33 +02:00
String initiatorHash ,
String name ,
Date loadDate ,
int currentdepth ,
2007-11-01 01:57:32 +01:00
plasmaCrawlProfile . entry profile ) {
2007-10-29 02:43:20 +01:00
if ( profile = = null ) return ;
plasmaCrawlEntry newEntry = new plasmaCrawlEntry (
2005-10-05 12:45:33 +02:00
initiatorHash ,
2007-03-16 14:25:56 +01:00
nexturl ,
referrerhash ,
2005-10-05 12:45:33 +02:00
name ,
loadDate ,
profile . handle ( ) ,
currentdepth ,
0 ,
0
2007-10-29 02:43:20 +01:00
) ;
if ( newEntry = = null ) return ;
synchronized ( this . urlEntryHashCache ) {
kelondroRow . Entry oldValue ;
2007-11-01 01:57:32 +01:00
boolean hostknown = true ;
if ( prequeue ) hostknown = prefetchHost ( nexturl . getHost ( ) ) ;
2007-10-29 02:43:20 +01:00
try {
oldValue = this . urlEntryCache . put ( newEntry . toRow ( ) ) ;
} catch ( IOException e ) {
oldValue = null ;
}
if ( oldValue = = null ) {
2007-11-01 01:57:32 +01:00
//System.out.println("*** debug crawlStacker dnsHit=" + this.dnsHit + ", dnsMiss=" + this.dnsMiss + ", alternateCount=" + this.alternateCount + ((this.dnsMiss > 0) ? (", Q=" + (this.dnsHit / this.dnsMiss)) : ""));
if ( hostknown ) {
this . alternateCount + + ;
2007-10-29 02:43:20 +01:00
this . urlEntryHashCache . addFirst ( newEntry . url ( ) . hash ( ) ) ;
2007-11-01 01:57:32 +01:00
this . dnsHit + + ;
2007-10-29 02:43:20 +01:00
} else {
2007-11-01 01:57:32 +01:00
if ( ( this . dnsMiss > 0 ) & & ( this . alternateCount > 2 * this . dnsHit / this . dnsMiss ) ) {
this . urlEntryHashCache . addFirst ( newEntry . url ( ) . hash ( ) ) ;
this . alternateCount = 0 ;
//System.out.println("*** debug crawlStacker alternate switch, dnsHit=" + this.dnsHit + ", dnsMiss=" + this.dnsMiss + ", alternateCount=" + this.alternateCount + ", Q=" + (this.dnsHit / this.dnsMiss));
} else {
this . urlEntryHashCache . addLast ( newEntry . url ( ) . hash ( ) ) ;
}
this . dnsMiss + + ;
2007-10-29 02:43:20 +01:00
}
}
2005-10-05 12:45:33 +02:00
}
}
2007-10-29 02:43:20 +01:00
private void deleteDB ( ) {
if ( this . dbtype = = QUEUE_DB_TYPE_RAM ) {
// do nothing..
}
2008-01-18 18:14:02 +01:00
if ( this . dbtype = = QUEUE_DB_TYPE_ECO ) {
2008-01-17 22:48:08 +01:00
new File ( cacheStacksPath , stackfile ) . delete ( ) ;
//kelondroFlexWidthArray.delete(cacheStacksPath, stackfile);
2007-10-29 02:43:20 +01:00
}
if ( this . dbtype = = QUEUE_DB_TYPE_TREE ) {
2008-01-17 22:48:08 +01:00
File cacheFile = new File ( cacheStacksPath , stackfile ) ;
2007-10-29 02:43:20 +01:00
cacheFile . delete ( ) ;
2005-10-05 12:45:33 +02:00
}
2007-10-29 02:43:20 +01:00
}
private void openDB ( ) {
if ( ! ( cacheStacksPath . exists ( ) ) ) cacheStacksPath . mkdir ( ) ; // make the path
if ( this . dbtype = = QUEUE_DB_TYPE_RAM ) {
this . urlEntryCache = new kelondroRowSet ( plasmaCrawlEntry . rowdef , 0 ) ;
}
2008-01-18 18:14:02 +01:00
if ( this . dbtype = = QUEUE_DB_TYPE_ECO ) {
2007-10-29 02:43:20 +01:00
cacheStacksPath . mkdirs ( ) ;
2008-01-17 22:48:08 +01:00
File f = new File ( cacheStacksPath , stackfile ) ;
2007-10-29 02:43:20 +01:00
try {
2008-01-20 22:42:35 +01:00
this . urlEntryCache = new kelondroEcoTable ( f , plasmaCrawlEntry . rowdef , kelondroEcoTable . tailCacheUsageAuto , EcoFSBufferSize , 0 ) ;
2008-01-17 22:48:08 +01:00
//this.urlEntryCache = new kelondroCache(new kelondroFlexTable(cacheStacksPath, newCacheName, preloadTime, plasmaCrawlEntry.rowdef, 0, true));
2007-10-29 02:43:20 +01:00
} catch ( Exception e ) {
e . printStackTrace ( ) ;
// kill DB and try again
2008-01-17 22:48:08 +01:00
f . delete ( ) ;
//kelondroFlexTable.delete(cacheStacksPath, newCacheName);
2007-10-29 02:43:20 +01:00
try {
2008-01-20 22:42:35 +01:00
this . urlEntryCache = new kelondroEcoTable ( f , plasmaCrawlEntry . rowdef , kelondroEcoTable . tailCacheUsageAuto , EcoFSBufferSize , 0 ) ;
2008-01-17 22:48:08 +01:00
//this.urlEntryCache = new kelondroCache(new kelondroFlexTable(cacheStacksPath, newCacheName, preloadTime, plasmaCrawlEntry.rowdef, 0, true));
2007-10-29 02:43:20 +01:00
} catch ( Exception ee ) {
ee . printStackTrace ( ) ;
System . exit ( - 1 ) ;
}
}
}
if ( this . dbtype = = QUEUE_DB_TYPE_TREE ) {
2008-01-17 22:48:08 +01:00
File cacheFile = new File ( cacheStacksPath , stackfile ) ;
2007-10-29 02:43:20 +01:00
cacheFile . getParentFile ( ) . mkdirs ( ) ;
2008-02-19 10:14:07 +01:00
this . urlEntryCache = new kelondroCache ( kelondroTree . open ( cacheFile , true , 0 , plasmaCrawlEntry . rowdef ) ) ;
2007-10-29 02:43:20 +01:00
}
}
public int size ( ) {
synchronized ( this . urlEntryHashCache ) {
return this . urlEntryHashCache . size ( ) ;
}
}
public int getDBType ( ) {
return this . dbtype ;
}
public plasmaCrawlEntry dequeueEntry ( ) throws IOException {
if ( this . urlEntryHashCache . size ( ) = = 0 ) return null ;
String urlHash = null ;
kelondroRow . Entry entry = null ;
synchronized ( this . urlEntryHashCache ) {
urlHash = ( String ) this . urlEntryHashCache . removeFirst ( ) ;
if ( urlHash = = null ) throw new IOException ( " urlHash is null " ) ;
entry = this . urlEntryCache . remove ( urlHash . getBytes ( ) , false ) ;
}
if ( ( urlHash = = null ) | | ( entry = = null ) ) return null ;
return new plasmaCrawlEntry ( entry ) ;
2005-10-05 12:45:33 +02:00
}
2007-11-12 17:32:50 +01:00
public String stackCrawl ( yacyURL url , yacyURL referrer , String initiatorHash , String name , Date loadDate , int currentdepth , plasmaCrawlProfile . entry profile ) {
2007-10-29 02:43:20 +01:00
// stacks a crawl item. The position can also be remote
// returns null if successful, a reason string if not successful
//this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'");
// add the url into the crawling queue
plasmaCrawlEntry entry = new plasmaCrawlEntry (
initiatorHash , // initiator, needed for p2p-feedback
url , // url clear text string
2007-11-12 17:32:50 +01:00
( referrer = = null ) ? null : referrer . hash ( ) , // last url in crawling queue
2007-10-29 02:43:20 +01:00
name , // load date
loadDate , // the anchor name
( profile = = null ) ? null : profile . handle ( ) , // profile must not be null!
currentdepth , // depth so far
0 , // anchors, default value
0 // forkfactor, default value
) ;
return stackCrawl ( entry ) ;
2006-09-03 16:59:00 +02:00
}
2007-11-07 23:38:09 +01:00
public String stackCrawl ( plasmaCrawlEntry entry ) {
2005-10-05 12:45:33 +02:00
// stacks a crawl item. The position can also be remote
// returns null if successful, a reason string if not successful
2006-10-11 02:46:45 +02:00
//this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'");
2005-10-05 12:45:33 +02:00
2005-10-09 18:11:41 +02:00
long startTime = System . currentTimeMillis ( ) ;
2005-10-05 12:45:33 +02:00
String reason = null ; // failure reason
2006-09-04 13:46:17 +02:00
// check if the protocol is supported
2007-10-29 02:43:20 +01:00
String urlProtocol = entry . url ( ) . getProtocol ( ) ;
if ( ! sb . crawlQueues . isSupportedProtocol ( urlProtocol ) ) {
2006-09-04 13:46:17 +02:00
reason = plasmaCrawlEURL . DENIED_UNSUPPORTED_PROTOCOL ;
2007-10-29 02:43:20 +01:00
this . log . logSevere ( " Unsupported protocol in URL ' " + entry . url ( ) . toString ( ) + " '. " +
2006-09-04 13:46:17 +02:00
" Stack processing time: " + ( System . currentTimeMillis ( ) - startTime ) + " ms " ) ;
return reason ;
}
2007-10-29 02:43:20 +01:00
2005-10-05 12:45:33 +02:00
// check if ip is local ip address
2007-10-29 02:43:20 +01:00
if ( ! sb . acceptURL ( entry . url ( ) ) ) {
2007-07-24 02:46:17 +02:00
reason = plasmaCrawlEURL . DENIED_IP_ADDRESS_NOT_IN_DECLARED_DOMAIN + " [ " + sb . getConfig ( " network.unit.domain " , " unknown " ) + " ] " ;
2007-10-29 02:43:20 +01:00
this . log . logFine ( " Host in URL ' " + entry . url ( ) . toString ( ) + " ' has IP address outside of declared range ( " + sb . getConfig ( " network.unit.domain " , " unknown " ) + " ). " +
2006-05-15 17:42:06 +02:00
" Stack processing time: " + ( System . currentTimeMillis ( ) - startTime ) + " ms " ) ;
2005-11-07 11:57:54 +01:00
return reason ;
2005-10-05 12:45:33 +02:00
}
// check blacklist
2007-10-29 02:43:20 +01:00
if ( plasmaSwitchboard . urlBlacklist . isListed ( plasmaURLPattern . BLACKLIST_CRAWLER , entry . url ( ) ) ) {
2006-08-07 17:11:14 +02:00
reason = plasmaCrawlEURL . DENIED_URL_IN_BLACKLIST ;
2007-10-29 02:43:20 +01:00
this . log . logFine ( " URL ' " + entry . url ( ) . toString ( ) + " ' is in blacklist. " +
2006-05-15 17:42:06 +02:00
" Stack processing time: " + ( System . currentTimeMillis ( ) - startTime ) + " ms " ) ;
2005-10-05 12:45:33 +02:00
return reason ;
2007-10-29 02:43:20 +01:00
}
plasmaCrawlProfile . entry profile = sb . profilesActiveCrawls . getEntry ( entry . profileHandle ( ) ) ;
if ( profile = = null ) {
String errorMsg = " LOST PROFILE HANDLE ' " + entry . profileHandle ( ) + " ' for URL " + entry . url ( ) ;
log . logWarning ( errorMsg ) ;
return errorMsg ;
}
2005-10-05 12:45:33 +02:00
// filter deny
2007-10-29 02:43:20 +01:00
if ( ( entry . depth ( ) > 0 ) & & ( profile ! = null ) & & ( ! ( entry . url ( ) . toString ( ) . matches ( profile . generalFilter ( ) ) ) ) ) {
2006-08-07 17:11:14 +02:00
reason = plasmaCrawlEURL . DENIED_URL_DOES_NOT_MATCH_FILTER ;
2006-09-03 16:59:00 +02:00
2007-10-29 02:43:20 +01:00
this . log . logFine ( " URL ' " + entry . url ( ) . toString ( ) + " ' does not match crawling filter ' " + profile . generalFilter ( ) + " '. " +
2006-05-15 17:42:06 +02:00
" Stack processing time: " + ( System . currentTimeMillis ( ) - startTime ) + " ms " ) ;
2005-10-05 12:45:33 +02:00
return reason ;
}
// deny cgi
2007-10-29 02:43:20 +01:00
if ( entry . url ( ) . isCGI ( ) ) {
2006-08-07 17:11:14 +02:00
reason = plasmaCrawlEURL . DENIED_CGI_URL ;
2006-09-03 16:59:00 +02:00
2007-10-29 02:43:20 +01:00
this . log . logFine ( " URL ' " + entry . url ( ) . toString ( ) + " ' is CGI URL. " +
2006-05-15 17:42:06 +02:00
" Stack processing time: " + ( System . currentTimeMillis ( ) - startTime ) + " ms " ) ;
2005-10-05 12:45:33 +02:00
return reason ;
}
// deny post properties
2007-10-29 02:43:20 +01:00
if ( ( entry . url ( ) . isPOST ( ) ) & & ( profile ! = null ) & & ( ! ( profile . crawlingQ ( ) ) ) ) {
2006-08-07 17:11:14 +02:00
reason = plasmaCrawlEURL . DENIED_POST_URL ;
2006-09-03 16:59:00 +02:00
2007-10-29 02:43:20 +01:00
this . log . logFine ( " URL ' " + entry . url ( ) . toString ( ) + " ' is post URL. " +
2006-05-15 17:42:06 +02:00
" Stack processing time: " + ( System . currentTimeMillis ( ) - startTime ) + " ms " ) ;
2005-10-05 12:45:33 +02:00
return reason ;
}
2007-10-29 02:43:20 +01:00
yacyURL referrerURL = ( entry . referrerhash ( ) = = null ) ? null : sb . crawlQueues . getURL ( entry . referrerhash ( ) ) ;
2006-03-23 17:05:16 +01:00
// add domain to profile domain list
2006-04-14 01:19:36 +02:00
if ( ( profile . domFilterDepth ( ) ! = Integer . MAX_VALUE ) | | ( profile . domMaxPages ( ) ! = Integer . MAX_VALUE ) ) {
2007-10-29 02:43:20 +01:00
profile . domInc ( entry . url ( ) . getHost ( ) , ( referrerURL = = null ) ? null : referrerURL . getHost ( ) . toLowerCase ( ) , entry . depth ( ) ) ;
2006-03-23 17:05:16 +01:00
}
// deny urls that do not match with the profile domain list
2007-10-29 02:43:20 +01:00
if ( ! ( profile . grantedDomAppearance ( entry . url ( ) . getHost ( ) ) ) ) {
2006-08-07 17:11:14 +02:00
reason = plasmaCrawlEURL . DENIED_NO_MATCH_WITH_DOMAIN_FILTER ;
2007-10-29 02:43:20 +01:00
this . log . logFine ( " URL ' " + entry . url ( ) . toString ( ) + " ' is not listed in granted domains. " +
2006-05-15 17:42:06 +02:00
" Stack processing time: " + ( System . currentTimeMillis ( ) - startTime ) + " ms " ) ;
2006-03-23 17:05:16 +01:00
return reason ;
}
// deny urls that exceed allowed number of occurrences
2007-10-29 02:43:20 +01:00
if ( ! ( profile . grantedDomCount ( entry . url ( ) . getHost ( ) ) ) ) {
2006-08-07 17:11:14 +02:00
reason = plasmaCrawlEURL . DENIED_DOMAIN_COUNT_EXCEEDED ;
2007-10-29 02:43:20 +01:00
this . log . logFine ( " URL ' " + entry . url ( ) . toString ( ) + " ' appeared too often, a maximum of " + profile . domMaxPages ( ) + " is allowed. " +
2006-05-15 17:42:06 +02:00
" Stack processing time: " + ( System . currentTimeMillis ( ) - startTime ) + " ms " ) ;
2006-03-23 17:05:16 +01:00
return reason ;
}
2006-09-03 16:59:00 +02:00
// check if the url is double registered
2007-10-29 02:43:20 +01:00
String dbocc = sb . crawlQueues . urlExists ( entry . url ( ) . hash ( ) ) ;
2007-11-16 15:48:09 +01:00
indexURLEntry oldEntry = this . sb . wordIndex . loadedURL . load ( entry . url ( ) . hash ( ) , null , 0 ) ;
2007-06-15 19:45:49 +02:00
boolean recrawl = ( oldEntry ! = null ) & & ( ( System . currentTimeMillis ( ) - oldEntry . loaddate ( ) . getTime ( ) ) > profile . recrawlIfOlder ( ) ) ;
2007-12-12 02:32:25 +01:00
// do double-check
if ( ( dbocc ! = null ) & & ( ! recrawl ) ) {
2006-08-07 17:11:14 +02:00
reason = plasmaCrawlEURL . DOUBLE_REGISTERED + dbocc + " ) " ;
2007-10-29 02:43:20 +01:00
this . log . logFine ( " URL ' " + entry . url ( ) . toString ( ) + " ' is double registered in ' " + dbocc + " '. " + " Stack processing time: " + ( System . currentTimeMillis ( ) - startTime ) + " ms " ) ;
2005-10-05 12:45:33 +02:00
return reason ;
}
2007-12-12 02:32:25 +01:00
if ( ( oldEntry ! = null ) & & ( ! recrawl ) ) {
reason = plasmaCrawlEURL . DOUBLE_REGISTERED + " LURL) " ;
this . log . logFine ( " URL ' " + entry . url ( ) . toString ( ) + " ' is double registered in 'LURL'. " + " Stack processing time: " + ( System . currentTimeMillis ( ) - startTime ) + " ms " ) ;
return reason ;
}
2006-03-23 17:05:16 +01:00
// show potential re-crawl
if ( recrawl ) {
2007-10-29 02:43:20 +01:00
this . log . logFine ( " RE-CRAWL of URL ' " + entry . url ( ) . toString ( ) + " ': this url was crawled " +
2006-03-23 17:05:16 +01:00
( ( System . currentTimeMillis ( ) - oldEntry . loaddate ( ) . getTime ( ) ) / 60000 / 60 / 24 ) + " days ago. " ) ;
}
2005-10-05 12:45:33 +02:00
// store information
2007-10-29 02:43:20 +01:00
boolean local = ( ( entry . initiator ( ) . equals ( yacyURL . dummyHash ) ) | | ( entry . initiator ( ) . equals ( yacyCore . seedDB . mySeed ( ) . hash ) ) ) ;
2005-10-05 12:45:33 +02:00
boolean global =
( profile ! = null ) & &
( profile . remoteIndexing ( ) ) /* granted */ & &
2007-10-29 02:43:20 +01:00
( entry . depth ( ) = = profile . generalDepth ( ) ) /* leaf node */ & &
2006-02-24 10:35:54 +01:00
//(initiatorHash.equals(yacyCore.seedDB.mySeed.hash)) /* not proxy */ &&
(
2007-10-01 14:30:23 +02:00
( yacyCore . seedDB . mySeed ( ) . isSenior ( ) ) | |
( yacyCore . seedDB . mySeed ( ) . isPrincipal ( ) )
2006-02-24 10:35:54 +01:00
) /* qualified */ ;
2005-10-05 12:45:33 +02:00
2006-02-16 11:56:17 +01:00
if ( ( ! local ) & & ( ! global ) & & ( ! profile . handle ( ) . equals ( this . sb . defaultRemoteProfile . handle ( ) ) ) ) {
2007-10-29 02:43:20 +01:00
this . log . logSevere ( " URL ' " + entry . url ( ) . toString ( ) + " ' can neither be crawled local nor global. " ) ;
2005-10-05 12:45:33 +02:00
}
2006-09-03 16:59:00 +02:00
// add the url into the crawling queue
2007-10-29 02:43:20 +01:00
sb . crawlQueues . noticeURL . push (
2006-07-24 18:04:14 +02:00
( ( global ) ? plasmaCrawlNURL . STACK_TYPE_LIMIT :
( ( local ) ? plasmaCrawlNURL . STACK_TYPE_CORE : plasmaCrawlNURL . STACK_TYPE_REMOTE ) ) /*local/remote stack*/ ,
2007-10-29 02:43:20 +01:00
entry ) ;
2005-10-05 12:45:33 +02:00
return null ;
}
2005-12-03 10:58:00 +01:00
}