// plasmaCrawlStacker.java
// -----------------------
// part of YaCy
// (C) by Michael Peter Christen; mc@yacy.net
// first published on http://www.anomic.de
// Frankfurt, Germany, 2005
//
// This file was contributed by Martin Thelian
// ([MC] removed all multithreading and thread pools, this is not necessary here; complete renovation 2007)
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

package de.anomic.crawler;
import java.io.IOException;
import java.net.InetAddress;
import java.net.MalformedURLException;
import java.net.UnknownHostException;
import java.util.Date;
import java.util.Map;
import java.util.concurrent.BlockingQueue;

import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.ftp.FTPClient;
import net.yacy.document.TextParser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.workflow.WorkflowProcessor;
import net.yacy.repository.Blacklist;
import net.yacy.repository.FilterEngine;

import de.anomic.crawler.retrieval.FTPLoader;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.crawler.retrieval.Request;
import de.anomic.crawler.retrieval.SMBLoader;
import de.anomic.search.Segment;
import de.anomic.search.Switchboard;
import de.anomic.yacy.yacySeedDB;
public final class CrawlStacker {

    private final Log log = new Log("STACKCRAWL");

    private final WorkflowProcessor<Request> fastQueue, slowQueue;
    //private long dnsHit;
    private long dnsMiss;
    private final CrawlQueues nextQueue;
    private final CrawlSwitchboard crawler;
    private final Segment indexSegment;
    private final yacySeedDB peers;
    private final boolean acceptLocalURLs, acceptGlobalURLs;
    private final FilterEngine domainList;

    // this is the process that checks url for double-occurrences and for allowance/disallowance by robots.txt
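    /**
     * Creates a CrawlStacker which feeds checked crawl requests into the balancer.
     * @param cq the crawl queues holding the notice and error URL stacks
     * @param cs the crawl switchboard providing the active crawl profiles
     * @param indexSegment the index segment used for double-occurrence checks
     * @param peers the seed database of this peer
     * @param acceptLocalURLs true if URLs from the local (intranet) domain shall be accepted
     * @param acceptGlobalURLs true if URLs from the global (internet) domain shall be accepted
     * @param domainList the domain filter from the network definition; may be null
     */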
    public CrawlStacker(
            CrawlQueues cq,
            CrawlSwitchboard cs,
            Segment indexSegment,
            yacySeedDB peers,
            boolean acceptLocalURLs,
            boolean acceptGlobalURLs,
            FilterEngine domainList) {
        this.nextQueue = cq;
        this.crawler = cs;
        this.indexSegment = indexSegment;
        this.peers = peers;
        //this.dnsHit = 0;
        this.dnsMiss = 0;
        this.acceptLocalURLs = acceptLocalURLs;
        this.acceptGlobalURLs = acceptGlobalURLs;
        this.domainList = domainList;

        this.fastQueue = new WorkflowProcessor<Request>("CrawlStackerFast", "This process checks new urls before they are enqueued into the balancer (proper, double-check, correct domain, filter)", new String[]{"Balancer"}, this, "job", 10000, null, 2);
        this.slowQueue = new WorkflowProcessor<Request>("CrawlStackerSlow", "This is like CrawlStackerFast, but additionally does a DNS lookup. The CrawlStackerFast does not need this because it can use the DNS cache.", new String[]{"Balancer"}, this, "job", 1000, null, 5);

        this.log.logInfo("STACKCRAWL thread initialized.");
    }
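    /**
     * @return the total number of entries waiting in the fast and the slow stacker queue
     */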
    public int size() {
        return this.fastQueue.queueSize() + this.slowQueue.queueSize();
    }

    public boolean isEmpty() {
        if (!this.fastQueue.queueIsEmpty()) return false;
        if (!this.slowQueue.queueIsEmpty()) return false;
        return true;
    }

    public void clear() {
        this.fastQueue.clear();
        this.slowQueue.clear();
    }

    public void announceClose() {
        this.log.logInfo("Flushing remaining " + size() + " crawl stacker job entries.");
        this.fastQueue.announceShutdown();
        this.slowQueue.announceShutdown();
    }

    public void close() {
        this.log.logInfo("Shutdown. waiting for remaining " + size() + " crawl stacker job entries. please wait.");
        this.fastQueue.announceShutdown();
        this.slowQueue.announceShutdown();
        this.fastQueue.awaitShutdown(2000);
        this.slowQueue.awaitShutdown(2000);

        this.log.logInfo("Shutdown. Closing stackCrawl queue.");

        clear();
    }
    private boolean prefetchHost(final String host) {
        // returns true when the host is already known in the DNS cache;
        // returns false if the host is unknown or has not been resolved yet
        try {
            if (Domains.dnsResolveFromCache(host) != null) return true; // found entry
        } catch (final UnknownHostException e) {
            // we know that this host is unknown
            return false;
        }
        // we just don't know anything about that host
        return false;
    }
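    /**
     * Worker method that is called by the WorkflowProcessor threads for every queued Request.
     * The entry is checked and stacked onto the balancer; if it is rejected, the rejection
     * reason is recorded in the error URL database.
     * @param entry the crawl request to be processed
     * @return always null, because the stacker does not hand the entry over to a follow-up processor
     */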
    public Request job(Request entry) {
        // this is the method that is called by the busy thread from outside
        if (entry == null) return null;

        try {
            final String rejectReason = stackCrawl(entry);
            // if the url was rejected we store it into the error URL db
            if (rejectReason != null) {
                nextQueue.errorURL.push(entry, peers.mySeed().hash.getBytes(), new Date(), 1, rejectReason);
            }
        } catch (final Exception e) {
            CrawlStacker.this.log.logWarning("Error while processing stackCrawl entry.\n" + "Entry: " + entry.toString() + " Error: " + e.toString(), e);
            return null;
        }
        return null;
    }
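    /**
     * Puts a single crawl request into one of the two stacker queues: the fast queue if the
     * host name is already in the DNS cache, otherwise the slow queue, which additionally
     * performs the DNS lookup during processing.
     * @param entry the crawl request to be queued
     */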
    public void enqueueEntry(final Request entry) {

        // DEBUG
        if (log.isFinest()) log.logFinest("ENQUEUE " + entry.url() + ", referer=" + entry.referrerhash() + ", initiator=" + ((entry.initiator() == null) ? "" : new String(entry.initiator())) + ", name=" + entry.name() + ", appdate=" + entry.appdate() + ", depth=" + entry.depth());

        if (prefetchHost(entry.url().getHost())) {
            try {
                this.fastQueue.enQueue(entry);
                //this.dnsHit++;
            } catch (InterruptedException e) {
                Log.logException(e);
            }
        } else {
            try {
                this.slowQueue.enQueue(entry);
                this.dnsMiss++;
            } catch (InterruptedException e) {
                Log.logException(e);
            }
        }
    }
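    /**
     * Stacks a set of hyperlinks. If replace is set, existing entries for a URL are removed
     * from the metadata index and from the notice and error queues first, so that the URL is
     * loaded again. Links to ftp hosts are expanded to a crawl of the whole ftp site.
     * @param initiator the hash of the initiating peer
     * @param profileHandle the handle of the crawl profile to be used
     * @param hyperlinks a map from URLs to their link names
     * @param replace true if existing entries for these URLs shall be removed before stacking
     */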
    public void enqueueEntries(byte[] initiator, String profileHandle, Map<MultiProtocolURI, String> hyperlinks, boolean replace) {
        for (Map.Entry<MultiProtocolURI, String> e: hyperlinks.entrySet()) {
            if (e.getKey() == null) continue;
            // delete the old entry, if it exists, to force a re-load of the url (that's wanted here)
            final DigestURI url = new DigestURI(e.getKey());
            final byte[] urlhash = url.hash();
            if (replace) {
                indexSegment.urlMetadata().remove(urlhash);
                this.nextQueue.noticeURL.removeByURLHash(urlhash);
                this.nextQueue.errorURL.remove(urlhash);
            }
            if (url.getProtocol().equals("ftp")) {
                // put the whole ftp site on the crawl stack
                enqueueEntries(initiator, profileHandle, "ftp", url.getHost(), url.getPort(), replace);
            } else {
                // put entry on crawl stack
                enqueueEntry(new Request(
                        initiator,
                        url,
                        null,
                        e.getValue(),
                        new Date(),
                        profileHandle,
                        0,
                        0,
                        0,
                        0
                        ));
            }
        }
    }
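    /**
     * Lists a whole site in a background thread (the listing itself is done with the FTP client)
     * and stacks every file that is found as a single crawl request. The protocol parameter
     * determines how the resulting URLs are constructed.
     * @param initiator the hash of the initiating peer
     * @param profileHandle the handle of the crawl profile to be used
     * @param protocol the protocol used for the generated URLs ("ftp", "smb", "http" or "https")
     * @param host the host name of the site
     * @param port the port of the site
     * @param replace true if existing entries for the listed URLs shall be removed before stacking
     */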
    public void enqueueEntries(final byte[] initiator, final String profileHandle, final String protocol, final String host, final int port, final boolean replace) {
        final CrawlQueues cq = this.nextQueue;
        new Thread() {
            public void run() {
                BlockingQueue<FTPClient.entryInfo> queue;
                try {
                    queue = FTPClient.sitelist(host, port);
                    FTPClient.entryInfo entry;
                    while ((entry = queue.take()) != FTPClient.POISON_entryInfo) {
                        // delete the old entry, if it exists, to force a re-load of the url (that's wanted here)
                        DigestURI url = null;
                        try {
                            if (protocol.equals("ftp")) url = new DigestURI("ftp://" + host + (port == 21 ? "" : ":" + port) + entry.name);
                            else if (protocol.equals("smb")) url = new DigestURI("smb://" + host + entry.name);
                            else if (protocol.equals("http")) url = new DigestURI("http://" + host + (port == 80 ? "" : ":" + port) + entry.name);
                            else if (protocol.equals("https")) url = new DigestURI("https://" + host + (port == 443 ? "" : ":" + port) + entry.name);
                        } catch (MalformedURLException e) {
                            continue;
                        }
                        final byte[] urlhash = url.hash();
                        if (replace) {
                            indexSegment.urlMetadata().remove(urlhash);
                            cq.noticeURL.removeByURLHash(urlhash);
                            cq.errorURL.remove(urlhash);
                        }
                        // put entry on crawl stack
                        enqueueEntry(new Request(
                                initiator,
                                url,
                                null,
                                entry.name,
                                entry.date,
                                profileHandle,
                                0,
                                0,
                                0,
                                entry.size
                                ));
                    }
                } catch (IOException e1) {
                } catch (InterruptedException e) {
                }
            }
        }.start();
    }
    public String stackCrawl(final Request entry) {
        // stacks a crawl item. The position can also be remote
        // returns null if successful, a reason string if not successful
        //this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'");

        final Map<String, String> mp = crawler.profilesActiveCrawls.get(entry.profileHandle().getBytes());
        CrawlProfile profile = mp == null ? null : new CrawlProfile(mp);

        String error;
        if (profile == null) {
            error = "LOST STACKER PROFILE HANDLE '" + entry.profileHandle() + "' for URL " + entry.url();
            log.logWarning(error);
            return error;
        }

        error = checkAcceptance(entry.url(), profile, entry.depth());
        if (error != null) return error;

        final DigestURI referrerURL = (entry.referrerhash() == null || entry.referrerhash().length == 0) ? null : nextQueue.getURL(entry.referrerhash());

        // add domain to profile domain list
        if (profile.domMaxPages() != Integer.MAX_VALUE) {
            profile.domInc(entry.url().getHost(), (referrerURL == null) ? null : referrerURL.getHost().toLowerCase(), entry.depth());
        }

        // store information
        final boolean local = Base64Order.enhancedCoder.equal(entry.initiator(), peers.mySeed().hash.getBytes());
        final boolean proxy = (entry.initiator() == null || entry.initiator().length == 0 || new String(entry.initiator()).equals("------------")) && profile.handle().equals(crawler.defaultProxyProfile.handle());
        final boolean remote = profile.handle().equals(crawler.defaultRemoteProfile.handle());
        final boolean global =
            (profile.remoteIndexing()) /* granted */ &&
            (entry.depth() == profile.depth()) /* leaf node */ &&
            //(initiatorHash.equals(yacyCore.seedDB.mySeed.hash)) /* not proxy */ &&
            (
                    (peers.mySeed().isSenior()) ||
                    (peers.mySeed().isPrincipal())
            ) /* qualified */;

        if (!local && !global && !remote && !proxy) {
            error = "URL '" + entry.url().toString() + "' cannot be crawled. initiator = " + ((entry.initiator() == null) ? "" : new String(entry.initiator())) + ", profile.handle = " + profile.handle();
            this.log.logSevere(error);
            return error;
        }

        long maxFileSize = Long.MAX_VALUE;
        if (entry.size() > 0) {
            String protocol = entry.url().getProtocol();
            if (protocol.equals("http") || protocol.equals("https")) maxFileSize = Switchboard.getSwitchboard().getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
            if (protocol.equals("ftp")) maxFileSize = Switchboard.getSwitchboard().getConfigLong("crawler.ftp.maxFileSize", FTPLoader.DEFAULT_MAXFILESIZE);
            if (protocol.equals("smb")) maxFileSize = Switchboard.getSwitchboard().getConfigLong("crawler.smb.maxFileSize", SMBLoader.DEFAULT_MAXFILESIZE);
        }

        // check availability of parser and maxfilesize:
        // oversized documents and documents without a supporting parser are stacked on the NOLOAD queue
        if (entry.size() > maxFileSize ||
            (entry.url().getFileExtension().length() > 0 && TextParser.supports(entry.url(), null) != null)
           ) {
            nextQueue.noticeURL.push(NoticedURL.StackType.NOLOAD, entry);
            return null;
        }

        if (global) {
            // it may be possible that global == true and local == true, so do not check an error case against it
            if (proxy) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: global = true, proxy = true, initiator = proxy" + ", profile.handle = " + profile.handle());
            if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: global = true, remote = true, initiator = " + new String(entry.initiator()) + ", profile.handle = " + profile.handle());
            //int b = nextQueue.noticeURL.stackSize(NoticedURL.StackType.LIMIT);
            nextQueue.noticeURL.push(NoticedURL.StackType.LIMIT, entry);
            //assert b < nextQueue.noticeURL.stackSize(NoticedURL.StackType.LIMIT);
            //this.log.logInfo("stacked/global: " + entry.url().toString() + ", stacksize = " + nextQueue.noticeURL.stackSize(NoticedURL.StackType.LIMIT));
        } else if (local) {
            if (proxy) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: local = true, proxy = true, initiator = proxy" + ", profile.handle = " + profile.handle());
            if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: local = true, remote = true, initiator = " + new String(entry.initiator()) + ", profile.handle = " + profile.handle());
            //int b = nextQueue.noticeURL.stackSize(NoticedURL.StackType.CORE);
            nextQueue.noticeURL.push(NoticedURL.StackType.CORE, entry);
            //assert b < nextQueue.noticeURL.stackSize(NoticedURL.StackType.CORE);
            //this.log.logInfo("stacked/local: " + entry.url().toString() + ", stacksize = " + nextQueue.noticeURL.stackSize(NoticedURL.StackType.CORE));
        } else if (proxy) {
            if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: proxy = true, remote = true, initiator = " + new String(entry.initiator()) + ", profile.handle = " + profile.handle());
            //int b = nextQueue.noticeURL.stackSize(NoticedURL.StackType.CORE);
            nextQueue.noticeURL.push(NoticedURL.StackType.CORE, entry);
            //assert b < nextQueue.noticeURL.stackSize(NoticedURL.StackType.CORE);
            //this.log.logInfo("stacked/proxy: " + entry.url().toString() + ", stacksize = " + nextQueue.noticeURL.stackSize(NoticedURL.StackType.CORE));
        } else if (remote) {
            //int b = nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_REMOTE);
            nextQueue.noticeURL.push(NoticedURL.StackType.REMOTE, entry);
            //assert b < nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_REMOTE);
            //this.log.logInfo("stacked/remote: " + entry.url().toString() + ", stacksize = " + nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_REMOTE));
        }

        return null;
    }
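    /**
     * Checks whether a URL may be stacked for the given crawl profile: supported protocol,
     * accepted domain (local/global), blacklist, must-match and must-not-match filters,
     * CGI and POST restrictions, per-domain page limit, and double-occurrence in the
     * queues and in the metadata index.
     * @param url the URL to be checked
     * @param profile the crawl profile that shall be applied
     * @param depth the current crawl depth of the URL
     * @return null if the URL is accepted, otherwise a string with the rejection reason
     */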
    public String checkAcceptance(final DigestURI url, final CrawlProfile profile, int depth) {

        // check if the protocol is supported
        final String urlProtocol = url.getProtocol();
        if (!Switchboard.getSwitchboard().loader.isSupportedProtocol(urlProtocol)) {
            this.log.logSevere("Unsupported protocol in URL '" + url.toString() + "'.");
            return "unsupported protocol";
        }

        // check if ip is local ip address
        final String urlRejectReason = urlInAcceptedDomain(url);
        if (urlRejectReason != null) {
            if (this.log.isFine()) this.log.logFine("denied_(" + urlRejectReason + ")");
            return "denied_(" + urlRejectReason + ")";
        }

        // check blacklist
        if (Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, url)) {
            if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' is in blacklist.");
            return "url in blacklist";
        }

        // filter with must-match
        if ((depth > 0) && !profile.mustMatchPattern().matcher(url.toString()).matches()) {
            if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' does not match must-match crawling filter '" + profile.mustMatchPattern().toString() + "'.");
            return "url does not match must-match filter";
        }

        // filter with must-not-match
        if ((depth > 0) && profile.mustNotMatchPattern().matcher(url.toString()).matches()) {
            if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' matches must-not-match crawling filter '" + profile.mustNotMatchPattern().toString() + "'.");
            return "url matches must-not-match filter";
        }

        // deny cgi
        if (url.isIndividual()) {
            if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' is CGI URL.");
            return "cgi url not allowed";
        }

        // deny post properties
        if (url.isPOST() && !(profile.crawlingQ())) {
            if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' is post URL.");
            return "post url not allowed";
        }

        // deny urls that exceed the allowed number of occurrences per domain
        if (!(profile.grantedDomCount(url.getHost()))) {
            if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' appeared too often, a maximum of " + profile.domMaxPages() + " is allowed.");
            return "domain counter exceeded";
        }

        // check if the url is double registered
        final String dbocc = nextQueue.urlExists(url.hash()); // returns the name of the queue if entry exists
        URIMetadataRow oldEntry = indexSegment.urlMetadata().load(url.hash(), null, 0);
        if (oldEntry == null) {
            if (dbocc != null) {
                // do double-check
                if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' is double registered in '" + dbocc + "'.");
                if (dbocc.equals("errors")) {
                    ZURL.Entry errorEntry = nextQueue.errorURL.get(url.hash());
                    return "double in: errors (" + errorEntry.anycause() + ")";
                } else {
                    return "double in: " + dbocc;
                }
            }
        } else {
            final boolean recrawl = profile.recrawlIfOlder() > oldEntry.loaddate().getTime();
            if (recrawl) {
                if (this.log.isInfo())
                    this.log.logInfo("RE-CRAWL of URL '" + url.toString() + "': this url was crawled " +
                        ((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000 / 60 / 24) + " days ago.");
            } else {
                if (dbocc == null) {
                    return "double in: LURL-DB";
                } else {
                    if (this.log.isInfo()) this.log.logInfo("URL '" + url.toString() + "' is double registered in '" + dbocc + "'. " + "Stack processing time: ");
                    if (dbocc.equals("errors")) {
                        ZURL.Entry errorEntry = nextQueue.errorURL.get(url.hash());
                        return "double in: errors (" + errorEntry.anycause() + ")";
                    } else {
                        return "double in: " + dbocc;
                    }
                }
            }
        }

        return null;
    }
    /**
     * Test a url if it can be used for crawling/indexing.
     * This mainly checks if the url is in the declared domain (local/global)
     * @param url
     * @return null if the url can be accepted, a string containing a rejection reason if the url cannot be accepted
     */
    public String urlInAcceptedDomain(final DigestURI url) {
        // returns null if the url can be accepted according to network.unit.domain
        if (url == null) return "url is null";
        // check domainList from network-definition
        if (this.domainList != null) {
            if (!this.domainList.isListed(url, null)) {
                return "the url '" + url + "' is not in domainList of this network";
            }
        }
        final boolean local = url.isLocal();
        if (this.acceptLocalURLs && local) return null;
        if (this.acceptGlobalURLs && !local) return null;
        final String host = url.getHost();
        if (host == null) return "url.host is null";
        // check if this is a local address and we are allowed to index local pages:
        //boolean local = hostAddress.isSiteLocalAddress() || hostAddress.isLoopbackAddress();
        //assert local == yacyURL.isLocalDomain(url.hash()); // TODO: remove the dnsResolve above!
        InetAddress ia = Domains.dnsResolve(host);
        return (local) ?
            ("the host '" + host + "' is local, but local addresses are not accepted: " + ((ia == null) ? "null" : ia.getHostAddress())) :
            ("the host '" + host + "' is global, but global addresses are not accepted: " + ((ia == null) ? "null" : ia.getHostAddress()));
    }
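    /**
     * Test a url hash if it belongs to the accepted domain (local/global) of this peer.
     * @param urlhash the hash of the url to be tested
     * @return null if the url hash can be accepted, a string containing a rejection reason otherwise
     */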
    public String urlInAcceptedDomainHash(final byte[] urlhash) {
        // returns null if the url hash can be accepted according to network.unit.domain
        if (urlhash == null) return "url is null";
        // check if this is a local address and we are allowed to index local pages:
        final boolean local = DigestURI.isLocal(urlhash);
        if (this.acceptLocalURLs && local) return null;
        if (this.acceptGlobalURLs && !local) return null;
        return (local) ?
            ("the urlhash '" + new String(urlhash) + "' is local, but local addresses are not accepted") :
            ("the urlhash '" + new String(urlhash) + "' is global, but global addresses are not accepted");
    }

    public boolean acceptLocalURLs() {
        return this.acceptLocalURLs;
    }

    public boolean acceptGlobalURLs() {
        return this.acceptGlobalURLs;
    }
}