// Crawler_p.java
// (C) 2006 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 18.12.2006 on http://www.anomic.de
// this file was created using an implementation from IndexCreate_p.java, published 02.12.2004
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.Writer;
import java.net.MalformedURLException;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.CrawlQueues;
import net.yacy.crawler.data.ZURL.FailCategory;
import net.yacy.crawler.retrieval.Request;
import net.yacy.crawler.retrieval.SitemapImporter;
import net.yacy.data.BookmarkHelper;
import net.yacy.data.BookmarksDB;
import net.yacy.data.ListManager;
import net.yacy.data.WorkTables;
import net.yacy.data.ymark.YMarkTables;
import net.yacy.document.Document;
import net.yacy.document.Parser.Failure;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.TransformerWriter;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.peers.NewsPool;
import net.yacy.repository.Blacklist.BlacklistType;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;

public class Crawler_p {

    // this servlet does NOT create the Crawler servlet page content!
    // this servlet starts a web crawl. The interface for entering the web crawl parameters is in IndexCreate_p.html
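
    /**
     * Evaluate the servlet call: optionally continue, pause or terminate crawl jobs and,
     * if "crawlingstart" was submitted, create a crawl profile and stack the start URLs.
     * @param header the request header (unused here)
     * @param post the request parameters, may be null
     * @param env the Switchboard environment
     * @return the rewrite properties for the Crawler_p.html template
     */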
    public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) {
        // return variable that accumulates replacements
        final Switchboard sb = (Switchboard) env;

        // initial values for AJAX elements (without JavaScript)
        final serverObjects prop = new serverObjects();
        prop.put("rejected", 0);
        prop.put("urlpublictextSize", 0);
        prop.put("rwipublictextSize", 0);
        prop.put("list", "0");
        prop.put("loaderSize", 0);
        prop.put("loaderMax", 0);
        prop.put("list-loader", 0);
        prop.put("localCrawlSize", sb.crawlQueues.coreCrawlJobSize());
        prop.put("localCrawlState", "");
        prop.put("limitCrawlSize", sb.crawlQueues.limitCrawlJobSize());
        prop.put("limitCrawlState", "");
        prop.put("remoteCrawlSize", sb.crawlQueues.remoteTriggeredCrawlJobSize());
        prop.put("remoteCrawlState", "");
        prop.put("noloadCrawlSize", sb.crawlQueues.noloadCrawlJobSize());
        prop.put("noloadCrawlState", "");
        prop.put("list-remote", 0);
        prop.put("forwardToCrawlStart", "0");

        prop.put("info", "0");
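        // (0 is the default; other values set below: 3 = peers not initialized (sb.peers == null),
        // 4 = crawl filter pattern could not be compiled, 5 = crawling failed,
        // 6 = error with sitemap url, 7 = error with file, 8 = crawl started)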

        if (post != null) {
            String c = post.toString();
            if (c.length() < 1000) Log.logInfo("Crawl Start", c);
        }

        if (post != null && post.containsKey("continue")) {
            // continue queue
            final String queue = post.get("continue", "");
            if ("localcrawler".equals(queue)) {
                sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
            } else if ("remotecrawler".equals(queue)) {
                sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL);
            }
        }

        if (post != null && post.containsKey("pause")) {
            // pause queue
            final String queue = post.get("pause", "");
            if ("localcrawler".equals(queue)) {
                sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
            } else if ("remotecrawler".equals(queue)) {
                sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL);
            }
        }

        if (post != null && post.containsKey("terminate")) try {
            final String handle = post.get("handle", "");
            // termination of a crawl: shift the crawl from active to passive
            final CrawlProfile p = sb.crawler.getActive(handle.getBytes());
            if (p != null) sb.crawler.putPassive(handle.getBytes(), p);
            // delete all entries from the crawl queue that belong to this profile
            sb.crawler.removeActive(handle.getBytes());
            sb.crawlQueues.noticeURL.removeByProfileHandle(handle, 10000);
        } catch (final SpaceExceededException e) {
            Log.logException(e);
        }

        if (post != null && post.containsKey("crawlingstart")) {
            // init crawl
            if (sb.peers == null) {
                prop.put("info", "3");
            } else {

                // remove crawlingFileContent before we record the call
                String crawlingFileName = post.get("crawlingFile");
                final File crawlingFile;
                if (crawlingFileName == null || crawlingFileName.isEmpty()) {
                    crawlingFile = null;
                } else {
                    if (crawlingFileName.startsWith("file://")) crawlingFileName = crawlingFileName.substring(7);
                    crawlingFile = new File(crawlingFileName);
                }
                if (crawlingFile != null && crawlingFile.exists()) {
                    post.remove("crawlingFile$file");
                }

                // prepare some filters that are adjusted below if necessary
                boolean storeHTCache = "on".equals(post.get("storeHTCache", "on"));
                String newcrawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL_STRING);
                String newcrawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
                if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = CrawlProfile.MATCH_ALL_STRING; // avoid that all urls are filtered out if a bad value was submitted
                final boolean fullDomain = "domain".equals(post.get("range", "wide")); // special property in simple crawl start
                final boolean subPath = "subpath".equals(post.get("range", "wide")); // special property in simple crawl start

                String crawlingStart0 = post.get("crawlingURL", "").trim(); // the crawljob start url
                String[] rootURLs0 = crawlingStart0.indexOf('\n') > 0 || crawlingStart0.indexOf('\r') > 0 ? crawlingStart0.split("[\\r\\n]+") : crawlingStart0.split(Pattern.quote("|"));
                Set<DigestURI> rootURLs = new HashSet<DigestURI>();
                String crawlName = "";
                if (crawlingFile == null) for (String crawlingStart: rootURLs0) {
                    if (crawlingStart == null || crawlingStart.length() == 0) continue;
                    // add the prefix http:// if necessary
                    int pos = crawlingStart.indexOf("://", 0);
                    if (pos == -1) {
                        if (crawlingStart.startsWith("www")) crawlingStart = "http://" + crawlingStart;
                        if (crawlingStart.startsWith("ftp")) crawlingStart = "ftp://" + crawlingStart;
                    }
                    try {
                        DigestURI crawlingStartURL = new DigestURI(crawlingStart);
                        rootURLs.add(crawlingStartURL);
                        crawlName += crawlingStartURL.getHost() + "_";
                        if (fullDomain) {
                            newcrawlingMustMatch = CrawlProfile.mustMatchFilterFullDomain(crawlingStartURL);
                            if (subPath) newcrawlingMustMatch = newcrawlingMustMatch.substring(0, newcrawlingMustMatch.length() - 2) + crawlingStartURL.getPath() + ".*";
                        }
                        if (crawlingStart != null && subPath && (pos = crawlingStart.lastIndexOf('/')) > 0) {
                            newcrawlingMustMatch = crawlingStart.substring(0, pos + 1) + ".*";
                        }
                        if (crawlingStartURL != null && (crawlingStartURL.isFile() || crawlingStartURL.isSMB())) storeHTCache = false;
                    } catch (final MalformedURLException e) {
                        Log.logException(e);
                    }
                }
                if (crawlName.length() > 80) crawlName = crawlName.substring(0, 80);
                if (crawlName.endsWith("_")) crawlName = crawlName.substring(0, crawlName.length() - 1);

                // set the crawl filter
                String ipMustMatch = post.get("ipMustmatch", CrawlProfile.MATCH_ALL_STRING);
                final String ipMustNotMatch = post.get("ipMustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
                if (ipMustMatch.length() < 2) ipMustMatch = CrawlProfile.MATCH_ALL_STRING;
                final String countryMustMatch = post.getBoolean("countryMustMatchSwitch") ? post.get("countryMustMatchList", "") : "";
                sb.setConfig("crawlingIPMustMatch", ipMustMatch);
                sb.setConfig("crawlingIPMustNotMatch", ipMustNotMatch);
                if (countryMustMatch.length() > 0) sb.setConfig("crawlingCountryMustMatch", countryMustMatch);

                String crawlerNoDepthLimitMatch = post.get("crawlingDepthExtension", CrawlProfile.MATCH_NEVER_STRING);
                final String indexUrlMustMatch = post.get("indexmustmatch", CrawlProfile.MATCH_ALL_STRING);
                final String indexUrlMustNotMatch = post.get("indexmustnotmatch", CrawlProfile.MATCH_NEVER_STRING);

                final boolean crawlOrder = post.get("crawlOrder", "off").equals("on");
                env.setConfig("crawlOrder", crawlOrder);

                if (crawlOrder) crawlerNoDepthLimitMatch = CrawlProfile.MATCH_NEVER_STRING; // without limitation the crawl order does not work

                int newcrawlingdepth = post.getInt("crawlingDepth", 8);
                env.setConfig("crawlingDepth", Integer.toString(newcrawlingdepth));
                if ((crawlOrder) && (newcrawlingdepth > 8)) newcrawlingdepth = 8;

                boolean directDocByURL = "on".equals(post.get("directDocByURL", "on")); // also catch all linked media documents without loading them
                env.setConfig("crawlingDirectDocByURL", directDocByURL);

                final String collection = post.get("collection", sb.getConfig("collection", "user"));
                env.setConfig("collection", collection);

                // recrawl
                final String recrawl = post.get("recrawl", "nodoubles"); // nodoubles, reload, scheduler
                boolean crawlingIfOlderCheck = "on".equals(post.get("crawlingIfOlderCheck", "off"));
                int crawlingIfOlderNumber = post.getInt("crawlingIfOlderNumber", -1);
                String crawlingIfOlderUnit = post.get("crawlingIfOlderUnit", "year"); // year, month, day, hour
                int repeat_time = post.getInt("repeat_time", -1);
                final String repeat_unit = post.get("repeat_unit", "seldays"); // selminutes, selhours, seldays

                if ("scheduler".equals(recrawl) && repeat_time > 0) {
                    // set crawlingIfOlder attributes that are appropriate for scheduled crawling
                    crawlingIfOlderCheck = true;
                    crawlingIfOlderNumber = "selminutes".equals(repeat_unit) ? 1 : "selhours".equals(repeat_unit) ? repeat_time / 2 : repeat_time * 12;
                    crawlingIfOlderUnit = "hour";
                } else if ("reload".equals(recrawl)) {
                    repeat_time = -1;
                    crawlingIfOlderCheck = true;
                } else if ("nodoubles".equals(recrawl)) {
                    repeat_time = -1;
                    crawlingIfOlderCheck = false;
                }
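                // Scheduler example (illustrative values): repeat_time = 7 with repeat_unit = "seldays"
                // yields crawlingIfOlderNumber = 7 * 12 = 84 and crawlingIfOlderUnit = "hour",
                // i.e. documents are re-loaded when they are older than half the repeat interval.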
                final long crawlingIfOlder = recrawlIfOlderC(crawlingIfOlderCheck, crawlingIfOlderNumber, crawlingIfOlderUnit);
                env.setConfig("crawlingIfOlder", crawlingIfOlder);

                // store this call as api call
                if (repeat_time > 0) {
                    // store as scheduled api call
                    sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + ((rootURLs.size() == 0) ? post.get("crawlingFile", "") : rootURLs.iterator().next().toNormalform(true, false)), repeat_time, repeat_unit.substring(3));
                } else {
                    // store just a protocol
                    sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + ((rootURLs.size() == 0) ? post.get("crawlingFile", "") : rootURLs.iterator().next().toNormalform(true, false)));
                }

                final boolean crawlingDomMaxCheck = "on".equals(post.get("crawlingDomMaxCheck", "off"));
                final int crawlingDomMaxPages = (crawlingDomMaxCheck) ? post.getInt("crawlingDomMaxPages", -1) : -1;
                env.setConfig("crawlingDomMaxPages", Integer.toString(crawlingDomMaxPages));

                boolean crawlingQ = "on".equals(post.get("crawlingQ", "off"));
                env.setConfig("crawlingQ", crawlingQ);

                final boolean indexText = "on".equals(post.get("indexText", "on"));
                env.setConfig("indexText", indexText);

                final boolean indexMedia = "on".equals(post.get("indexMedia", "on"));
                env.setConfig("indexMedia", indexMedia);

                env.setConfig("storeHTCache", storeHTCache);

                CacheStrategy cachePolicy = CacheStrategy.parse(post.get("cachePolicy", "iffresh"));
                if (cachePolicy == null) cachePolicy = CacheStrategy.IFFRESH;

                final boolean xsstopw = "on".equals(post.get("xsstopw", "off"));
                env.setConfig("xsstopw", xsstopw);

                final boolean xdstopw = "on".equals(post.get("xdstopw", "off"));
                env.setConfig("xdstopw", xdstopw);

                final boolean xpstopw = "on".equals(post.get("xpstopw", "off"));
                env.setConfig("xpstopw", xpstopw);

                String crawlingMode = post.get("crawlingMode", "url");
                if ("file".equals(crawlingMode) && post.containsKey("crawlingFile")) {
                    newcrawlingMustNotMatch = CrawlProfile.MATCH_NEVER_STRING;
                    directDocByURL = false;
                }
                if ("sitemap".equals(crawlingMode)) {
                    newcrawlingMustMatch = CrawlProfile.MATCH_ALL_STRING;
                    newcrawlingMustNotMatch = CrawlProfile.MATCH_NEVER_STRING;
                    newcrawlingdepth = 0;
                    directDocByURL = false;
                    crawlingQ = true;
                }
                if ("sitelist".equals(crawlingMode)) {
                    newcrawlingMustNotMatch = CrawlProfile.MATCH_NEVER_STRING;
                    Set<DigestURI> newRootURLs = new HashSet<DigestURI>();
                    for (DigestURI sitelistURL: rootURLs) {
                        // download the sitelist document
                        Document scraper;
                        try {
                            scraper = sb.loader.loadDocument(sitelistURL, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay);
                            // get links and use them as the new root urls
                            for (MultiProtocolURI u: scraper.getAnchors().keySet()) {
                                newRootURLs.add(new DigestURI(u));
                            }
                        } catch (final IOException e) {
                            Log.logException(e);
                        }
                    }
                    rootURLs = newRootURLs;
                    crawlingMode = "url";
                    if ((fullDomain || subPath) && newcrawlingdepth > 0) newcrawlingMustMatch = CrawlProfile.MATCH_ALL_STRING; // prevent a restriction to the original urls
                }

                // compute mustmatch filter according to rootURLs
                if ((fullDomain || subPath) && newcrawlingdepth > 0) {
                    String siteFilter = ".*";
                    if (fullDomain) {
                        siteFilter = siteFilter(rootURLs);
                    } else if (subPath) {
                        siteFilter = subpathFilter(rootURLs);
                    }
                    newcrawlingMustMatch = CrawlProfile.MATCH_ALL_STRING.equals(newcrawlingMustMatch) ? siteFilter : "(?=(" + newcrawlingMustMatch + "))(" + siteFilter + ")";
                }
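                // Example (illustrative): for a single start URL http://example.com/a with range "domain",
                // siteFilter(rootURLs) yields something like "http://example.com.*|http://www.example.com.*";
                // a user-supplied mustmatch filter F is combined as "(?=(F))(<site filter>)", so both must match.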

                // check if the crawl filter works correctly
                try {
                    Pattern.compile(newcrawlingMustMatch);
                } catch (final PatternSyntaxException e) {
                    prop.put("info", "4"); // crawlfilter does not match url
                    prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
                    prop.putHTML("info_error", e.getMessage());
                }
                try {
                    Pattern.compile(newcrawlingMustNotMatch);
                } catch (final PatternSyntaxException e) {
                    prop.put("info", "4"); // crawlfilter does not match url
                    prop.putHTML("info_newcrawlingfilter", newcrawlingMustNotMatch);
                    prop.putHTML("info_error", e.getMessage());
                }

                // prepare a new crawling profile
                final CrawlProfile profile = new CrawlProfile(
                        crawlName,
                        newcrawlingMustMatch,
                        newcrawlingMustNotMatch,
                        ipMustMatch,
                        ipMustNotMatch,
                        countryMustMatch,
                        crawlerNoDepthLimitMatch,
                        indexUrlMustMatch,
                        indexUrlMustNotMatch,
                        newcrawlingdepth,
                        directDocByURL,
                        crawlingIfOlder,
                        crawlingDomMaxPages,
                        crawlingQ,
                        indexText,
                        indexMedia,
                        storeHTCache,
                        crawlOrder,
                        xsstopw,
                        xdstopw,
                        xpstopw,
                        cachePolicy,
                        collection);
                byte[] handle = ASCII.getBytes(profile.handle());

                if ("url".equals(crawlingMode)) {
                    if (rootURLs.size() == 0) {
                        prop.put("info", "5"); // Crawling failed
                        prop.putHTML("info_crawlingURL", "(no url given)");
                        prop.putHTML("info_reasonString", "you must submit at least one crawl url");
                    } else {
                        // stack requests
                        sb.crawler.putActive(handle, profile);
                        sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
                        Set<DigestURI> successurls = new HashSet<DigestURI>();
                        Map<DigestURI, String> failurls = new HashMap<DigestURI, String>();
                        String failreason;
                        for (DigestURI url: rootURLs) {
                            if ((failreason = stackUrl(sb, profile, url)) == null) successurls.add(url); else failurls.put(url, failreason);
                        }
                        if (failurls.size() == 0) {
                            // liftoff!
                            prop.put("info", "8");
                            prop.putHTML("info_crawlingURL", post.get("crawlingURL"));

                            // generate a YaCyNews if the global flag was set
                            if (!sb.isRobinsonMode() && crawlOrder) {
                                final Map<String, String> m = new HashMap<String, String>(profile); // must be cloned
                                m.remove("specificDepth");
                                m.remove("indexText");
                                m.remove("indexMedia");
                                m.remove("remoteIndexing");
                                m.remove("xsstopw");
                                m.remove("xpstopw");
                                m.remove("xdstopw");
                                m.remove("storeTXCache");
                                m.remove("storeHTCache");
                                m.remove("generalFilter");
                                m.remove("specificFilter");
                                m.put("intention", post.get("intention", "").replace(',', '/'));
                                sb.peers.newsPool.publishMyNews(sb.peers.mySeed(), NewsPool.CATEGORY_CRAWL_START, m);
                            }
                        } else {
                            StringBuilder fr = new StringBuilder();
                            for (Map.Entry<DigestURI, String> failure: failurls.entrySet()) {
                                sb.crawlQueues.errorURL.push(
                                    new Request(
                                            sb.peers.mySeed().hash.getBytes(),
                                            failure.getKey(),
                                            null,
                                            "",
                                            new Date(),
                                            profile.handle(),
                                            0,
                                            0,
                                            0,
                                            0),
                                    sb.peers.mySeed().hash.getBytes(),
                                    new Date(),
                                    1,
                                    FailCategory.FINAL_LOAD_CONTEXT,
                                    failure.getValue(), -1);
                                fr.append(failure.getValue()).append('/');
                            }

                            prop.put("info", "5"); // Crawling failed
                            prop.putHTML("info_crawlingURL", (post.get("crawlingURL")));
                            prop.putHTML("info_reasonString", fr.toString());
                        }
                        if (successurls.size() > 0) sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
                    }
                } else if ("sitemap".equals(crawlingMode)) {
                    final String sitemapURLStr = post.get("sitemapURL", "");
                    try {
                        final DigestURI sitemapURL = new DigestURI(sitemapURLStr);
                        sb.crawler.putActive(handle, profile);
                        final SitemapImporter importer = new SitemapImporter(sb, sitemapURL, profile);
                        importer.start();
                        sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
                    } catch (final Exception e) {
                        // something went wrong
                        prop.put("info", "6"); // Error with url
                        prop.putHTML("info_crawlingStart", sitemapURLStr);
                        prop.putHTML("info_error", e.getMessage());
                        Log.logException(e);
                    }
                } else if ("file".equals(crawlingMode)) {
                    if (post.containsKey("crawlingFile")) {
                        final String crawlingFileContent = post.get("crawlingFile$file", "");
                        try {
                            // check if the crawl filter works correctly
                            Pattern.compile(newcrawlingMustMatch);
                            final ContentScraper scraper = new ContentScraper(new DigestURI(crawlingFile), 10000);
                            final Writer writer = new TransformerWriter(null, null, scraper, null, false);
                            if (crawlingFile != null && crawlingFile.exists()) {
                                FileUtils.copy(new FileInputStream(crawlingFile), writer);
                            } else {
                                FileUtils.copy(crawlingFileContent, writer);
                            }
                            writer.close();

                            // get links and generate filter
                            final Map<MultiProtocolURI, Properties> hyperlinks = scraper.getAnchors();
                            if (newcrawlingdepth > 0) {
                                if (fullDomain) {
                                    newcrawlingMustMatch = siteFilter(hyperlinks.keySet());
                                } else if (subPath) {
                                    newcrawlingMustMatch = subpathFilter(hyperlinks.keySet());
                                }
                            }

                            sb.crawler.putActive(handle, profile);
                            sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
                            sb.crawlStacker.enqueueEntriesAsynchronous(sb.peers.mySeed().hash.getBytes(), profile.handle(), hyperlinks);
                        } catch (final PatternSyntaxException e) {
                            prop.put("info", "4"); // crawlfilter does not match url
                            prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
                            prop.putHTML("info_error", e.getMessage());
                        } catch (final Exception e) {
                            // something went wrong
                            prop.put("info", "7"); // Error with file
                            prop.putHTML("info_crawlingStart", crawlingFileName);
                            prop.putHTML("info_error", e.getMessage());
                            Log.logException(e);
                        }
                        sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
                    }
                }
            }
        }

        if (post != null && post.containsKey("crawlingPerformance")) {
            setPerformance(sb, post);
        }

        // performance settings
        final long LCbusySleep = env.getConfigLong(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL_BUSYSLEEP, 1000L);
        final int LCppm = (int) (60000L / Math.max(1, LCbusySleep));
        prop.put("crawlingSpeedMaxChecked", (LCppm >= 30000) ? "1" : "0");
        prop.put("crawlingSpeedCustChecked", ((LCppm > 10) && (LCppm < 30000)) ? "1" : "0");
        prop.put("crawlingSpeedMinChecked", (LCppm <= 10) ? "1" : "0");
        prop.put("customPPMdefault", Integer.toString(LCppm));

        // generate crawl profile table
        int count = 0;
        boolean dark = true;
        final int domlistlength = (post == null) ? 160 : post.getInt("domlistlength", 160);
        CrawlProfile profile;
        // put active crawls into list
        for (final byte[] h: sb.crawler.getActive()) {
            profile = sb.crawler.getActive(h);
            if (CrawlProfile.ignoreNames.contains(profile.name())) continue;
            profile.putProfileEntry("crawlProfilesShow_list_", prop, true, dark, count, domlistlength);
            dark = !dark;
            count++;
        }
        prop.put("crawlProfilesShow_list", count);
        prop.put("crawlProfilesShow", count == 0 ? 0 : 1);

        // return rewrite properties
        return prop;
    }

    /**
     * Stack the url to the crawler.
     * @param sb
     * @param profile
     * @param url
     * @return null if this was ok. If this failed, return a string with a fail reason
     */
    private static String stackUrl(Switchboard sb, CrawlProfile profile, DigestURI url) {

        byte[] handle = ASCII.getBytes(profile.handle());

        // remove url from the index to be prepared for a re-crawl
        final byte[] urlhash = url.hash();
        sb.index.fulltext().remove(urlhash);
        sb.crawlQueues.noticeURL.removeByURLHash(urlhash);
        sb.crawlQueues.errorURL.remove(urlhash);

        // special handling of ftp protocol
        if (url.isFTP()) {
            try {
                sb.crawler.putActive(handle, profile);
                sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
                sb.crawlStacker.enqueueEntriesFTP(sb.peers.mySeed().hash.getBytes(), profile.handle(), url.getHost(), url.getPort(), false);
                return null;
            } catch (final Exception e) {
                // something went wrong
                Log.logException(e);
                return "problem crawling an ftp site: " + e.getMessage();
            }
        }

        // get a scraper to get the title
        Document scraper;
        try {
            scraper = sb.loader.loadDocument(url, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay);
        } catch (final IOException e) {
            Log.logException(e);
            return "scraper cannot load URL: " + e.getMessage();
        }

        final String title = scraper == null ? url.toNormalform(true, true) : scraper.dc_title();
        final String description = scraper.dc_description();

        // add the url to the crawl stack
        sb.crawler.removePassive(handle); // if there is an old entry, delete it
        sb.crawler.putActive(handle, profile);
        final String reasonString = sb.crawlStacker.stackCrawl(new Request(
                sb.peers.mySeed().hash.getBytes(),
                url,
                null,
                "CRAWLING-ROOT",
                new Date(),
                profile.handle(),
                0,
                0,
                0,
                0
                ));
        if (reasonString != null) return reasonString;

        // create a bookmark from crawl start url
        //final Set<String> tags=ListManager.string2set(BookmarkHelper.cleanTagsString(post.get("bookmarkFolder","/crawlStart")));
        final Set<String> tags = ListManager.string2set(BookmarkHelper.cleanTagsString("/crawlStart"));
        tags.add("crawlStart");
        final String[] keywords = scraper.dc_subject();
        if (keywords != null) {
            for (final String k: keywords) {
                final String kk = BookmarkHelper.cleanTagsString(k);
                if (kk.length() > 0) tags.add(kk);
            }
        }
        String tagStr = tags.toString();
        if (tagStr.length() > 2 && tagStr.startsWith("[") && tagStr.endsWith("]")) tagStr = tagStr.substring(1, tagStr.length() - 2);

        // we will always create a bookmark to use this to track crawled hosts
        final BookmarksDB.Bookmark bookmark = sb.bookmarksDB.createBookmark(url.toNormalform(true, false), "admin");
        if (bookmark != null) {
            bookmark.setProperty(BookmarksDB.Bookmark.BOOKMARK_TITLE, title);
            bookmark.setProperty(BookmarksDB.Bookmark.BOOKMARK_DESCRIPTION, description);
            bookmark.setOwner("admin");
            bookmark.setPublic(false);
            bookmark.setTags(tags, true);
            sb.bookmarksDB.saveBookmark(bookmark);
        }

        // do the same for ymarks
        // TODO: could a non admin user add crawls?
        try {
            sb.tables.bookmarks.createBookmark(sb.loader, url, YMarkTables.USER_ADMIN, true, "crawlStart", "/Crawl Start");
        } catch (final IOException e) {
            Log.logException(e);
        } catch (final Failure e) {
            Log.logException(e);
        }

        // that was ok
        return null;
    }
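
    /**
     * Compute the re-crawl time limit: documents loaded before this date are loaded again.
     * @param recrawlIfOlderCheck whether the re-crawl option is enabled at all
     * @param recrawlIfOlderNumber the number of time units
     * @param crawlingIfOlderUnit the time unit, one of "year", "month", "day" or "hour"
     * @return the limit in milliseconds since epoch, or 0 if the check is disabled
     */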
    private static long recrawlIfOlderC(final boolean recrawlIfOlderCheck, final int recrawlIfOlderNumber, final String crawlingIfOlderUnit) {
        if (!recrawlIfOlderCheck) return 0L;
        if ("year".equals(crawlingIfOlderUnit)) return System.currentTimeMillis() - recrawlIfOlderNumber * 1000L * 60L * 60L * 24L * 365L;
        if ("month".equals(crawlingIfOlderUnit)) return System.currentTimeMillis() - recrawlIfOlderNumber * 1000L * 60L * 60L * 24L * 30L;
        if ("day".equals(crawlingIfOlderUnit)) return System.currentTimeMillis() - recrawlIfOlderNumber * 1000L * 60L * 60L * 24L;
        if ("hour".equals(crawlingIfOlderUnit)) return System.currentTimeMillis() - recrawlIfOlderNumber * 1000L * 60L * 60L;
        return System.currentTimeMillis() - recrawlIfOlderNumber;
    }
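
    /**
     * Apply the crawler speed selected on the page: "minimum" (10 PPM), "maximum" (30000 PPM)
     * or a custom pages-per-minute value taken from the customPPM parameter.
     */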
    private static void setPerformance(final Switchboard sb, final serverObjects post) {
        final String crawlingPerformance = post.get("crawlingPerformance", "custom");
        final long LCbusySleep = sb.getConfigLong(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL_BUSYSLEEP, 1000L);
        int wantedPPM = (LCbusySleep == 0) ? 30000 : (int) (60000L / LCbusySleep);
        try {
            wantedPPM = post.getInt("customPPM", wantedPPM);
        } catch (final NumberFormatException e) {}
        if ("minimum".equals(crawlingPerformance.toLowerCase())) wantedPPM = 10;
        if ("maximum".equals(crawlingPerformance.toLowerCase())) wantedPPM = 30000;
        sb.setPerformance(wantedPPM);
    }
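
    /**
     * Build a must-match filter that restricts a crawl to the hosts of the given URIs:
     * for each URI the pattern "protocol://host.*" is added and, if the host does not already
     * start with "www.", the "www." variant as well; all patterns are OR-combined with '|'.
     */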
    private static String siteFilter(final Set<? extends MultiProtocolURI> uris) {
        final StringBuilder filter = new StringBuilder();
        final Set<String> filterSet = new HashSet<String>();
        for (final MultiProtocolURI uri: uris) {
            filterSet.add(new StringBuilder().append(uri.getProtocol()).append("://").append(uri.getHost()).append(".*").toString());
            if (!uri.getHost().startsWith("www.")) {
                filterSet.add(new StringBuilder().append(uri.getProtocol()).append("://www.").append(uri.getHost()).append(".*").toString());
            }
        }
        for (final String element: filterSet) {
            filter.append('|').append(element);
        }
        return filter.length() > 0 ? filter.substring(1) : "";
    }
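
    /**
     * Build a must-match filter that restricts a crawl to the sub-paths of the given URIs:
     * for each URI the pattern "&lt;normalized url&gt;.*" is added; all patterns are OR-combined with '|'.
     */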
    private static String subpathFilter(final Set<? extends MultiProtocolURI> uris) {
        final StringBuilder filter = new StringBuilder();
        final Set<String> filterSet = new HashSet<String>();
        for (final MultiProtocolURI uri: uris) {
            filterSet.add(new StringBuilder().append(uri.toNormalform(true, false)).append(".*").toString());
        }
        for (final String element: filterSet) {
            filter.append('|').append(element);
        }
        return filter.length() > 0 ? filter.substring(1) : "";
    }
}