// Crawler_p.java
// (C) 2006 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 18.12.2006 on http://www.anomic.de
// this file was created using an implementation from IndexCreate_p.java, published 02.12.2004
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.Writer;
import java.net.MalformedURLException;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import net.yacy.cora.date.AbstractFormatter;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.solr.FailCategory;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.JSONException;
import net.yacy.cora.util.JSONObject;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.CrawlSwitchboard;
import net.yacy.crawler.data.Cache;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.NoticedURL.StackType;
import net.yacy.crawler.retrieval.SitemapImporter;
import net.yacy.crawler.robots.RobotsTxt;
import net.yacy.data.WorkTables;
import net.yacy.document.Document;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.TransformerWriter;
import net.yacy.kelondro.index.RowHandleSet;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.workflow.BusyThread;
import net.yacy.peers.NewsPool;
import net.yacy.repository.Blacklist.BlacklistType;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
import net.yacy.search.index.Fulltext;
import net.yacy.search.index.Segment;
import net.yacy.search.query.SearchEventCache;
import net.yacy.search.schema.CollectionSchema;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
public class Crawler_p {

    // this servlet does NOT create the Crawler servlet page content!
    // this servlet starts a web crawl. The interface for entering the web crawl parameters is in IndexCreate_p.html

    public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {

        // return variable that accumulates replacements
        final Switchboard sb = (Switchboard) env;

        // clean up all search events
        SearchEventCache.cleanupEvents(true);
        sb.index.clearCaches(); // every time the ranking is changed we need to remove old orderings

        // initial values for AJAX elements (without JavaScript)
        final serverObjects prop = new serverObjects();
        prop.put("rejected", 0);

        Segment segment = sb.index;
        Fulltext fulltext = segment.fulltext();
        String localSolr = "/solr/select?core=collection1&q=*:*&start=0&rows=3";
        String remoteSolr = env.getConfig(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_URL, localSolr);
        if (!remoteSolr.endsWith("/")) remoteSolr = remoteSolr + "/";
        prop.put("urlpublictextSolrURL", fulltext.connectedLocalSolr() ? localSolr : remoteSolr + "collection1/select?&q=*:*&start=0&rows=3");
        prop.putNum("urlpublictextSize", fulltext.collectionSize());
        prop.putNum("urlpublictextSegmentCount", fulltext.getDefaultConnector().getSegmentCount());
        prop.put("webgraphSolrURL", fulltext.connectedLocalSolr() ? localSolr.replace("collection1", "webgraph") : remoteSolr + "webgraph/select?&q=*:*&start=0&rows=3");
        prop.putNum("webgraphSize", fulltext.useWebgraph() ? fulltext.webgraphSize() : 0);
        prop.putNum("webgraphSegmentCount", fulltext.useWebgraph() ? fulltext.getWebgraphConnector().getSegmentCount() : 0);
        prop.putNum("citationSize", segment.citationCount());
        prop.putNum("citationSegmentCount", segment.citationSegmentCount());
        prop.putNum("rwipublictextSize", segment.RWICount());
        prop.putNum("rwipublictextSegmentCount", segment.RWISegmentCount());

        prop.put("list", "0");
        prop.put("loaderSize", 0);
        prop.put("loaderMax", 0);
        prop.put("list-loader", 0);

        int coreCrawlJobSize = sb.crawlQueues.coreCrawlJobSize();
        int limitCrawlJobSize = sb.crawlQueues.limitCrawlJobSize();
        int remoteTriggeredCrawlJobSize = sb.crawlQueues.remoteTriggeredCrawlJobSize();
        int noloadCrawlJobSize = sb.crawlQueues.noloadCrawlJobSize();
        int allsize = coreCrawlJobSize + limitCrawlJobSize + remoteTriggeredCrawlJobSize + noloadCrawlJobSize;
        prop.put("localCrawlSize", coreCrawlJobSize);
        prop.put("localCrawlState", "");
        prop.put("limitCrawlSize", limitCrawlJobSize);
        prop.put("limitCrawlState", "");
        prop.put("remoteCrawlSize", remoteTriggeredCrawlJobSize);
        prop.put("remoteCrawlState", "");
        prop.put("noloadCrawlSize", noloadCrawlJobSize);
        prop.put("noloadCrawlState", "");
        prop.put("terminate-button", allsize == 0 ? 0 : 1);
        prop.put("list-remote", 0);
        prop.put("forwardToCrawlStart", "0");

        prop.put("info", "0");
        boolean debug = (post != null && post.containsKey("debug"));

        if (post != null) {
            String c = post.toString();
            if (c.length() < 1000) ConcurrentLog.info("Crawl Start", c);
        }

        if (post != null && post.containsKey("queues_terminate_all")) {
            // terminate crawls individually
            sb.crawlQueues.noticeURL.clear();
            for (final byte[] h : sb.crawler.getActive()) {
                CrawlProfile p = sb.crawler.getActive(h);
                // check the profile for null before it is dereferenced
                if (p != null && CrawlSwitchboard.DEFAULT_PROFILES.contains(p.name())) continue;
                if (p != null) sb.crawler.putPassive(h, p);
                sb.crawler.removeActive(h);
                sb.crawler.removePassive(h);
                if (p != null) try { sb.crawlQueues.noticeURL.removeByProfileHandle(p.handle(), 10000); } catch (SpaceExceededException e) {}
            }

            // clear stacks
            for (StackType stackType : StackType.values()) sb.crawlQueues.noticeURL.clear(stackType);
            try { sb.cleanProfiles(); } catch (final InterruptedException e) { /* ignore this */ }

            // remove pause
            sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
            sb.setConfig(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL + "_isPaused_cause", "");
            sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL);
            sb.setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL + "_isPaused_cause", "");
            prop.put("terminate-button", 0);
        }
        if (post != null && post.containsKey("continue")) {
            // continue queue
            final String queue = post.get("continue", "");
            if ("localcrawler".equals(queue)) {
                sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
                sb.setConfig(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL + "_isPaused_cause", "");
            } else if ("remotecrawler".equals(queue)) {
                sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL);
                sb.setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL + "_isPaused_cause", "");
            }
        }

        if (post != null && post.containsKey("pause")) {
            // pause queue
            final String queue = post.get("pause", "");
            if ("localcrawler".equals(queue)) {
                sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL, "user request in Crawler_p from " + header.refererHost());
            } else if ("remotecrawler".equals(queue)) {
                sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL, "user request in Crawler_p from " + header.refererHost());
            }
        }

        String queuemessage = sb.getConfig(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL + "_isPaused_cause", "");
        if (queuemessage.length() == 0) {
            prop.put("info-queue", 0);
        } else {
            prop.put("info-queue", 1);
            prop.putHTML("info-queue_message", "pause reason: " + queuemessage);
        }

        if (post != null && post.containsKey("terminate")) try {
            final String handle = post.get("handle", "");
            // termination of a crawl: shift the crawl from active to passive
            final CrawlProfile p = sb.crawler.getActive(handle.getBytes());
            if (p != null) sb.crawler.putPassive(handle.getBytes(), p);
            // delete all entries that belong to this profile from the crawl queue
            sb.crawler.removeActive(handle.getBytes());
            sb.crawler.removePassive(handle.getBytes());
            sb.crawlQueues.noticeURL.removeByProfileHandle(handle, 10000);
        } catch (final SpaceExceededException e) {
            ConcurrentLog.logException(e);
        }
        if (post != null && post.containsKey("crawlingstart")) {
            // init crawl
            if (sb.peers == null) {
                prop.put("info", "3");
            } else {

                // remove crawlingFileContent before we record the call
                String crawlingFileName = post.get("crawlingFile");
                final File crawlingFile;
                if (crawlingFileName == null || crawlingFileName.isEmpty()) {
                    crawlingFile = null;
                } else {
                    if (crawlingFileName.startsWith("file://")) crawlingFileName = crawlingFileName.substring(7);
                    crawlingFile = new File(crawlingFileName);
                }
                if (crawlingFile != null && crawlingFile.exists()) {
                    post.remove("crawlingFile$file");
                }

                // prepare some filters that are adjusted below if wanted
                boolean storeHTCache = "on".equals(post.get("storeHTCache", "off"));
                String newcrawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL_STRING);
                String newcrawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
                if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = CrawlProfile.MATCH_ALL_STRING; // avoid that all URLs are filtered out if a bad value was submitted
                boolean fullDomain = "domain".equals(post.get("range", "wide")); // special property in simple crawl start
                boolean subPath = "subpath".equals(post.get("range", "wide")); // special property in simple crawl start

                final boolean restrictedcrawl = fullDomain || subPath || !CrawlProfile.MATCH_ALL_STRING.equals(newcrawlingMustMatch);
                final boolean deleteage = restrictedcrawl && "age".equals(post.get("deleteold", "off"));
                Date deleteageDate = null;
                if (deleteage) {
                    deleteageDate = timeParser(true, post.getInt("deleteIfOlderNumber", -1), post.get("deleteIfOlderUnit", "year")); // year, month, day, hour
                }
                final boolean deleteold = (deleteage && deleteageDate != null) || (restrictedcrawl && post.getBoolean("deleteold"));

                final String sitemapURLStr = post.get("sitemapURL", "");
                String crawlingStart0 = post.get("crawlingURL", "").trim(); // the crawljob start url
                String[] rootURLs0 = crawlingStart0.indexOf('\n') > 0 || crawlingStart0.indexOf('\r') > 0 ? crawlingStart0.split("[\\r\\n]+") : crawlingStart0.split(Pattern.quote("|"));
                Set<DigestURL> rootURLs = new HashSet<DigestURL>();
                String crawlName = "";
                if (crawlingFile == null) for (String crawlingStart : rootURLs0) {
                    if (crawlingStart == null || crawlingStart.length() == 0) continue;
                    // add the prefix http:// if necessary
                    int pos = crawlingStart.indexOf("://", 0);
                    if (pos == -1) {
                        if (crawlingStart.startsWith("ftp")) crawlingStart = "ftp://" + crawlingStart; else crawlingStart = "http://" + crawlingStart;
                    }
                    try {
                        DigestURL crawlingStartURL = new DigestURL(crawlingStart);
                        rootURLs.add(crawlingStartURL);
                        crawlName += ((crawlingStartURL.getHost() == null) ? crawlingStartURL.toNormalform(true) : crawlingStartURL.getHost()) + ',';
                        if (crawlingStartURL != null && (crawlingStartURL.isFile() || crawlingStartURL.isSMB())) storeHTCache = false;
                    } catch (final MalformedURLException e) {
                        ConcurrentLog.logException(e);
                    }
                } else {
                    crawlName = crawlingFile.getName();
                }
                if (crawlName.endsWith(",")) crawlName = crawlName.substring(0, crawlName.length() - 1);
                if (crawlName.length() > 64) {
                    crawlName = "crawl_for_" + rootURLs.size() + "_start_points_" + Integer.toHexString(crawlName.hashCode());
                    int p = crawlName.lastIndexOf(',');
                    if (p >= 8) crawlName = crawlName.substring(0, p);
                }
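                // Illustration (hypothetical hosts): start URLs on example.org and example.net yield the
                // crawl name "example.org,example.net"; if the concatenated host list grows beyond 64
                // characters it is replaced by a generated name of the form "crawl_for_<n>_start_points_<hex>".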
                if (crawlName.length() == 0 && sitemapURLStr.length() > 0) crawlName = "sitemap loader for " + sitemapURLStr;

                // in case that a root url has a file protocol, the site filter does not work; patch that:
                if (fullDomain) {
                    for (DigestURL u : rootURLs) if (u.isFile()) { fullDomain = false; subPath = true; break; }
                }

                // delete old robots entries
                for (DigestURL ru : rootURLs) {
                    sb.robots.delete(ru);
                    try {
                        if (ru.getHost() != null) { // might be null for file://
                            Cache.delete(RobotsTxt.robotsURL(RobotsTxt.getHostPort(ru)).hash());
                        }
                    } catch (IOException e) {}
                }
                try { sb.robots.clear(); } catch (IOException e) {} // to be safe: clear all

                // set the crawl filter
                String ipMustMatch = post.get("ipMustmatch", CrawlProfile.MATCH_ALL_STRING);
                final String ipMustNotMatch = post.get("ipMustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
                if (ipMustMatch.length() < 2) ipMustMatch = CrawlProfile.MATCH_ALL_STRING;
                final String countryMustMatch = post.getBoolean("countryMustMatchSwitch") ? post.get("countryMustMatchList", "") : "";
                sb.setConfig("crawlingIPMustMatch", ipMustMatch);
                sb.setConfig("crawlingIPMustNotMatch", ipMustNotMatch);
                if (countryMustMatch.length() > 0) sb.setConfig("crawlingCountryMustMatch", countryMustMatch);

                String crawlerNoDepthLimitMatch = post.get("crawlingDepthExtension", CrawlProfile.MATCH_NEVER_STRING);
                final String indexUrlMustMatch = post.get("indexmustmatch", CrawlProfile.MATCH_ALL_STRING);
                final String indexUrlMustNotMatch = post.get("indexmustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
                final String indexContentMustMatch = post.get("indexcontentmustmatch", CrawlProfile.MATCH_ALL_STRING);
                final String indexContentMustNotMatch = post.get("indexcontentmustnotmatch", CrawlProfile.MATCH_NEVER_STRING);

                final boolean crawlOrder = post.get("crawlOrder", "off").equals("on");
                env.setConfig("crawlOrder", crawlOrder);

                if (crawlOrder) crawlerNoDepthLimitMatch = CrawlProfile.MATCH_NEVER_STRING; // without limitation the crawl order does not work

                int newcrawlingdepth = post.getInt("crawlingDepth", 8);
                env.setConfig("crawlingDepth", Integer.toString(newcrawlingdepth));
                if ((crawlOrder) && (newcrawlingdepth > 8)) newcrawlingdepth = 8;

                boolean directDocByURL = "on".equals(post.get("directDocByURL", "off")); // catch also all linked media documents without loading them
                env.setConfig("crawlingDirectDocByURL", directDocByURL);

                final String collection = post.get("collection", "user");
                env.setConfig("collection", collection);

                // recrawl
                final String recrawl = post.get("recrawl", "nodoubles"); // nodoubles, reload, scheduler
                Date crawlingIfOlder = null;
                if ("reload".equals(recrawl)) {
                    crawlingIfOlder = timeParser(true, post.getInt("reloadIfOlderNumber", -1), post.get("reloadIfOlderUnit", "year")); // year, month, day, hour
                }
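                // Illustration: reloadIfOlderNumber=7 with reloadIfOlderUnit="day" makes crawlingIfOlder a date
                // roughly seven days in the past (see timeParser below), so only documents last loaded before
                // that date are fetched again.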
                env.setConfig("crawlingIfOlder", crawlingIfOlder == null ? Long.MAX_VALUE : crawlingIfOlder.getTime());

                // store this call as api call
                sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + ((rootURLs.size() == 0) ? post.get("crawlingFile", "") : rootURLs.iterator().next().toNormalform(true)));

                final boolean crawlingDomMaxCheck = "on".equals(post.get("crawlingDomMaxCheck", "off"));
                final int crawlingDomMaxPages = (crawlingDomMaxCheck) ? post.getInt("crawlingDomMaxPages", -1) : -1;
                env.setConfig("crawlingDomMaxPages", Integer.toString(crawlingDomMaxPages));

                boolean followFrames = "on".equals(post.get("followFrames", "false"));
                env.setConfig("followFrames", followFrames);

                boolean obeyHtmlRobotsNoindex = "on".equals(post.get("obeyHtmlRobotsNoindex", "false"));
                env.setConfig("obeyHtmlRobotsNoindex", obeyHtmlRobotsNoindex);

                boolean obeyHtmlRobotsNofollow = "on".equals(post.get("obeyHtmlRobotsNofollow", "false"));
                env.setConfig("obeyHtmlRobotsNofollow", obeyHtmlRobotsNofollow);

                final boolean indexText = "on".equals(post.get("indexText", "false"));
                env.setConfig("indexText", indexText);

                final boolean indexMedia = "on".equals(post.get("indexMedia", "false"));
                env.setConfig("indexMedia", indexMedia);

                env.setConfig("storeHTCache", storeHTCache);

                String defaultAgentName = sb.isIntranetMode() ? ClientIdentification.yacyIntranetCrawlerAgentName : ClientIdentification.yacyInternetCrawlerAgentName;
                String agentName = post.get("agentName", defaultAgentName);
                ClientIdentification.Agent agent = ClientIdentification.getAgent(agentName);
                if (agent == null) agent = ClientIdentification.getAgent(defaultAgentName);

                CacheStrategy cachePolicy = CacheStrategy.parse(post.get("cachePolicy", "iffresh"));
                if (cachePolicy == null) cachePolicy = CacheStrategy.IFFRESH;

                String crawlingMode = post.get("crawlingMode", "url");
                if ("file".equals(crawlingMode) && post.containsKey("crawlingFile")) {
                    newcrawlingMustNotMatch = CrawlProfile.MATCH_NEVER_STRING;
                    directDocByURL = false;
                }
                if ("sitemap".equals(crawlingMode)) {
                    newcrawlingMustMatch = CrawlProfile.MATCH_ALL_STRING;
                    newcrawlingMustNotMatch = CrawlProfile.MATCH_NEVER_STRING;
                    newcrawlingdepth = 0;
                    directDocByURL = false;
                }
                if ("sitelist".equals(crawlingMode)) {
                    newcrawlingMustNotMatch = CrawlProfile.MATCH_NEVER_STRING;
                    Set<DigestURL> newRootURLs = new HashSet<DigestURL>();
                    for (DigestURL sitelistURL : rootURLs) {
                        // download document
                        Document scraper;
                        try {
                            scraper = sb.loader.loadDocument(sitelistURL, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, agent);
                            // get links and generate filter
                            for (DigestURL u : scraper.getAnchors()) {
                                newRootURLs.add(u);
                            }
                        } catch (final IOException e) {
                            ConcurrentLog.logException(e);
                        }
                    }
                    rootURLs = newRootURLs;
                    crawlingMode = "url";
                    if ((fullDomain || subPath) && newcrawlingdepth > 0) newcrawlingMustMatch = CrawlProfile.MATCH_ALL_STRING; // to prevent that there is a restriction on the original urls
                }
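                // Note: in "sitelist" mode each given URL is loaded once, the anchors found in it replace the
                // original root URLs, and the crawl then proceeds as a normal "url" crawl over those links.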
                // delete all error urls for that domain
                // and all urls for that host from the crawl queue
                Set<String> hosthashes = new HashSet<String>();
                boolean anysmbftporpdf = false;
                for (DigestURL u : rootURLs) {
                    sb.index.fulltext().remove(u.hash());
                    hosthashes.add(u.hosthash());
                    if ("smb.ftp".indexOf(u.getProtocol()) >= 0 || "pdf".equals(MultiProtocolURL.getFileExtension(u.getFileName()))) anysmbftporpdf = true;
                }
                sb.crawlQueues.removeHosts(hosthashes);
                sb.index.fulltext().commit(true);

                boolean crawlingQ = anysmbftporpdf || "on".equals(post.get("crawlingQ", "off")) || "sitemap".equals(crawlingMode);
                env.setConfig("crawlingQ", crawlingQ);

                // compute mustmatch filter according to rootURLs
                if ((fullDomain || subPath) && newcrawlingdepth > 0) {
                    String siteFilter = ".*";
                    if (fullDomain) {
                        siteFilter = CrawlProfile.siteFilter(rootURLs);
                        if (deleteold) {
                            sb.index.fulltext().deleteStaleDomainHashes(hosthashes, deleteageDate);
                        }
                    } else if (subPath) {
                        siteFilter = CrawlProfile.subpathFilter(rootURLs);
                        if (deleteold) {
                            for (DigestURL u : rootURLs) {
                                String basepath = u.toNormalform(true);
                                if (!basepath.endsWith("/")) { int p = basepath.lastIndexOf("/"); if (p > 0) basepath = basepath.substring(0, p + 1); }
                                int count = sb.index.fulltext().remove(basepath, deleteageDate);
                                if (count > 0) ConcurrentLog.info("Crawler_p", "deleted " + count + " documents for host " + u.getHost());
                            }
                        }
                    }
                    if (CrawlProfile.MATCH_ALL_STRING.equals(newcrawlingMustMatch)) {
                        newcrawlingMustMatch = siteFilter;
                    } else if (!CrawlProfile.MATCH_ALL_STRING.equals(siteFilter)) {
                        // combine both
                        newcrawlingMustMatch = "(" + newcrawlingMustMatch + ")|(" + siteFilter + ")";
                    }
                }
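                // Illustration: a user pattern such as ".*\.html" combined with a generated site filter becomes
                // "(.*\.html)|(<siteFilter>)", so a URL passes if it matches either expression; the exact filter
                // string is produced by CrawlProfile.siteFilter / CrawlProfile.subpathFilter above.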
                // check if the crawl filter works correctly
                try {
                    Pattern mmp = Pattern.compile(newcrawlingMustMatch);
                    for (DigestURL u : rootURLs) {
                        assert mmp.matcher(u.toNormalform(true)).matches() : "pattern " + mmp.toString() + " does not match url " + u.toNormalform(true);
                    }
                } catch (final PatternSyntaxException e) {
                    prop.put("info", "4"); // crawlfilter does not match url
                    prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
                    prop.putHTML("info_error", e.getMessage());
                }

                boolean hasCrawlstartDataOK = !crawlName.isEmpty();
                if (hasCrawlstartDataOK) {
                    // check that a crawl url was given in site crawl mode
                    if ("url".equals(crawlingMode) && rootURLs.size() == 0) hasCrawlstartDataOK = false;
                }

                String snapshotsMaxDepthString = post.get("snapshotsMaxDepth", "-1");
                int snapshotsMaxDepth = Integer.parseInt(snapshotsMaxDepthString);
                boolean snapshotsLoadImage = post.getBoolean("snapshotsLoadImage");
                boolean snapshotsReplaceOld = post.getBoolean("snapshotsReplaceOld");
                String snapshotsMustnotmatch = post.get("snapshotsMustnotmatch", "");

                // get vocabulary scraper info
                JSONObject vocabulary_scraper = new JSONObject(); // key = vocabulary_name, value = properties with key = type (i.e. 'class') and value = keyword in context
                for (String key : post.keySet()) {
                    if (key.startsWith("vocabulary_")) {
                        if (key.endsWith("_class")) {
                            String vocabulary = key.substring(11, key.length() - 6);
                            String value = post.get(key);
                            if (value != null && value.length() > 0) {
                                JSONObject props;
                                try {
                                    props = vocabulary_scraper.getJSONObject(vocabulary);
                                } catch (JSONException e) {
                                    props = new JSONObject();
                                    vocabulary_scraper.put(vocabulary, props);
                                }
                                props.put("class", value);
                            }
                        }
                    }
                }
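                // Illustration (hypothetical field names): a POST parameter "vocabulary_Locations_class" with the
                // value "locname" is collected above as {"Locations": {"class": "locname"}} and handed to the
                // VocabularyScraper of the crawl profile below.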
                int timezoneOffset = post.getInt("timezoneOffset", 0);

                // in case that we crawl from a file, load that file and re-compute mustmatch pattern
                List<AnchorURL> hyperlinks_from_file = null;
                if ("file".equals(crawlingMode) && post.containsKey("crawlingFile") && crawlingFile != null) {
                    final String crawlingFileContent = post.get("crawlingFile$file", "");
                    try {
                        // check if the crawl filter works correctly
                        final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000, new VocabularyScraper(), timezoneOffset);
                        final Writer writer = new TransformerWriter(null, null, scraper, null, false);
                        if (crawlingFile != null && crawlingFile.exists()) {
                            FileUtils.copy(new FileInputStream(crawlingFile), writer);
                        } else {
                            FileUtils.copy(crawlingFileContent, writer);
                        }
                        writer.close();
                        // get links and generate filter
                        hyperlinks_from_file = scraper.getAnchors();
                        if (newcrawlingdepth > 0) {
                            if (fullDomain) {
                                newcrawlingMustMatch = CrawlProfile.siteFilter(hyperlinks_from_file);
                            } else if (subPath) {
                                newcrawlingMustMatch = CrawlProfile.subpathFilter(hyperlinks_from_file);
                            }
                        }
                    } catch (final Exception e) {
                        // something went wrong
                        prop.put("info", "7"); // Error with file
                        prop.putHTML("info_crawlingStart", crawlingFileName);
                        prop.putHTML("info_error", e.getMessage());
                        ConcurrentLog.logException(e);
                    }
                    sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
                }
                // prepare a new crawling profile
                final CrawlProfile profile;
                byte[] handle;
                if (hasCrawlstartDataOK) {
                    profile = new CrawlProfile(
                            crawlName,
                            newcrawlingMustMatch,
                            newcrawlingMustNotMatch,
                            ipMustMatch,
                            ipMustNotMatch,
                            countryMustMatch,
                            crawlerNoDepthLimitMatch,
                            indexUrlMustMatch,
                            indexUrlMustNotMatch,
                            indexContentMustMatch,
                            indexContentMustNotMatch,
                            newcrawlingdepth,
                            directDocByURL,
                            crawlingIfOlder,
                            crawlingDomMaxPages,
                            crawlingQ, followFrames,
                            obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
                            indexText,
                            indexMedia,
                            storeHTCache,
                            crawlOrder,
                            snapshotsMaxDepth,
                            snapshotsLoadImage,
                            snapshotsReplaceOld,
                            snapshotsMustnotmatch,
                            cachePolicy,
                            collection,
                            agentName,
                            new VocabularyScraper(vocabulary_scraper),
                            timezoneOffset);
                    handle = ASCII.getBytes(profile.handle());

                    // before we fire up a new crawl, we make sure that another crawl with the same name is not running
                    sb.crawler.removeActive(handle);
                    sb.crawler.removePassive(handle);
                    try {
                        sb.crawlQueues.noticeURL.removeByProfileHandle(profile.handle(), 10000);
                    } catch (final SpaceExceededException e1) {}
                } else {
                    profile = null;
                    handle = null;
                }

                // start the crawl
                if ("url".equals(crawlingMode)) {
                    if (rootURLs.size() == 0) {
                        prop.put("info", "5"); // Crawling failed
                        prop.putHTML("info_crawlingURL", "(no url given)");
                        prop.putHTML("info_reasonString", "you must submit at least one crawl url");
                    } else {
                        // stack requests
                        sb.crawler.putActive(handle, profile);
                        final Set<DigestURL> successurls = new HashSet<DigestURL>();
                        final Map<DigestURL, String> failurls = new HashMap<DigestURL, String>();
                        sb.stackURLs(rootURLs, profile, successurls, failurls);
                        if (failurls.size() == 0) {
                            // liftoff!
                            prop.put("info", "8");
                            prop.putHTML("info_crawlingURL", post.get("crawlingURL"));

                            // generate a YaCyNews if the global flag was set
                            if (!sb.isRobinsonMode() && crawlOrder) {
                                final Map<String, String> m = new HashMap<String, String>(profile); // must be cloned
                                m.remove("specificDepth");
                                m.remove("indexText");
                                m.remove("indexMedia");
                                m.remove("remoteIndexing");
                                m.remove("xsstopw");
                                m.remove("xpstopw");
                                m.remove("xdstopw");
                                m.remove("storeTXCache");
                                m.remove("storeHTCache");
                                m.remove("generalFilter");
                                m.remove("specificFilter");
                                m.put("intention", post.get("intention", "").replace(',', '/'));
                                sb.peers.newsPool.publishMyNews(sb.peers.mySeed(), NewsPool.CATEGORY_CRAWL_START, m);
                            }
                        } else {
                            StringBuilder fr = new StringBuilder();
                            for (Map.Entry<DigestURL, String> failure : failurls.entrySet()) {
                                sb.crawlQueues.errorURL.push(failure.getKey(), 0, null, FailCategory.FINAL_LOAD_CONTEXT, failure.getValue(), -1);
                                fr.append(failure.getValue()).append('/');
                            }
                            prop.put("info", "5"); // Crawling failed
                            prop.putHTML("info_crawlingURL", (post.get("crawlingURL")));
                            prop.putHTML("info_reasonString", fr.toString());
                        }
                        if (successurls.size() > 0) sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
                    }
                } else if ("sitemap".equals(crawlingMode)) {
                    try {
                        final DigestURL sitemapURL = sitemapURLStr.indexOf("//") > 0 ? new DigestURL(sitemapURLStr) : new DigestURL(rootURLs.iterator().next(), sitemapURLStr); // fix for relative paths which should not exist but are used anyway
                        sb.crawler.putActive(handle, profile);
                        final SitemapImporter importer = new SitemapImporter(sb, sitemapURL, profile);
                        importer.start();
                        sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
                    } catch (final Exception e) {
                        // something went wrong
                        prop.put("info", "6"); // Error with url
                        prop.putHTML("info_crawlingStart", sitemapURLStr);
                        prop.putHTML("info_error", e.getMessage());
                        ConcurrentLog.logException(e);
                    }
                } else if ("file".equals(crawlingMode)) {
                    if (post.containsKey("crawlingFile") && crawlingFile != null && hyperlinks_from_file != null) {
                        try {
                            if (newcrawlingdepth > 0) {
                                if (fullDomain) {
                                    newcrawlingMustMatch = CrawlProfile.siteFilter(hyperlinks_from_file);
                                } else if (subPath) {
                                    newcrawlingMustMatch = CrawlProfile.subpathFilter(hyperlinks_from_file);
                                }
                            }
                            sb.crawler.putActive(handle, profile);
                            sb.crawlStacker.enqueueEntriesAsynchronous(sb.peers.mySeed().hash.getBytes(), profile.handle(), hyperlinks_from_file, profile.timezoneOffset());
                        } catch (final PatternSyntaxException e) {
                            prop.put("info", "4"); // crawlfilter does not match url
                            prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
                            prop.putHTML("info_error", e.getMessage());
                        } catch (final Exception e) {
                            // something went wrong
                            prop.put("info", "7"); // Error with file
                            prop.putHTML("info_crawlingStart", crawlingFileName);
                            prop.putHTML("info_error", e.getMessage());
                            ConcurrentLog.logException(e);
                        }
                        sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
                    }
                }
            }
        }
        /*
         * <input id="customPPM" name="customPPM" type="number" min="10" max="30000" style="width:46px" value="#[customPPMdefault]#" />PPM
           <input id="latencyFactor" name="latencyFactor" type="number" min="0.1" max="3.0" step="0.1" style="width:32px" value="#[latencyFactorDefault]#" />LF
           <input id="MaxSameHostInQueue" name="MaxSameHostInQueue" type="number" min="1" max="30" style="width:32px" value="#[MaxSameHostInQueueDefault]#" />MH
           <input type="submit" name="crawlingPerformance" value="set" />
           (<a href="/Crawler_p.html?crawlingPerformance=minimum">min</a> / <a href="/Crawler_p.html?crawlingPerformance=maximum">max</a>)
           </td>
         */
        if (post != null && post.containsKey("crawlingPerformance")) {
            final String crawlingPerformance = post.get("crawlingPerformance", "custom");
            final long LCbusySleep1 = sb.getConfigLong(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL_BUSYSLEEP, 1000L);
            int wantedPPM = (LCbusySleep1 == 0) ? 30000 : (int) (60000L / LCbusySleep1);
            try {
                wantedPPM = post.getInt("customPPM", wantedPPM);
            } catch (final NumberFormatException e) {}
            if ("minimum".equals(crawlingPerformance.toLowerCase())) wantedPPM = 10;
            if ("maximum".equals(crawlingPerformance.toLowerCase())) wantedPPM = 30000;
            int wPPM = wantedPPM;
            if (wPPM <= 0) {
                wPPM = 1;
            }
            if (wPPM >= 30000) {
                wPPM = 30000;
            }
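            // The requested pages-per-minute value is mapped to the busy-sleep time of the local crawl
            // thread: busySleep = 60000 ms / PPM (e.g. 600 PPM correspond to a 100 ms busy-sleep).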
            final int newBusySleep = 60000 / wPPM; // for wantedPPM = 10: 6000; for wantedPPM = 1000: 60
            final float loadprereq = wantedPPM <= 10 ? 1.0f : wantedPPM <= 100 ? 2.0f : wantedPPM >= 1000 ? 8.0f : 3.0f;
            BusyThread thread;
            thread = sb.getThread(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
            if (thread != null) {
                sb.setConfig(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL_BUSYSLEEP, thread.setBusySleep(newBusySleep));
                sb.setConfig(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL_LOADPREREQ, thread.setLoadPreReqisite(loadprereq));
                thread.setLoadPreReqisite(loadprereq);
                thread.setIdleSleep(2000);
            }
            float latencyFactor = post.getFloat("latencyFactor", 0.5f);
            int MaxSameHostInQueue = post.getInt("MaxSameHostInQueue", 20);
            env.setConfig(SwitchboardConstants.CRAWLER_LATENCY_FACTOR, latencyFactor);
            env.setConfig(SwitchboardConstants.CRAWLER_MAX_SAME_HOST_IN_QUEUE, MaxSameHostInQueue);
        }

        // performance settings
        final long LCbusySleep = env.getConfigLong(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL_BUSYSLEEP, 1000L);
        final int LCppm = (int) (60000L / Math.max(1, LCbusySleep));
        prop.put("customPPMdefault", Integer.toString(LCppm));
        prop.put("latencyFactorDefault", env.getConfigFloat(SwitchboardConstants.CRAWLER_LATENCY_FACTOR, 0.5f));
        prop.put("MaxSameHostInQueueDefault", env.getConfigInt(SwitchboardConstants.CRAWLER_MAX_SAME_HOST_IN_QUEUE, 20));
        // generate crawl profile table
        int count = 0;
        boolean dark = true;
        final int domlistlength = (post == null) ? 160 : post.getInt("domlistlength", 160);
        CrawlProfile profile;
        // put active crawls into list
        String hosts = "";
        for (final byte[] h : sb.crawler.getActive()) {
            profile = sb.crawler.getActive(h);
            if (CrawlSwitchboard.DEFAULT_PROFILES.contains(profile.name())) continue;
            profile.putProfileEntry("crawlProfilesShow_list_", prop, true, dark, count, domlistlength);
            prop.put("crawlProfilesShow_list_" + count + "_debug", debug ? 1 : 0);
            if (debug) {
                RowHandleSet urlhashes = sb.crawler.getURLHashes(h);
                prop.put("crawlProfilesShow_list_" + count + "_debug_count", urlhashes == null ? "unknown" : Integer.toString(urlhashes.size()));
            }
            hosts = hosts + "," + profile.name();
            dark = !dark;
            count++;
        }
        prop.put("crawlProfilesShow_debug", debug ? 1 : 0);
        prop.put("crawlProfilesShow_list", count);
        prop.put("crawlProfilesShow_count", count);
        prop.put("crawlProfilesShow", count == 0 ? 0 : 1);
        prop.put("crawlProfilesShow_linkstructure", 0);

        if (post != null) { // handle config button to display graphic
            if (post.get("hidewebstructuregraph") != null) sb.setConfig(SwitchboardConstants.DECORATION_GRAFICS_LINKSTRUCTURE, false);
            if (post.get("showwebstructuregraph") != null) sb.setConfig(SwitchboardConstants.DECORATION_GRAFICS_LINKSTRUCTURE, true);
        }

        if (count > 0 && sb.getConfigBool(SwitchboardConstants.DECORATION_GRAFICS_LINKSTRUCTURE, true)) {
            // collect the host names for 'wide' crawls which can be visualized
            boolean showLinkstructure = hosts.length() > 0 && !hosts.contains("file:");
            if (showLinkstructure) {
                StringBuilder q = new StringBuilder();
                hosts = hosts.substring(1);
                q.append(CollectionSchema.host_s.getSolrFieldName()).append(':').append(hosts).append(" OR ").append(CollectionSchema.host_s.getSolrFieldName()).append(':').append("www.").append(hosts);
                try {
                    prop.put("crawlProfilesShow_linkstructure", count == 1 && sb.index.fulltext().getDefaultConnector().getCountByQuery(q.toString()) > 0 ? 1 : 2);
                    prop.put("crawlProfilesShow_linkstructure_hosts", hosts);
                } catch (IOException e) {
                }
            }
        }

        // return rewrite properties
        return prop;
    }
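    /**
     * Compute a Date that lies the given number of time units in the past, e.g.
     * timeParser(true, 2, "month") returns a date roughly two average months ago.
     * Returns null if the check flag is false or the unit is unknown.
     */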
    private static Date timeParser(final boolean recrawlIfOlderCheck, final int number, final String unit) {
        if (!recrawlIfOlderCheck) return null;
        if ("year".equals(unit)) return new Date(System.currentTimeMillis() - number * AbstractFormatter.normalyearMillis);
        if ("month".equals(unit)) return new Date(System.currentTimeMillis() - number * AbstractFormatter.monthAverageMillis);
        if ("day".equals(unit)) return new Date(System.currentTimeMillis() - number * AbstractFormatter.dayMillis);
        if ("hour".equals(unit)) return new Date(System.currentTimeMillis() - number * AbstractFormatter.hourMillis);
        if ("minute".equals(unit)) return new Date(System.currentTimeMillis() - number * AbstractFormatter.minuteMillis);
        return null;
    }

}