// Crawler_p.java
// (C) 2006 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 18.12.2006 on http://www.anomic.de
// this file was created using an implementation from IndexCreate_p.java, published 02.12.2004
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

import java.io.File;
import java.io.Writer;
import java.net.MalformedURLException;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.TransformerWriter;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils;

import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.SitemapImporter;
import de.anomic.crawler.retrieval.Request;
import de.anomic.data.BookmarkHelper;
import de.anomic.data.WorkTables;
import de.anomic.data.bookmarksDB;
import de.anomic.data.listManager;
import de.anomic.search.Segment;
import de.anomic.search.Segments;
import de.anomic.search.Switchboard;
import de.anomic.search.SwitchboardConstants;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.yacy.yacyNewsPool;

public class Crawler_p {

    public static final String CRAWLING_MODE_URL = "url";
    public static final String CRAWLING_MODE_FILE = "file";
    public static final String CRAWLING_MODE_SITEMAP = "sitemap";

    // this servlet does NOT create the Crawler servlet page content!
    // this servlet starts a web crawl. The interface for entering the web crawl parameters is in IndexCreate_p.html
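    // Illustrative example of a crawl start request as this servlet reads it (parameter names are taken from
    // the post.get(...) calls below; the values here are made up):
    //   /Crawler_p.html?crawlingstart=1&crawlingMode=url&crawlingURL=http://example.org/
    //       &crawlingDepth=2&range=domain&crawlOrder=off&indexText=on&indexMedia=on&createBookmark=off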
    public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
        // return variable that accumulates replacements
        final Switchboard sb = (Switchboard) env;

        // initial values for AJAX Elements (without JavaScript)
        final serverObjects prop = new serverObjects();
        prop.put("rejected", 0);
        prop.put("urlpublictextSize", 0);
        prop.put("rwipublictextSize", 0);
        prop.put("list", "0");
        prop.put("loaderSize", 0);
        prop.put("loaderMax", 0);
        prop.put("list-loader", 0);
        prop.put("localCrawlSize", 0);
        prop.put("localCrawlState", "");
        prop.put("limitCrawlSize", 0);
        prop.put("limitCrawlState", "");
        prop.put("remoteCrawlSize", 0);
        prop.put("remoteCrawlState", "");
        prop.put("list-remote", 0);
        prop.put("forwardToCrawlStart", "0");

        // get segment
        Segment indexSegment = null;
        if (post != null && post.containsKey("segment")) {
            String segmentName = post.get("segment");
            if (sb.indexSegments.segmentExist(segmentName)) {
                indexSegment = sb.indexSegments.segment(segmentName);
            }
        } else {
            // take default segment
            indexSegment = sb.indexSegments.segment(Segments.Process.PUBLIC);
        }
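        // note: if a "segment" parameter was posted but no segment of that name exists, indexSegment stays null here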

        prop.put("info", "0");
        if (post != null) {
            // a crawl start

            if (post.containsKey("continue")) {
                // continue queue
                final String queue = post.get("continue", "");
                if (queue.equals("localcrawler")) {
                    sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
                } else if (queue.equals("remotecrawler")) {
                    sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL);
                }
            }

            if (post.containsKey("pause")) {
                // pause queue
                final String queue = post.get("pause", "");
                if (queue.equals("localcrawler")) {
                    sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
                } else if (queue.equals("remotecrawler")) {
                    sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL);
                }
            }

            if (post.containsKey("crawlingstart")) {
                // init crawl
                if (sb.peers == null) {
                    prop.put("info", "3");
                } else {
                    String crawlingStart = post.get("crawlingURL", "").trim(); // the crawljob start url

                    // add the prefix http:// if necessary
                    int pos = crawlingStart.indexOf("://");
                    if (pos == -1) crawlingStart = "http://" + crawlingStart;

                    // normalizing URL
                    DigestURI crawlingStartURL = null;
                    try { crawlingStartURL = new DigestURI(crawlingStart, null); } catch (final MalformedURLException e1) { }
                    crawlingStart = (crawlingStartURL == null) ? null : crawlingStartURL.toNormalform(true, true);

                    // set new properties
                    final boolean fullDomain = post.get("range", "wide").equals("domain"); // special property in simple crawl start
                    final boolean subPath = post.get("range", "wide").equals("subpath"); // special property in simple crawl start

                    // set the crawling filter
                    String newcrawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL);
                    String newcrawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER);
                    if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = CrawlProfile.MATCH_ALL; // avoid that all urls are filtered out if bad value was submitted

                    // special cases:
                    if (crawlingStartURL != null && fullDomain) {
                        newcrawlingMustMatch = ".*" + crawlingStartURL.getHost() + ".*";
                    }
                    if (crawlingStart != null && subPath && (pos = crawlingStart.lastIndexOf('/')) > 0) {
                        newcrawlingMustMatch = crawlingStart.substring(0, pos + 1) + ".*";
                    }
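                    // (illustration: for the start URL http://example.org/a/b, range=domain turns the must-match
                    // filter into ".*example.org.*", while range=subpath turns it into "http://example.org/a/.*")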

                    final boolean crawlOrder = post.get("crawlOrder", "off").equals("on");
                    env.setConfig("crawlOrder", (crawlOrder) ? "true" : "false");

                    int newcrawlingdepth = Integer.parseInt(post.get("crawlingDepth", "8"));
                    env.setConfig("crawlingDepth", Integer.toString(newcrawlingdepth));
                    if ((crawlOrder) && (newcrawlingdepth > 8)) newcrawlingdepth = 8;

                    // recrawl
                    final String recrawl = post.get("recrawl", "nodoubles"); // nodoubles, reload, scheduler
                    boolean crawlingIfOlderCheck = post.get("crawlingIfOlderCheck", "off").equals("on");
                    int crawlingIfOlderNumber = Integer.parseInt(post.get("crawlingIfOlderNumber", "-1"));
                    String crawlingIfOlderUnit = post.get("crawlingIfOlderUnit", "year"); // year, month, day, hour
                    int repeat_time = Integer.parseInt(post.get("repeat_time", "-1"));
                    final String repeat_unit = post.get("repeat_unit", "seldays"); // selminutes, selhours, seldays

                    if (recrawl.equals("scheduler") && repeat_time > 0) {
                        // set crawlingIfOlder attributes that are appropriate for scheduled crawling
                        crawlingIfOlderCheck = true;
                        crawlingIfOlderNumber = repeat_unit.equals("selminutes") ? 1 : repeat_unit.equals("selhours") ? repeat_time / 2 : repeat_time * 12;
                        crawlingIfOlderUnit = "hour";
                    } else if (recrawl.equals("reload")) {
                        repeat_time = -1;
                        crawlingIfOlderCheck = true;
                    } else if (recrawl.equals("nodoubles")) {
                        repeat_time = -1;
                        crawlingIfOlderCheck = false;
                    }
                    long crawlingIfOlder = recrawlIfOlderC(crawlingIfOlderCheck, crawlingIfOlderNumber, crawlingIfOlderUnit);
                    env.setConfig("crawlingIfOlder", crawlingIfOlder);
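                    // (recrawl modes in short: "scheduler" re-fetches documents older than roughly half the repeat
                    // interval expressed in hours, "reload" uses the posted crawlingIfOlder* values, and "nodoubles"
                    // never re-fetches already known URLs)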

                    // store this call as api call
                    if (repeat_time > 0) {
                        // store as scheduled api call
                        sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + crawlingStart, repeat_time, repeat_unit.substring(3));
                    } else {
                        // store just a protocol
                        sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + crawlingStart);
                    }

                    final boolean crawlingDomFilterCheck = post.get("crawlingDomFilterCheck", "off").equals("on");
                    final int crawlingDomFilterDepth = (crawlingDomFilterCheck) ? Integer.parseInt(post.get("crawlingDomFilterDepth", "-1")) : -1;
                    env.setConfig("crawlingDomFilterDepth", Integer.toString(crawlingDomFilterDepth));

                    final boolean crawlingDomMaxCheck = post.get("crawlingDomMaxCheck", "off").equals("on");
                    final int crawlingDomMaxPages = (crawlingDomMaxCheck) ? Integer.parseInt(post.get("crawlingDomMaxPages", "-1")) : -1;
                    env.setConfig("crawlingDomMaxPages", Integer.toString(crawlingDomMaxPages));

                    final boolean crawlingQ = post.get("crawlingQ", "off").equals("on");
                    env.setConfig("crawlingQ", (crawlingQ) ? "true" : "false");

                    final boolean indexText = post.get("indexText", "off").equals("on");
                    env.setConfig("indexText", (indexText) ? "true" : "false");

                    final boolean indexMedia = post.get("indexMedia", "off").equals("on");
                    env.setConfig("indexMedia", (indexMedia) ? "true" : "false");

                    final boolean storeHTCache = post.get("storeHTCache", "off").equals("on");
                    env.setConfig("storeHTCache", (storeHTCache) ? "true" : "false");

                    final String cachePolicyString = post.get("cachePolicy", "iffresh");
                    CrawlProfile.CacheStrategy cachePolicy = CrawlProfile.CacheStrategy.IFFRESH;
                    if (cachePolicyString.equals("nocache")) cachePolicy = CrawlProfile.CacheStrategy.NOCACHE;
                    if (cachePolicyString.equals("iffresh")) cachePolicy = CrawlProfile.CacheStrategy.IFFRESH;
                    if (cachePolicyString.equals("ifexist")) cachePolicy = CrawlProfile.CacheStrategy.IFEXIST;
                    if (cachePolicyString.equals("cacheonly")) cachePolicy = CrawlProfile.CacheStrategy.CACHEONLY;

                    final boolean xsstopw = post.get("xsstopw", "off").equals("on");
                    env.setConfig("xsstopw", (xsstopw) ? "true" : "false");

                    final boolean xdstopw = post.get("xdstopw", "off").equals("on");
                    env.setConfig("xdstopw", (xdstopw) ? "true" : "false");

                    final boolean xpstopw = post.get("xpstopw", "off").equals("on");
                    env.setConfig("xpstopw", (xpstopw) ? "true" : "false");

                    final String crawlingMode = post.get("crawlingMode", "url");

                    if (crawlingMode.equals(CRAWLING_MODE_URL)) {

                        // check if pattern matches
                        if ((crawlingStart == null || crawlingStartURL == null) /* || (!(crawlingStart.matches(newcrawlingfilter))) */) {
                            // print error message
                            prop.put("info", "4"); // crawlfilter does not match url
                            prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
                            prop.putHTML("info_crawlingStart", crawlingStart);
                        } else try {

                            // check if the crawl filter works correctly
                            Pattern.compile(newcrawlingMustMatch);

                            // stack request
                            // first delete old entry, if exists
                            final DigestURI url = new DigestURI(crawlingStart, null);
                            final byte[] urlhash = url.hash();
                            indexSegment.urlMetadata().remove(urlhash);
                            sb.crawlQueues.noticeURL.removeByURLHash(urlhash);
                            sb.crawlQueues.errorURL.remove(urlhash);

                            // stack url
                            sb.crawler.profilesPassiveCrawls.remove(crawlingStartURL.hash()); // if there is an old entry, delete it
                            final CrawlProfile pe = new CrawlProfile(
                                    (crawlingStartURL.getHost() == null) ? Long.toHexString(System.currentTimeMillis()) : crawlingStartURL.getHost(),
                                    crawlingStartURL,
                                    newcrawlingMustMatch,
                                    newcrawlingMustNotMatch,
                                    newcrawlingdepth,
                                    crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages,
                                    crawlingQ,
                                    indexText, indexMedia,
                                    storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw, cachePolicy);
                            sb.crawler.profilesActiveCrawls.put(pe.handle().getBytes(), pe);
                            final String reasonString = sb.crawlStacker.stackCrawl(new Request(
                                    sb.peers.mySeed().hash.getBytes(),
                                    url,
                                    null,
                                    "CRAWLING-ROOT",
                                    new Date(),
                                    pe.handle(),
                                    0,
                                    0,
                                    0
                                    ));
                            if (reasonString == null) {
                                // create a bookmark from crawl start url
                                Set<String> tags = listManager.string2set(BookmarkHelper.cleanTagsString(post.get("bookmarkFolder", "/crawlStart")));
                                tags.add("crawlStart");
                                if (post.get("createBookmark", "off").equals("on")) {
                                    bookmarksDB.Bookmark bookmark = sb.bookmarksDB.createBookmark(crawlingStart, "admin");
                                    if (bookmark != null) {
                                        bookmark.setProperty(bookmarksDB.Bookmark.BOOKMARK_TITLE, post.get("bookmarkTitle", crawlingStart));
                                        bookmark.setOwner("admin");
                                        bookmark.setPublic(false);
                                        bookmark.setTags(tags, true);
                                        sb.bookmarksDB.saveBookmark(bookmark);
                                    }
                                }

                                // liftoff!
                                prop.put("info", "8"); // start msg
                                prop.putHTML("info_crawlingURL", (post.get("crawlingURL")));

                                // generate a YaCyNews if the global flag was set
                                if (crawlOrder) {
                                    final Map<String, String> m = new HashMap<String, String>(pe); // must be cloned
                                    m.remove("specificDepth");
                                    m.remove("indexText");
                                    m.remove("indexMedia");
                                    m.remove("remoteIndexing");
                                    m.remove("xsstopw");
                                    m.remove("xpstopw");
                                    m.remove("xdstopw");
                                    m.remove("storeTXCache");
                                    m.remove("storeHTCache");
                                    m.remove("generalFilter");
                                    m.remove("specificFilter");
                                    m.put("intention", post.get("intention", "").replace(',', '/'));
                                    sb.peers.newsPool.publishMyNews(sb.peers.mySeed(), yacyNewsPool.CATEGORY_CRAWL_START, m);
                                }
                            } else {
                                prop.put("info", "5"); // Crawling failed
                                prop.putHTML("info_crawlingURL", (post.get("crawlingURL")));
                                prop.putHTML("info_reasonString", reasonString);

                                sb.crawlQueues.errorURL.push(
                                        new Request(
                                                sb.peers.mySeed().hash.getBytes(),
                                                crawlingStartURL,
                                                null,
                                                "",
                                                new Date(),
                                                pe.handle(),
                                                0,
                                                0,
                                                0),
                                        sb.peers.mySeed().hash.getBytes(),
                                        new Date(),
                                        1,
                                        reasonString);
                            }
                        } catch (final PatternSyntaxException e) {
                            prop.put("info", "4"); // crawlfilter does not match url
                            prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
                            prop.putHTML("info_error", e.getMessage());
                        } catch (final Exception e) {
                            // something went wrong
                            prop.put("info", "6"); // Error with url
                            prop.putHTML("info_crawlingStart", crawlingStart);
                            prop.putHTML("info_error", e.getMessage());
                            Log.logException(e);
                        }

                    } else if (crawlingMode.equals(CRAWLING_MODE_FILE)) {
                        if (post.containsKey("crawlingFile")) {
                            // getting the name of the uploaded file
                            final String fileName = post.get("crawlingFile");
                            try {
                                // check if the crawl filter works correctly
                                Pattern.compile(newcrawlingMustMatch);

                                // loading the file content
                                final File file = new File(fileName);

                                // getting the content of the bookmark file
                                final String fileString = post.get("crawlingFile$file");

                                // parsing the bookmark file and fetching the headline and contained links
                                final ContentScraper scraper = new ContentScraper(new DigestURI(file));
                                //OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
                                final Writer writer = new TransformerWriter(null, null, scraper, null, false);
                                FileUtils.copy(fileString, writer);
                                writer.close();
                                //String headline = scraper.getHeadline();

                                final Map<MultiProtocolURI, String> hyperlinks = scraper.getAnchors();

                                // creating a crawler profile
                                final DigestURI crawlURL = new DigestURI("file://" + file.toString(), null);
                                final CrawlProfile profile = new CrawlProfile(
                                        fileName, crawlURL,
                                        newcrawlingMustMatch,
                                        CrawlProfile.MATCH_NEVER,
                                        newcrawlingdepth,
                                        crawlingIfOlder,
                                        crawlingDomFilterDepth,
                                        crawlingDomMaxPages,
                                        crawlingQ,
                                        indexText,
                                        indexMedia,
                                        storeHTCache,
                                        true,
                                        crawlOrder,
                                        xsstopw, xdstopw, xpstopw,
                                        cachePolicy);
                                sb.crawler.profilesActiveCrawls.put(profile.handle().getBytes(), profile);

                                // pause local crawl here
                                sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);

                                // loop through the contained links
                                final Iterator<Map.Entry<MultiProtocolURI, String>> linkiterator = hyperlinks.entrySet().iterator();
                                DigestURI nexturl;
                                while (linkiterator.hasNext()) {
                                    final Map.Entry<MultiProtocolURI, String> e = linkiterator.next();
                                    if (e.getKey() == null) continue;
                                    nexturl = new DigestURI(e.getKey());

                                    // enqueuing the url for crawling
                                    sb.crawlStacker.enqueueEntry(new Request(
                                            sb.peers.mySeed().hash.getBytes(),
                                            nexturl,
                                            null,
                                            e.getValue(),
                                            new Date(),
                                            profile.handle(),
                                            0,
                                            0,
                                            0
                                            ));
                                }

                            } catch (final PatternSyntaxException e) {
                                // print error message
                                prop.put("info", "4"); // crawlfilter does not match url
                                prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
                                prop.putHTML("info_error", e.getMessage());
                            } catch (final Exception e) {
                                // something went wrong
                                prop.put("info", "7"); // Error with file
                                prop.putHTML("info_crawlingStart", fileName);
                                prop.putHTML("info_error", e.getMessage());
                                Log.logException(e);
                            }
                            sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
                        }

                    } else if (crawlingMode.equals(CRAWLING_MODE_SITEMAP)) {
                        String sitemapURLStr = null;
                        try {
                            // getting the sitemap URL
                            sitemapURLStr = post.get("sitemapURL", "");
                            final DigestURI sitemapURL = new DigestURI(sitemapURLStr, null);

                            // create a new profile
                            final CrawlProfile pe = new CrawlProfile(
                                    sitemapURLStr, sitemapURL,
                                    newcrawlingMustMatch,
                                    CrawlProfile.MATCH_NEVER,
                                    newcrawlingdepth,
                                    crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages,
                                    crawlingQ,
                                    indexText, indexMedia,
                                    storeHTCache, true, crawlOrder,
                                    xsstopw, xdstopw, xpstopw,
                                    cachePolicy);
                            sb.crawler.profilesActiveCrawls.put(pe.handle().getBytes(), pe);

                            // create a new sitemap importer
                            final SitemapImporter importer = new SitemapImporter(sb, new DigestURI(sitemapURLStr, null), pe);
                            importer.start();
                        } catch (final Exception e) {
                            // something went wrong
                            prop.put("info", "6"); // Error with url
                            prop.putHTML("info_crawlingStart", sitemapURLStr);
                            prop.putHTML("info_error", e.getMessage());
                            Log.logException(e);
                        }
                    }
                }
            }

            if (post.containsKey("crawlingPerformance")) {
                setPerformance(sb, post);
            }
        }

        // performance settings
        final long LCbusySleep = Integer.parseInt(env.getConfig(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL_BUSYSLEEP, "1000"));
        final int LCppm = (int) (60000L / Math.max(1, LCbusySleep));
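        // example: LCppm estimates pages per minute from the busy-sleep value, so a busy-sleep of 1000 ms
        // corresponds to about 60 PPM; the checkboxes below mark "maximum" at >= 30000 PPM and "minimum" at <= 10 PPM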
        prop.put("crawlingSpeedMaxChecked", (LCppm >= 30000) ? "1" : "0");
        prop.put("crawlingSpeedCustChecked", ((LCppm > 10) && (LCppm < 30000)) ? "1" : "0");
        prop.put("crawlingSpeedMinChecked", (LCppm <= 10) ? "1" : "0");
        prop.put("customPPMdefault", Integer.toString(LCppm));

        // return rewrite properties
        return prop;
    }

    private static long recrawlIfOlderC(final boolean recrawlIfOlderCheck, final int recrawlIfOlderNumber, final String crawlingIfOlderUnit) {
        if (!recrawlIfOlderCheck) return 0L;
        if (crawlingIfOlderUnit.equals("year")) return System.currentTimeMillis() - (long) recrawlIfOlderNumber * 1000L * 60L * 60L * 24L * 365L;
        if (crawlingIfOlderUnit.equals("month")) return System.currentTimeMillis() - (long) recrawlIfOlderNumber * 1000L * 60L * 60L * 24L * 30L;
        if (crawlingIfOlderUnit.equals("day")) return System.currentTimeMillis() - (long) recrawlIfOlderNumber * 1000L * 60L * 60L * 24L;
        if (crawlingIfOlderUnit.equals("hour")) return System.currentTimeMillis() - (long) recrawlIfOlderNumber * 1000L * 60L * 60L;
        return System.currentTimeMillis() - (long) recrawlIfOlderNumber;
    }
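    // (illustration: recrawlIfOlderC(true, 7, "day") returns "now minus 7 days" in epoch milliseconds, i.e. documents
    // fetched before that instant are considered stale; with the check disabled the method returns 0L)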

    private static void setPerformance(final Switchboard sb, final serverObjects post) {
        final String crawlingPerformance = post.get("crawlingPerformance", "custom");
        final long LCbusySleep = Integer.parseInt(sb.getConfig(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL_BUSYSLEEP, "1000"));
        int wantedPPM = (LCbusySleep == 0) ? 30000 : (int) (60000L / LCbusySleep);
        try {
            wantedPPM = Integer.parseInt(post.get("customPPM", Integer.toString(wantedPPM)));
        } catch (final NumberFormatException e) { }
        if (crawlingPerformance.toLowerCase().equals("minimum")) wantedPPM = 10;
        if (crawlingPerformance.toLowerCase().equals("maximum")) wantedPPM = 30000;
        sb.setPerformance(wantedPPM);
    }
}