// CrawlProfile.java
// ------------------------
// part of YaCy
// (C) by Michael Peter Christen; mc@yacy.net
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

package net.yacy.crawler.data;

import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.order.Digest;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.CrawlSwitchboard;
import net.yacy.document.VocabularyScraper;
import net.yacy.kelondro.data.word.Word;
import net.yacy.search.query.QueryParams;
import net.yacy.server.serverObjects;

public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String> {

    private static final long serialVersionUID = 5527325718810703504L;

    public static final String  MATCH_ALL_STRING    = ".*";
    public static final String  MATCH_NEVER_STRING  = "";
    public static final Pattern MATCH_ALL_PATTERN   = Pattern.compile(MATCH_ALL_STRING);
    public static final Pattern MATCH_NEVER_PATTERN = Pattern.compile(MATCH_NEVER_STRING);
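
    // Note: MATCH_NEVER_STRING compiles to a pattern that matches only the empty
    // string; since the URLs, IPs and content tested against it are non-empty,
    // it effectively never matches.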

    public static final String CRAWL_PROFILE_PUSH_STUB = "push_";

    // this is a simple record structure that holds all properties of a single crawl start
    private static final String HANDLE           = "handle";
    public static final String AGENT_NAME        = "agentName";
    public static final String NAME              = "name";
    public static final String DEPTH             = "generalDepth";
    public static final String DIRECT_DOC_BY_URL = "directDocByURL";
    public static final String RECRAWL_IF_OLDER  = "recrawlIfOlder";
    public static final String DOM_MAX_PAGES     = "domMaxPages";
    public static final String CRAWLING_Q        = "crawlingQ";
    public static final String FOLLOW_FRAMES     = "followFrames";
    public static final String OBEY_HTML_ROBOTS_NOINDEX  = "obeyHtmlRobotsNoindex";
    public static final String OBEY_HTML_ROBOTS_NOFOLLOW = "obeyHtmlRobotsNofollow";
    public static final String INDEX_TEXT        = "indexText";
    public static final String INDEX_MEDIA       = "indexMedia";
    public static final String STORE_HTCACHE     = "storeHTCache";
    public static final String REMOTE_INDEXING   = "remoteIndexing";
    public static final String CACHE_STRAGEGY    = "cacheStrategy";
    public static final String COLLECTIONS       = "collections";
    public static final String SCRAPER           = "scraper";
    public static final String CRAWLER_URL_MUSTMATCH         = "crawlerURLMustMatch";
    public static final String CRAWLER_URL_MUSTNOTMATCH      = "crawlerURLMustNotMatch";
    public static final String CRAWLER_IP_MUSTMATCH          = "crawlerIPMustMatch";
    public static final String CRAWLER_IP_MUSTNOTMATCH       = "crawlerIPMustNotMatch";
    public static final String CRAWLER_COUNTRY_MUSTMATCH     = "crawlerCountryMustMatch";
    public static final String CRAWLER_URL_NODEPTHLIMITMATCH = "crawlerNoLimitURLMustMatch";
    public static final String INDEXING_URL_MUSTMATCH        = "indexURLMustMatch";
    public static final String INDEXING_URL_MUSTNOTMATCH     = "indexURLMustNotMatch";
    public static final String INDEXING_CONTENT_MUSTMATCH    = "indexContentMustMatch";
    public static final String INDEXING_CONTENT_MUSTNOTMATCH = "indexContentMustNotMatch";
    public static final String SNAPSHOTS_MAXDEPTH   = "snapshotsMaxDepth";   // if previews shall be loaded, this is positive and denotes the maximum depth; if not, this is -1
    public static final String SNAPSHOTS_REPLACEOLD = "snapshotsReplaceOld"; // if this is set to true, only one version of a snapshot per day is stored, otherwise we also store different versions per day
    public static final String SNAPSHOTS_LOADIMAGE  = "snapshotsLoadImage";  // if true, an image is loaded

    private Pattern crawlerurlmustmatch = null, crawlerurlmustnotmatch = null;
    private Pattern crawleripmustmatch = null, crawleripmustnotmatch = null;
    private Pattern crawlernodepthlimitmatch = null;
    private Pattern indexurlmustmatch = null, indexurlmustnotmatch = null;
    private Pattern indexcontentmustmatch = null, indexcontentmustnotmatch = null;

    private final Map<String, AtomicInteger> doms;
    private final VocabularyScraper scraper;

    /**
     * Constructor which creates a CrawlProfile from parameters.
     * @param name name of the crawl profile
     * @param crawlerUrlMustMatch URLs which do not match this regex will be ignored by the crawler
     * @param crawlerUrlMustNotMatch URLs which match this regex will be ignored by the crawler
     * @param crawlerIpMustMatch IPs from URLs which do not match this regex will be ignored by the crawler
     * @param crawlerIpMustNotMatch IPs from URLs which match this regex will be ignored by the crawler
     * @param crawlerCountryMustMatch a comma-separated list of country codes which the location of the URL's IP must match
     * @param crawlerNoDepthLimitMatch if this matches, no depth limit is applied to the crawler
     * @param indexUrlMustMatch URLs which do not match this regex will be ignored for indexing
     * @param indexUrlMustNotMatch URLs which match this regex will be ignored for indexing
     * @param indexContentMustMatch content which does not match this regex will be ignored for indexing
     * @param indexContentMustNotMatch content which matches this regex will be ignored for indexing
     * @param depth height of the tree which will be created by the crawler
     * @param directDocByURL if true, then linked documents that cannot be parsed are indexed as documents
     * @param recrawlIfOlder documents which have been indexed in the past will be indexed again if they are older than the given date
     * @param domMaxPages maximum number of pages from one domain which will be indexed
     * @param crawlingQ true if URLs containing question marks shall be indexed
     * @param followFrames true if pages embedded in frames or iframes shall be loaded
     * @param obeyHtmlRobotsNoindex true if a meta robots tag containing 'noindex' shall be obeyed
     * @param obeyHtmlRobotsNofollow true if a meta robots tag containing 'nofollow' shall be obeyed
     * @param indexText true if the text content of the URL shall be indexed
     * @param indexMedia true if the media content of the URL shall be indexed
     * @param storeHTCache true if content shall be kept in the cache after indexing
     * @param remoteIndexing true if part of the crawl job shall be distributed
     * @param snapshotsMaxDepth if snapshots shall be loaded, the maximum depth for them; -1 otherwise
     * @param snapshotsLoadImage true if an image shall be loaded for snapshots
     * @param snapshotsReplaceOld if true, only one version of a snapshot per day is stored
     * @param cacheStrategy determines if and how the cache is used when loading content
     * @param collections a comma-separated list of tags which are attached to index entries
     * @param userAgentName the name of the user agent to be used for crawling
     * @param scraper a vocabulary scraper; may be null, in which case an empty one is created
     */
    public CrawlProfile(
                 String name,
                 final String crawlerUrlMustMatch, final String crawlerUrlMustNotMatch,
                 final String crawlerIpMustMatch, final String crawlerIpMustNotMatch,
                 final String crawlerCountryMustMatch, final String crawlerNoDepthLimitMatch,
                 final String indexUrlMustMatch, final String indexUrlMustNotMatch,
                 final String indexContentMustMatch, final String indexContentMustNotMatch,
                 final int depth,
                 final boolean directDocByURL,
                 final Date recrawlIfOlder /*date*/,
                 final int domMaxPages,
                 final boolean crawlingQ, final boolean followFrames,
                 final boolean obeyHtmlRobotsNoindex, final boolean obeyHtmlRobotsNofollow,
                 final boolean indexText,
                 final boolean indexMedia,
                 final boolean storeHTCache,
                 final boolean remoteIndexing,
                 final int snapshotsMaxDepth,
                 final boolean snapshotsLoadImage,
                 final boolean snapshotsReplaceOld,
                 final CacheStrategy cacheStrategy,
                 final String collections,
                 final String userAgentName,
                 final VocabularyScraper scraper) {
        super(40);
        if (name == null || name.isEmpty()) {
            throw new NullPointerException("name must not be null or empty");
        }
        if (name.length() > 256) name = name.substring(0, 256); // truncate overlong names; substring(256) would have kept only the tail
        this.doms = new ConcurrentHashMap<String, AtomicInteger>();
        final String handle = Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(name + crawlerUrlMustMatch + depth + crawlerUrlMustNotMatch + domMaxPages + collections)).substring(0, Word.commonHashLength);
        put(HANDLE, handle);
        put(NAME, name);
        put(AGENT_NAME, userAgentName);
        put(CRAWLER_URL_MUSTMATCH, (crawlerUrlMustMatch == null) ? CrawlProfile.MATCH_ALL_STRING : crawlerUrlMustMatch);
        put(CRAWLER_URL_MUSTNOTMATCH, (crawlerUrlMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerUrlMustNotMatch);
        put(CRAWLER_IP_MUSTMATCH, (crawlerIpMustMatch == null) ? CrawlProfile.MATCH_ALL_STRING : crawlerIpMustMatch);
        put(CRAWLER_IP_MUSTNOTMATCH, (crawlerIpMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerIpMustNotMatch);
        put(CRAWLER_COUNTRY_MUSTMATCH, (crawlerCountryMustMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerCountryMustMatch);
        put(CRAWLER_URL_NODEPTHLIMITMATCH, (crawlerNoDepthLimitMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerNoDepthLimitMatch);
        put(INDEXING_URL_MUSTMATCH, (indexUrlMustMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : indexUrlMustMatch);
        put(INDEXING_URL_MUSTNOTMATCH, (indexUrlMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : indexUrlMustNotMatch);
        put(INDEXING_CONTENT_MUSTMATCH, (indexContentMustMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : indexContentMustMatch);
        put(INDEXING_CONTENT_MUSTNOTMATCH, (indexContentMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : indexContentMustNotMatch);
        put(DEPTH, depth);
        put(DIRECT_DOC_BY_URL, directDocByURL);
        put(RECRAWL_IF_OLDER, recrawlIfOlder == null ? Long.MAX_VALUE : recrawlIfOlder.getTime());
        put(DOM_MAX_PAGES, domMaxPages);
        put(CRAWLING_Q, crawlingQ); // crawling of urls with '?'
        put(FOLLOW_FRAMES, followFrames); // load pages contained in frames or iframes
        put(OBEY_HTML_ROBOTS_NOINDEX, obeyHtmlRobotsNoindex); // if false, then a meta robots tag containing 'noindex' is ignored
        put(OBEY_HTML_ROBOTS_NOFOLLOW, obeyHtmlRobotsNofollow);
        put(INDEX_TEXT, indexText);
        put(INDEX_MEDIA, indexMedia);
        put(STORE_HTCACHE, storeHTCache);
        put(REMOTE_INDEXING, remoteIndexing);
        put(SNAPSHOTS_MAXDEPTH, snapshotsMaxDepth);
        put(SNAPSHOTS_LOADIMAGE, snapshotsLoadImage);
        put(SNAPSHOTS_REPLACEOLD, snapshotsReplaceOld);
        put(CACHE_STRAGEGY, cacheStrategy.toString());
        put(COLLECTIONS, CommonPattern.SPACE.matcher(collections.trim()).replaceAll(""));
        // we transform the scraper information into a JSON object string
        this.scraper = scraper == null ? new VocabularyScraper() : scraper;
        String jsonString = this.scraper.toString();
        assert jsonString != null && jsonString.length() > 0 && jsonString.charAt(0) == '{' : "jsonString = " + jsonString;
        put(SCRAPER, jsonString);
    }
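
    /*
     * Usage sketch with illustrative values (the values below are examples only,
     * not taken from a real crawl start):
     *
     *   CrawlProfile p = new CrawlProfile(
     *       "example",                              // name
     *       MATCH_ALL_STRING, MATCH_NEVER_STRING,   // crawler URL must (not) match
     *       MATCH_ALL_STRING, MATCH_NEVER_STRING,   // crawler IP must (not) match
     *       MATCH_NEVER_STRING, MATCH_NEVER_STRING, // country match, no-depth-limit match
     *       MATCH_ALL_STRING, MATCH_NEVER_STRING,   // index URL must (not) match
     *       MATCH_ALL_STRING, MATCH_NEVER_STRING,   // index content must (not) match
     *       3,                                      // depth
     *       true,                                   // directDocByURL
     *       null,                                   // recrawlIfOlder date (null stores Long.MAX_VALUE)
     *       -1,                                     // domMaxPages (negative means no limit)
     *       true, true,                             // crawlingQ, followFrames
     *       true, true,                             // obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow
     *       true, true,                             // indexText, indexMedia
     *       false, false,                           // storeHTCache, remoteIndexing
     *       -1, false, false,                       // snapshotsMaxDepth, snapshotsLoadImage, snapshotsReplaceOld
     *       CacheStrategy.IFEXIST,                  // cacheStrategy
     *       "user",                                 // collections
     *       "yacybot",                              // userAgentName (hypothetical agent name)
     *       null);                                  // scraper (null creates an empty VocabularyScraper)
     */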

    /**
     * Constructor which creates a CrawlProfile from values in a Map.
     * @param ext contains values
     */
    public CrawlProfile(final Map<String, String> ext) {
        super(ext == null ? 1 : ext.size());
        if (ext != null) putAll(ext);
        this.doms = new ConcurrentHashMap<String, AtomicInteger>();
        // guard against ext == null, which the lines above already tolerate
        String jsonString = ext == null ? null : ext.get(SCRAPER);
        this.scraper = jsonString == null || jsonString.length() == 0 ? new VocabularyScraper() : new VocabularyScraper(jsonString);
    }

    public VocabularyScraper scraper() {
        return this.scraper;
    }

    public void domInc(final String domain) {
        final AtomicInteger dp = this.doms.get(domain);
        if (dp == null) {
            // new domain
            this.doms.put(domain, new AtomicInteger(1));
        } else {
            // increase counter
            dp.incrementAndGet();
        }
    }

    private String domName(final boolean attr, final int index) {
        final Iterator<Map.Entry<String, AtomicInteger>> domnamesi = this.doms.entrySet().iterator();
        String domname = "";
        Map.Entry<String, AtomicInteger> ey;
        AtomicInteger dp;
        int i = 0;
        while ((domnamesi.hasNext()) && (i < index)) {
            domnamesi.next();
            i++;
        }
        if (domnamesi.hasNext()) {
            ey = domnamesi.next();
            dp = ey.getValue();
            domname = ey.getKey() + ((attr) ? ("/c=" + dp.get()) : "");
        }
        return domname;
    }
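
    // Illustrative output of domName (hypothetical counts): with attr == true the
    // entry at the given iteration index is rendered as "example.com/c=42", with
    // attr == false as plain "example.com"; an empty string means the index is
    // out of range.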

    public ClientIdentification.Agent getAgent() {
        String agentName = this.get(AGENT_NAME);
        return ClientIdentification.getAgent(agentName);
    }

    public AtomicInteger getCount(final String domain) {
        AtomicInteger dp = this.doms.get(domain);
        if (dp == null) {
            // new domain
            dp = new AtomicInteger(0);
            this.doms.put(domain, dp);
        }
        return dp;
    }

    /**
     * Adds a parameter to the CrawlProfile.
     * @param key name of the parameter
     * @param value value of the parameter
     */
    public final void put(final String key, final boolean value) {
        super.put(key, Boolean.toString(value));
    }

    /**
     * Adds a parameter to the CrawlProfile.
     * @param key name of the parameter
     * @param value value of the parameter
     */
    private final void put(final String key, final int value) {
        super.put(key, Integer.toString(value));
    }

    /**
     * Adds a parameter to the CrawlProfile.
     * @param key name of the parameter
     * @param value value of the parameter
     */
    private final void put(final String key, final long value) {
        super.put(key, Long.toString(value));
    }

    /**
     * Gets the handle of the CrawlProfile.
     * @return handle of the profile
     */
    public String handle() {
        final String r = get(HANDLE);
        assert r != null;
        //if (r == null) return null;
        return r;
    }

    private Map<String, Pattern> cmap = null;

    /**
     * get the collections for this crawl
     * @return a map from collection names to their query patterns
     */
    public Map<String, Pattern> collections() {
        if (this.cmap != null) return this.cmap;
        final String r = get(COLLECTIONS);
        this.cmap = collectionParser(r);
        return this.cmap;
    }

    public static Map<String, Pattern> collectionParser(String collectionString) {
        if (collectionString == null || collectionString.length() == 0) return new HashMap<String, Pattern>();
        String[] cs = CommonPattern.COMMA.split(collectionString);
        final Map<String, Pattern> cm = new LinkedHashMap<String, Pattern>();
        for (String c: cs) {
            int p = c.indexOf(':');
            if (p < 0) cm.put(c, QueryParams.catchall_pattern); else cm.put(c.substring(0, p), Pattern.compile(c.substring(p + 1)));
        }
        return cm;
    }
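
    // A quick sketch of the parsing rule with a hypothetical input: each
    // comma-separated entry is either a plain name, which maps to the catch-all
    // pattern, or a "name:regex" pair:
    //
    //   Map<String, Pattern> cm = CrawlProfile.collectionParser("user,genre:crime.*");
    //   // cm.get("user")  -> QueryParams.catchall_pattern
    //   // cm.get("genre") -> Pattern.compile("crime.*")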

    /**
     * Gets the name of the CrawlProfile.
     * @return name of the profile
     */
    public String name() {
        final String r = get(NAME);
        if (r == null) return "";
        return r;
    }

    /**
     * Create a name that takes the collection as name if that is not "user".
     * @return the name of the collection if that is not "user", or name() otherwise
     */
    public String collectionName() {
        final String r = get(COLLECTIONS);
        return r == null || r.length() == 0 || "user".equals(r) ? name() : r;
    }
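
    // Note on the pattern getters below: each regex is compiled lazily on first
    // access and cached in a field. If a stored regex is syntactically invalid,
    // the getter falls back to MATCH_NEVER_PATTERN: a must-match filter then
    // fails closed (nothing passes), while a must-not-match filter fails open
    // (nothing is excluded).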

    /**
     * Gets the regex which must be matched by URLs in order to be crawled.
     * @return regex which must be matched
     */
    public Pattern urlMustMatchPattern() {
        if (this.crawlerurlmustmatch == null) {
            final String r = get(CRAWLER_URL_MUSTMATCH);
            try {
                this.crawlerurlmustmatch = (r == null || r.equals(CrawlProfile.MATCH_ALL_STRING)) ? CrawlProfile.MATCH_ALL_PATTERN : Pattern.compile(r, Pattern.CASE_INSENSITIVE);
            } catch (final PatternSyntaxException e) { this.crawlerurlmustmatch = CrawlProfile.MATCH_NEVER_PATTERN; }
        }
        return this.crawlerurlmustmatch;
    }

    /**
     * Gets the regex which must not be matched by URLs in order to be crawled.
     * @return regex which must not be matched
     */
    public Pattern urlMustNotMatchPattern() {
        if (this.crawlerurlmustnotmatch == null) {
            final String r = get(CRAWLER_URL_MUSTNOTMATCH);
            try {
                this.crawlerurlmustnotmatch = (r == null || r.equals(CrawlProfile.MATCH_NEVER_STRING)) ? CrawlProfile.MATCH_NEVER_PATTERN : Pattern.compile(r, Pattern.CASE_INSENSITIVE);
            } catch (final PatternSyntaxException e) { this.crawlerurlmustnotmatch = CrawlProfile.MATCH_NEVER_PATTERN; }
        }
        return this.crawlerurlmustnotmatch;
    }

    /**
     * Gets the regex which must be matched by IPs in order to be crawled.
     * @return regex which must be matched
     */
    public Pattern ipMustMatchPattern() {
        if (this.crawleripmustmatch == null) {
            final String r = get(CRAWLER_IP_MUSTMATCH);
            try {
                this.crawleripmustmatch = (r == null || r.equals(CrawlProfile.MATCH_ALL_STRING)) ? CrawlProfile.MATCH_ALL_PATTERN : Pattern.compile(r, Pattern.CASE_INSENSITIVE);
            } catch (final PatternSyntaxException e) { this.crawleripmustmatch = CrawlProfile.MATCH_NEVER_PATTERN; }
        }
        return this.crawleripmustmatch;
    }

    /**
     * Gets the regex which must not be matched by IPs in order to be crawled.
     * @return regex which must not be matched
     */
    public Pattern ipMustNotMatchPattern() {
        if (this.crawleripmustnotmatch == null) {
            final String r = get(CRAWLER_IP_MUSTNOTMATCH);
            try {
                this.crawleripmustnotmatch = (r == null || r.equals(CrawlProfile.MATCH_NEVER_STRING)) ? CrawlProfile.MATCH_NEVER_PATTERN : Pattern.compile(r, Pattern.CASE_INSENSITIVE);
            } catch (final PatternSyntaxException e) { this.crawleripmustnotmatch = CrawlProfile.MATCH_NEVER_PATTERN; }
        }
        return this.crawleripmustnotmatch;
    }

    /**
     * Gets the list of countries that must match for the locations of the URLs' IPs.
     * @return a list of country codes
     */
    public String[] countryMustMatchList() {
        String countryMustMatch = get(CRAWLER_COUNTRY_MUSTMATCH);
        if (countryMustMatch == null) countryMustMatch = CrawlProfile.MATCH_NEVER_STRING;
        if (countryMustMatch.isEmpty()) return new String[0];
        String[] list = CommonPattern.COMMA.split(countryMustMatch);
        if (list.length == 1 && list[0].isEmpty()) list = new String[0]; // a single empty entry means: no countries given
        return list;
    }

    /**
     * If the regex matches with the url, then there is no depth limit on the crawl (it overrides depth == 0).
     * @return regex which must be matched
     */
    public Pattern crawlerNoDepthLimitMatchPattern() {
        if (this.crawlernodepthlimitmatch == null) {
            final String r = get(CRAWLER_URL_NODEPTHLIMITMATCH);
            try {
                this.crawlernodepthlimitmatch = (r == null || r.equals(CrawlProfile.MATCH_NEVER_STRING)) ? CrawlProfile.MATCH_NEVER_PATTERN : Pattern.compile(r, Pattern.CASE_INSENSITIVE);
            } catch (final PatternSyntaxException e) { this.crawlernodepthlimitmatch = CrawlProfile.MATCH_NEVER_PATTERN; }
        }
        return this.crawlernodepthlimitmatch;
    }

    /**
     * Gets the regex which must be matched by URLs in order to be indexed.
     * @return regex which must be matched
     */
    public Pattern indexUrlMustMatchPattern() {
        if (this.indexurlmustmatch == null) {
            final String r = get(INDEXING_URL_MUSTMATCH);
            try {
                this.indexurlmustmatch = (r == null || r.equals(CrawlProfile.MATCH_ALL_STRING)) ? CrawlProfile.MATCH_ALL_PATTERN : Pattern.compile(r, Pattern.CASE_INSENSITIVE);
            } catch (final PatternSyntaxException e) { this.indexurlmustmatch = CrawlProfile.MATCH_NEVER_PATTERN; }
        }
        return this.indexurlmustmatch;
    }

    /**
     * Gets the regex which must not be matched by URLs in order to be indexed.
     * @return regex which must not be matched
     */
    public Pattern indexUrlMustNotMatchPattern() {
        if (this.indexurlmustnotmatch == null) {
            final String r = get(INDEXING_URL_MUSTNOTMATCH);
            try {
                this.indexurlmustnotmatch = (r == null || r.equals(CrawlProfile.MATCH_NEVER_STRING)) ? CrawlProfile.MATCH_NEVER_PATTERN : Pattern.compile(r, Pattern.CASE_INSENSITIVE);
            } catch (final PatternSyntaxException e) { this.indexurlmustnotmatch = CrawlProfile.MATCH_NEVER_PATTERN; }
        }
        return this.indexurlmustnotmatch;
    }

    /**
     * Gets the regex which must be matched by the parsed content in order to be indexed.
     * @return regex which must be matched
     */
    public Pattern indexContentMustMatchPattern() {
        if (this.indexcontentmustmatch == null) {
            final String r = get(INDEXING_CONTENT_MUSTMATCH);
            try {
                this.indexcontentmustmatch = (r == null || r.equals(CrawlProfile.MATCH_ALL_STRING)) ? CrawlProfile.MATCH_ALL_PATTERN : Pattern.compile(r, Pattern.CASE_INSENSITIVE);
            } catch (final PatternSyntaxException e) { this.indexcontentmustmatch = CrawlProfile.MATCH_NEVER_PATTERN; }
        }
        return this.indexcontentmustmatch;
    }

    /**
     * Gets the regex which must not be matched by the parsed content in order to be indexed.
     * @return regex which must not be matched
     */
    public Pattern indexContentMustNotMatchPattern() {
        if (this.indexcontentmustnotmatch == null) {
            final String r = get(INDEXING_CONTENT_MUSTNOTMATCH);
            try {
                this.indexcontentmustnotmatch = (r == null || r.equals(CrawlProfile.MATCH_NEVER_STRING)) ? CrawlProfile.MATCH_NEVER_PATTERN : Pattern.compile(r, Pattern.CASE_INSENSITIVE);
            } catch (final PatternSyntaxException e) { this.indexcontentmustnotmatch = CrawlProfile.MATCH_NEVER_PATTERN; }
        }
        return this.indexcontentmustnotmatch;
    }

    /**
     * Gets the depth of the crawl job (or the height of the tree which will be
     * created by the crawler).
     * @return depth of the crawl job
     */
    public int depth() {
        final String r = get(DEPTH);
        if (r == null) return 0;
        try {
            return Integer.parseInt(r);
        } catch (final NumberFormatException e) {
            ConcurrentLog.logException(e);
            return 0;
        }
    }

    public boolean directDocByURL() {
        final String r = get(DIRECT_DOC_BY_URL);
        if (r == null) return false;
        return (r.equals(Boolean.TRUE.toString()));
    }

    public CacheStrategy cacheStrategy() {
        final String r = get(CACHE_STRAGEGY);
        if (r == null) return CacheStrategy.IFEXIST;
        try {
            return CacheStrategy.decode(Integer.parseInt(r));
        } catch (final NumberFormatException e) {
            ConcurrentLog.logException(e);
            return CacheStrategy.IFEXIST;
        }
    }

    public void setCacheStrategy(final CacheStrategy newStrategy) {
        put(CACHE_STRAGEGY, newStrategy.toString());
    }

    /**
     * Gets the minimum date that an entry must have to be re-crawled.
     * @return time in ms representing a date
     */
    public long recrawlIfOlder() {
        // returns a long (millis) that is the minimum age that
        // an entry must have to be re-crawled
        final String r = get(RECRAWL_IF_OLDER);
        if (r == null) return 0L;
        try {
            final long l = Long.parseLong(r);
            return (l < 0) ? 0L : l;
        } catch (final NumberFormatException e) {
            ConcurrentLog.logException(e);
            return 0L;
        }
    }

    public int domMaxPages() {
        // this is the maximum number of pages that are crawled for a single domain
        // if -1, this means no limit
        final String r = get(DOM_MAX_PAGES);
        if (r == null) return Integer.MAX_VALUE;
        try {
            final int i = Integer.parseInt(r);
            if (i < 0) return Integer.MAX_VALUE;
            return i;
        } catch (final NumberFormatException e) {
            ConcurrentLog.logException(e);
            return Integer.MAX_VALUE;
        }
    }

    public boolean crawlingQ() {
        final String r = get(CRAWLING_Q);
        if (r == null) return false;
        return (r.equals(Boolean.TRUE.toString()));
    }

    public boolean followFrames() {
        final String r = get(FOLLOW_FRAMES);
        if (r == null) return false;
        return (r.equals(Boolean.TRUE.toString()));
    }

    public boolean obeyHtmlRobotsNoindex() {
        final String r = get(OBEY_HTML_ROBOTS_NOINDEX);
        if (r == null) return false;
        return (r.equals(Boolean.TRUE.toString()));
    }

    public boolean obeyHtmlRobotsNofollow() {
        final String r = get(OBEY_HTML_ROBOTS_NOFOLLOW);
        if (r == null) return false;
        return (r.equals(Boolean.TRUE.toString()));
    }

    public boolean indexText() {
        final String r = get(INDEX_TEXT);
        if (r == null) return true;
        return (r.equals(Boolean.TRUE.toString()));
    }

    public boolean indexMedia() {
        final String r = get(INDEX_MEDIA);
        if (r == null) return true;
        return (r.equals(Boolean.TRUE.toString()));
    }

    public boolean storeHTCache() {
        final String r = get(STORE_HTCACHE);
        if (r == null) return false;
        return (r.equals(Boolean.TRUE.toString()));
    }

    public boolean remoteIndexing() {
        final String r = get(REMOTE_INDEXING);
        if (r == null) return false;
        return (r.equals(Boolean.TRUE.toString()));
    }

    public int snapshotMaxdepth() {
        final String r = get(SNAPSHOTS_MAXDEPTH);
        if (r == null) return -1;
        try {
            final int i = Integer.parseInt(r);
            if (i < 0) return -1;
            return i;
        } catch (final NumberFormatException e) {
            ConcurrentLog.logException(e);
            return -1;
        }
    }

    public boolean snapshotLoadImage() {
        final String r = get(SNAPSHOTS_LOADIMAGE);
        if (r == null) return false;
        return (r.equals(Boolean.TRUE.toString()));
    }

    public boolean snapshotReplaceold() {
        final String r = get(SNAPSHOTS_REPLACEOLD);
        if (r == null) return false;
        return (r.equals(Boolean.TRUE.toString()));
    }

    /**
     * Get a recrawl date for a given age in minutes.
     * @param oldTimeMinutes age in minutes
     * @return a Date representing the recrawl date limit
     */
    public static Date getRecrawlDate(final long oldTimeMinutes) {
        return new Date(System.currentTimeMillis() - (60000L * oldTimeMinutes));
    }
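
    // Example: getRecrawlDate(60L) yields a Date one hour in the past; a profile
    // constructed with it re-indexes documents whose last indexing date is more
    // than one hour old.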

    public static String siteFilter(final Collection<? extends MultiProtocolURL> urls) {
        LinkedHashSet<String> filters = new LinkedHashSet<String>(); // first collect in a set to eliminate doubles
        for (final MultiProtocolURL url: urls) filters.add(mustMatchFilterFullDomain(url));
        final StringBuilder filter = new StringBuilder();
        for (final String urlfilter: filters) filter.append('|').append(urlfilter);
        return filter.length() > 0 ? filter.substring(1) : CrawlProfile.MATCH_ALL_STRING;
    }

    public static String mustMatchFilterFullDomain(final MultiProtocolURL url) {
        String host = url.getHost();
        if (host == null) return url.getProtocol() + ".*";
        if (host.startsWith("www.")) host = host.substring(4);
        String protocol = url.getProtocol();
        if ("http".equals(protocol) || "https".equals(protocol)) protocol = "https?+";
        return new StringBuilder(host.length() + 20).append(protocol).append("://(www.)?").append(Pattern.quote(host)).append(".*").toString();
    }
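
    // Illustrative result for a hypothetical URL http://www.example.com/forum/:
    // the host is normalized to "example.com" and the filter becomes
    //   https?+://(www.)?\Qexample.com\E.*
    // so both http(s) and an optional www. prefix are accepted.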

    public static String subpathFilter(final Collection<? extends MultiProtocolURL> urls) {
        LinkedHashSet<String> filters = new LinkedHashSet<String>(); // first collect in a set to eliminate doubles
        for (final MultiProtocolURL url: urls) filters.add(mustMatchSubpath(url));
        final StringBuilder filter = new StringBuilder();
        for (final String urlfilter: filters) filter.append('|').append(urlfilter);
        return filter.length() > 0 ? filter.substring(1) : CrawlProfile.MATCH_ALL_STRING;
    }

    public static String mustMatchSubpath(final MultiProtocolURL url) {
        String host = url.getHost();
        if (host == null) return url.getProtocol() + ".*";
        if (host.startsWith("www.")) host = host.substring(4);
        String protocol = url.getProtocol();
        if ("http".equals(protocol) || "https".equals(protocol)) protocol = "https?+";
        return new StringBuilder(host.length() + 20).append(protocol).append("://(www.)?").append(Pattern.quote(host)).append(url.getPath()).append(".*").toString();
    }
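
    // Illustrative result for a hypothetical URL http://www.example.com/forum/post.html:
    //   https?+://(www.)?\Qexample.com\E/forum/post.html.*
    // Unlike the host, the path is appended unquoted, so regex metacharacters in
    // the path (e.g. dots) act as wildcards.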

    public boolean isPushCrawlProfile() {
        return this.name().startsWith(CrawlProfile.CRAWL_PROFILE_PUSH_STUB);
    }

    public void putProfileEntry(
            final String CRAWL_PROFILE_PREFIX,
            final serverObjects prop,
            final boolean active,
            final boolean dark,
            final int count,
            final int domlistlength) {
        boolean terminateButton = active && !CrawlSwitchboard.DEFAULT_PROFILES.contains(this.name());
        boolean deleteButton = !active;
        prop.put(CRAWL_PROFILE_PREFIX + count + "_dark", dark ? "1" : "0");
        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_handle", this.handle());
        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_name", this.name());
        //prop.putXML(CRAWL_PROFILE_PREFIX + count + "_collection", this.get(COLLECTIONS)); // TODO: remove, replace with 'collections'
        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_collections", this.get(COLLECTIONS));
        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_agentName", this.get(AGENT_NAME));
        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_userAgent", this.getAgent().userAgent);
        prop.put(CRAWL_PROFILE_PREFIX + count + "_depth", this.depth());
        prop.put(CRAWL_PROFILE_PREFIX + count + "_directDocByURL", this.directDocByURL() ? 1 : 0);
        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_recrawlIfOlder", this.recrawlIfOlder() == Long.MAX_VALUE ? "eternity" : (new Date(this.recrawlIfOlder()).toString()));
        prop.put(CRAWL_PROFILE_PREFIX + count + "_domMaxPages", this.domMaxPages());
        //prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingDomMaxPages", (this.domMaxPages() == Integer.MAX_VALUE) ? "unlimited" : Integer.toString(this.domMaxPages())); // TODO: remove, replace with 'domMaxPages'
        prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingQ", this.crawlingQ() ? 1 : 0);
        //prop.put(CRAWL_PROFILE_PREFIX + count + "_withQuery", (this.crawlingQ()) ? "1" : "0"); // TODO: remove, replace with 'crawlingQ'
        prop.put(CRAWL_PROFILE_PREFIX + count + "_followFrames", this.followFrames() ? 1 : 0);
        prop.put(CRAWL_PROFILE_PREFIX + count + "_obeyHtmlRobotsNoindex", this.obeyHtmlRobotsNoindex() ? 1 : 0);
        prop.put(CRAWL_PROFILE_PREFIX + count + "_obeyHtmlRobotsNofollow", this.obeyHtmlRobotsNofollow() ? 1 : 0);
        prop.put(CRAWL_PROFILE_PREFIX + count + "_indexText", this.indexText() ? 1 : 0);
        prop.put(CRAWL_PROFILE_PREFIX + count + "_indexMedia", this.indexMedia() ? 1 : 0);
        //prop.put(CRAWL_PROFILE_PREFIX + count + "_storeCache", this.storeHTCache() ? 1 : 0); // TODO: remove, replace with 'storeHTCache'
        prop.put(CRAWL_PROFILE_PREFIX + count + "_storeHTCache", this.storeHTCache() ? 1 : 0);
        prop.put(CRAWL_PROFILE_PREFIX + count + "_remoteIndexing", this.remoteIndexing() ? 1 : 0);
        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_cacheStrategy", this.get(CACHE_STRAGEGY));
        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerURLMustMatch", this.get(CRAWLER_URL_MUSTMATCH));
        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerURLMustNotMatch", this.get(CRAWLER_URL_MUSTNOTMATCH));
        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerIPMustMatch", this.get(CRAWLER_IP_MUSTMATCH));
        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerIPMustNotMatch", this.get(CRAWLER_IP_MUSTNOTMATCH));
        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerCountryMustMatch", this.get(CRAWLER_COUNTRY_MUSTMATCH));
        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerNoLimitURLMustMatch", this.get(CRAWLER_URL_NODEPTHLIMITMATCH));
        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexURLMustMatch", this.get(INDEXING_URL_MUSTMATCH));
        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexURLMustNotMatch", this.get(INDEXING_URL_MUSTNOTMATCH));
        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexContentMustMatch", this.get(INDEXING_CONTENT_MUSTMATCH));
        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexContentMustNotMatch", this.get(INDEXING_CONTENT_MUSTNOTMATCH));
        //prop.putXML(CRAWL_PROFILE_PREFIX + count + "_mustmatch", this.urlMustMatchPattern().toString()); // TODO: remove, replace with 'crawlerURLMustMatch'
        //prop.putXML(CRAWL_PROFILE_PREFIX + count + "_mustnotmatch", this.urlMustNotMatchPattern().toString()); // TODO: remove, replace with 'crawlerURLMustNotMatch'
        //prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingIfOlder", (this.recrawlIfOlder() == 0L) ? "no re-crawl" : DateFormat.getDateTimeInstance().format(this.recrawlIfOlder())); // TODO: remove, replace with 'recrawlIfOlder'
        prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingDomFilterDepth", "inactive");
        prop.put(CRAWL_PROFILE_PREFIX + count + "_status", terminateButton ? 1 : deleteButton ? 0 : 2);
        prop.put(CRAWL_PROFILE_PREFIX + count + "_terminateButton", terminateButton);
        prop.put(CRAWL_PROFILE_PREFIX + count + "_terminateButton_handle", this.handle());
        prop.put(CRAWL_PROFILE_PREFIX + count + "_deleteButton", deleteButton);
        prop.put(CRAWL_PROFILE_PREFIX + count + "_deleteButton_handle", this.handle());

        int i = 0;
        if (active && this.domMaxPages() > 0 && this.domMaxPages() != Integer.MAX_VALUE) {
            String item;
            while (i <= domlistlength && !(item = this.domName(true, i)).isEmpty()) {
                if (i == domlistlength) item += " ...";
                prop.putHTML(CRAWL_PROFILE_PREFIX + count + "_crawlingDomFilterContent_" + i + "_item", item);
                i++;
            }
        }
        prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingDomFilterContent", i);
    }

}