Create autocrawl profiles

This commit is contained in:
Ryszard Goń 2016-01-12 16:28:34 +01:00
parent abd8ecb503
commit 1728cd30c6
3 changed files with 92 additions and 2 deletions

View File

@ -538,6 +538,15 @@ proxyURL.access=127.0.0.1,0:0:0:0:0:0:0:1
proxyURL.rewriteURLs=domainlist
proxyURL.useforresults=false
# Autocrawl configuration
autocrawl=false
autocrawl.index.text=true
autocrawl.index.media=true
autocrawl.deep.depth=3
autocrawl.deep.recrawl=43200
autocrawl.shallow.depth=1
autocrawl.shallow.recrawl=1440
# From the 'IndexCreate' menu point you can also define a crawling start point.
# The crawling works the same way as the prefetch, but it is possible to
# assign a different crawling depth.

View File

@ -58,7 +58,9 @@ import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
public final class CrawlSwitchboard {
/** Name of the built-in deep autocrawl profile; it is also listed in {@code DEFAULT_PROFILES}. */
public static final String CRAWL_PROFILE_AUTOCRAWL_DEEP = "autocrawlDeep";
/** Name of the built-in shallow autocrawl profile; it is also listed in {@code DEFAULT_PROFILES}. */
public static final String CRAWL_PROFILE_AUTOCRAWL_SHALLOW = "autocrawlShallow";
public static final String CRAWL_PROFILE_PROXY = "proxy";
public static final String CRAWL_PROFILE_REMOTE = "remote";
public static final String CRAWL_PROFILE_SNIPPET_LOCAL_TEXT = "snippetLocalText";
@ -70,6 +72,8 @@ public final class CrawlSwitchboard {
public static Set<String> DEFAULT_PROFILES = new HashSet<String>();
static {
DEFAULT_PROFILES.add(CRAWL_PROFILE_AUTOCRAWL_DEEP);
DEFAULT_PROFILES.add(CRAWL_PROFILE_AUTOCRAWL_SHALLOW);
DEFAULT_PROFILES.add(CRAWL_PROFILE_PROXY);
DEFAULT_PROFILES.add(CRAWL_PROFILE_REMOTE);
DEFAULT_PROFILES.add(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT);
@ -98,6 +102,7 @@ public final class CrawlSwitchboard {
private final Map<String, RowHandleSet> profilesActiveCrawlsCounter;
public CrawlProfile defaultProxyProfile, defaultRemoteProfile, defaultTextSnippetLocalProfile, defaultTextSnippetGlobalProfile;
public CrawlProfile defaultTextGreedyLearningProfile, defaultMediaSnippetLocalProfile, defaultMediaSnippetGlobalProfile, defaultSurrogateProfile;
// default profiles for automatic crawling (deep and shallow); assigned in initActiveCrawlProfiles()
public CrawlProfile defaultAutocrawlDeepProfile, defaultAutocrawlShallowProfile;
private Map<String, CrawlProfile> defaultPushProfiles; // for each collection one profile
private final File queuesRoot;
private Switchboard switchboard;
@ -268,8 +273,75 @@ public final class CrawlSwitchboard {
private void initActiveCrawlProfiles() {
// look up the switchboard; its configuration supplies the tunable values of the default profiles created below
final Switchboard sb = Switchboard.getSwitchboard();
// generate new default entry for deep auto crawl
this.defaultAutocrawlDeepProfile =
new CrawlProfile(
CRAWL_PROFILE_AUTOCRAWL_DEEP,
CrawlProfile.MATCH_ALL_STRING, //crawlerUrlMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerCountryMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_DEEP_DEPTH, "3")),
true,
CrawlProfile.getRecrawlDate(Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_DEEP_RECRAWL, "43200"))),
-1,
true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
sb.getConfigBool(SwitchboardConstants.AUTOCRAWL_INDEX_TEXT, true),
sb.getConfigBool(SwitchboardConstants.AUTOCRAWL_INDEX_MEDIA, true),
false,
false,
-1,
false, true, CrawlProfile.MATCH_NEVER_STRING,
CacheStrategy.NOCACHE,
"robot_" + CRAWL_PROFILE_AUTOCRAWL_DEEP,
ClientIdentification.yacyInternetCrawlerAgentName,
null,
0);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultAutocrawlDeepProfile.handle()),
this.defaultAutocrawlDeepProfile);
// generate new default entry for shallow auto crawl
this.defaultAutocrawlShallowProfile =
new CrawlProfile(
CRAWL_PROFILE_AUTOCRAWL_SHALLOW,
CrawlProfile.MATCH_ALL_STRING, //crawlerUrlMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerCountryMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_SHALLOW_DEPTH, "1")),
true,
CrawlProfile.getRecrawlDate(Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_SHALLOW_RECRAWL, "1440"))),
-1,
true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
sb.getConfigBool(SwitchboardConstants.AUTOCRAWL_INDEX_TEXT, true),
sb.getConfigBool(SwitchboardConstants.AUTOCRAWL_INDEX_MEDIA, true),
false,
false,
-1,
false, true, CrawlProfile.MATCH_NEVER_STRING,
CacheStrategy.NOCACHE,
"robot_" + CRAWL_PROFILE_AUTOCRAWL_SHALLOW,
ClientIdentification.yacyInternetCrawlerAgentName,
null,
0);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultAutocrawlShallowProfile.handle()),
this.defaultAutocrawlShallowProfile);
// generate new default entry for proxy crawling
this.defaultProxyProfile =
new CrawlProfile(
CRAWL_PROFILE_PROXY,

View File

@ -308,6 +308,15 @@ public final class SwitchboardConstants {
* @see Switchboard#PROXY_CACHE_LAYOUT_HASH
*/
public static final String PROXY_YACY_ONLY = "proxyYacyOnly";
/**
 * Config key <code>autocrawl</code>.
 * NOTE(review): presumably the master on/off switch for automatic crawling; it is not
 * read in the code visible here - confirm against its callers.
 */
public static final String AUTOCRAWL = "autocrawl";
/**
 * Config key <code>autocrawl.index.text</code>: boolean read (default <code>true</code>)
 * when the deep and shallow autocrawl profiles are created.
 */
public static final String AUTOCRAWL_INDEX_TEXT = "autocrawl.index.text";
/**
 * Config key <code>autocrawl.index.media</code>: boolean read (default <code>true</code>)
 * when the deep and shallow autocrawl profiles are created.
 */
public static final String AUTOCRAWL_INDEX_MEDIA = "autocrawl.index.media";
/**
 * Config key <code>autocrawl.deep.depth</code>: parsed as an int (default <code>"3"</code>)
 * and used as the crawl depth of the deep autocrawl profile.
 */
public static final String AUTOCRAWL_DEEP_DEPTH = "autocrawl.deep.depth";
/**
 * Config key <code>autocrawl.deep.recrawl</code>: parsed as an int (default <code>"43200"</code>)
 * and passed to <code>CrawlProfile.getRecrawlDate</code> for the deep autocrawl profile.
 * NOTE(review): the unit is presumably minutes - confirm against getRecrawlDate.
 */
public static final String AUTOCRAWL_DEEP_RECRAWL = "autocrawl.deep.recrawl";
/**
 * Config key <code>autocrawl.shallow.depth</code>: parsed as an int (default <code>"1"</code>)
 * and used as the crawl depth of the shallow autocrawl profile.
 */
public static final String AUTOCRAWL_SHALLOW_DEPTH = "autocrawl.shallow.depth";
/**
 * Config key <code>autocrawl.shallow.recrawl</code>: parsed as an int (default <code>"1440"</code>)
 * and passed to <code>CrawlProfile.getRecrawlDate</code> for the shallow autocrawl profile.
 * NOTE(review): the unit is presumably minutes - confirm against getRecrawlDate.
 */
public static final String AUTOCRAWL_SHALLOW_RECRAWL = "autocrawl.shallow.recrawl";
//////////////////////////////////////////////////////////////////////////////////////////////
// Cluster settings