mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
Create autocrawl profiles
This commit is contained in:
parent
abd8ecb503
commit
1728cd30c6
|
@ -538,6 +538,15 @@ proxyURL.access=127.0.0.1,0:0:0:0:0:0:0:1
|
|||
proxyURL.rewriteURLs=domainlist
|
||||
proxyURL.useforresults=false
|
||||
|
||||
# Autocrawl configuration
|
||||
autocrawl=false
|
||||
autocrawl.index.text=true
|
||||
autocrawl.index.media=true
|
||||
autocrawl.deep.depth=3
|
||||
autocrawl.deep.recrawl=43200
|
||||
autocrawl.shallow.depth=1
|
||||
autocrawl.shallow.recrawl=1440
|
||||
|
||||
# From the 'IndexCreate' menu point you can also define a crawling start point.
|
||||
# The crawling works the same way as the prefetch, but it is possible to
|
||||
# assign a different crawling depth.
|
||||
|
|
|
@ -58,7 +58,9 @@ import net.yacy.search.Switchboard;
|
|||
import net.yacy.search.SwitchboardConstants;
|
||||
|
||||
public final class CrawlSwitchboard {
|
||||
|
||||
|
||||
public static final String CRAWL_PROFILE_AUTOCRAWL_DEEP = "autocrawlDeep";
|
||||
public static final String CRAWL_PROFILE_AUTOCRAWL_SHALLOW = "autocrawlShallow";
|
||||
public static final String CRAWL_PROFILE_PROXY = "proxy";
|
||||
public static final String CRAWL_PROFILE_REMOTE = "remote";
|
||||
public static final String CRAWL_PROFILE_SNIPPET_LOCAL_TEXT = "snippetLocalText";
|
||||
|
@ -70,6 +72,8 @@ public final class CrawlSwitchboard {
|
|||
|
||||
public static Set<String> DEFAULT_PROFILES = new HashSet<String>();
|
||||
static {
|
||||
DEFAULT_PROFILES.add(CRAWL_PROFILE_AUTOCRAWL_DEEP);
|
||||
DEFAULT_PROFILES.add(CRAWL_PROFILE_AUTOCRAWL_SHALLOW);
|
||||
DEFAULT_PROFILES.add(CRAWL_PROFILE_PROXY);
|
||||
DEFAULT_PROFILES.add(CRAWL_PROFILE_REMOTE);
|
||||
DEFAULT_PROFILES.add(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT);
|
||||
|
@ -98,6 +102,7 @@ public final class CrawlSwitchboard {
|
|||
private final Map<String, RowHandleSet> profilesActiveCrawlsCounter;
|
||||
public CrawlProfile defaultProxyProfile, defaultRemoteProfile, defaultTextSnippetLocalProfile, defaultTextSnippetGlobalProfile;
|
||||
public CrawlProfile defaultTextGreedyLearningProfile, defaultMediaSnippetLocalProfile, defaultMediaSnippetGlobalProfile, defaultSurrogateProfile;
|
||||
public CrawlProfile defaultAutocrawlDeepProfile, defaultAutocrawlShallowProfile;
|
||||
private Map<String, CrawlProfile> defaultPushProfiles; // for each collection one profile
|
||||
private final File queuesRoot;
|
||||
private Switchboard switchboard;
|
||||
|
@ -268,8 +273,75 @@ public final class CrawlSwitchboard {
|
|||
|
||||
|
||||
private void initActiveCrawlProfiles() {
|
||||
// fetch the switchboard; its configuration supplies the autocrawl settings used below
|
||||
final Switchboard sb = Switchboard.getSwitchboard();
|
||||
|
||||
// generate new default entry for deep auto crawl
|
||||
this.defaultAutocrawlDeepProfile =
|
||||
new CrawlProfile(
|
||||
CRAWL_PROFILE_AUTOCRAWL_DEEP,
|
||||
CrawlProfile.MATCH_ALL_STRING, //crawlerUrlMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
|
||||
CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //crawlerCountryMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
|
||||
CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
|
||||
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
|
||||
Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_DEEP_DEPTH, "3")),
|
||||
true,
|
||||
CrawlProfile.getRecrawlDate(Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_DEEP_RECRAWL, "43200"))),
|
||||
-1,
|
||||
true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
|
||||
sb.getConfigBool(SwitchboardConstants.AUTOCRAWL_INDEX_TEXT, true),
|
||||
sb.getConfigBool(SwitchboardConstants.AUTOCRAWL_INDEX_MEDIA, true),
|
||||
false,
|
||||
false,
|
||||
-1,
|
||||
false, true, CrawlProfile.MATCH_NEVER_STRING,
|
||||
CacheStrategy.NOCACHE,
|
||||
"robot_" + CRAWL_PROFILE_AUTOCRAWL_DEEP,
|
||||
ClientIdentification.yacyInternetCrawlerAgentName,
|
||||
null,
|
||||
0);
|
||||
this.profilesActiveCrawls.put(
|
||||
UTF8.getBytes(this.defaultAutocrawlDeepProfile.handle()),
|
||||
this.defaultAutocrawlDeepProfile);
|
||||
// generate new default entry for shallow auto crawl
|
||||
this.defaultAutocrawlShallowProfile =
|
||||
new CrawlProfile(
|
||||
CRAWL_PROFILE_AUTOCRAWL_SHALLOW,
|
||||
CrawlProfile.MATCH_ALL_STRING, //crawlerUrlMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
|
||||
CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //crawlerCountryMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
|
||||
CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
|
||||
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
|
||||
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
|
||||
Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_SHALLOW_DEPTH, "1")),
|
||||
true,
|
||||
CrawlProfile.getRecrawlDate(Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_SHALLOW_RECRAWL, "1440"))),
|
||||
-1,
|
||||
true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
|
||||
sb.getConfigBool(SwitchboardConstants.AUTOCRAWL_INDEX_TEXT, true),
|
||||
sb.getConfigBool(SwitchboardConstants.AUTOCRAWL_INDEX_MEDIA, true),
|
||||
false,
|
||||
false,
|
||||
-1,
|
||||
false, true, CrawlProfile.MATCH_NEVER_STRING,
|
||||
CacheStrategy.NOCACHE,
|
||||
"robot_" + CRAWL_PROFILE_AUTOCRAWL_SHALLOW,
|
||||
ClientIdentification.yacyInternetCrawlerAgentName,
|
||||
null,
|
||||
0);
|
||||
this.profilesActiveCrawls.put(
|
||||
UTF8.getBytes(this.defaultAutocrawlShallowProfile.handle()),
|
||||
this.defaultAutocrawlShallowProfile);
|
||||
// generate new default entry for proxy crawling
|
||||
this.defaultProxyProfile =
|
||||
new CrawlProfile(
|
||||
CRAWL_PROFILE_PROXY,
|
||||
|
|
|
@ -308,6 +308,15 @@ public final class SwitchboardConstants {
|
|||
* @see Switchboard#PROXY_CACHE_LAYOUT_HASH
|
||||
*/
|
||||
public static final String PROXY_YACY_ONLY = "proxyYacyOnly";
|
||||
|
||||
public static final String AUTOCRAWL = "autocrawl";
|
||||
public static final String AUTOCRAWL_INDEX_TEXT = "autocrawl.index.text";
|
||||
public static final String AUTOCRAWL_INDEX_MEDIA = "autocrawl.index.media";
|
||||
public static final String AUTOCRAWL_DEEP_DEPTH = "autocrawl.deep.depth";
|
||||
public static final String AUTOCRAWL_DEEP_RECRAWL = "autocrawl.deep.recrawl";
|
||||
public static final String AUTOCRAWL_SHALLOW_DEPTH = "autocrawl.shallow.depth";
|
||||
public static final String AUTOCRAWL_SHALLOW_RECRAWL = "autocrawl.shallow.recrawl";
|
||||
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Cluster settings
|
||||
|
|
Loading…
Reference in New Issue
Block a user