mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
*) Invalid crawl profiles (containing invalid mustmatch/mustnotmatch filters) will be moved from active crawls to invalid crawls (new file: DATA/INDEX/freeworld/QUEUES/crawlProfilesInvalid.heap). This file cannot be edited yet, but it should be easy to extend the CrawlProfileEditor accordingly.
*) Corrupt crawlProfilesPassive.heap would cause crawlProfilesActive.heap to be deleted. Don't know if this ever happened, but it will not happen anymore. *) Cleaned up a little bit. *) Added some comments. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7827 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
parent
b84089ff04
commit
c7b95e8c81
|
@ -101,7 +101,10 @@ public class CrawlProfileEditor_p {
|
|||
labels.add(new eentry(CrawlProfile.XPSTOPW, "Parent stop-words", false, eentry.BOOLEAN));
|
||||
}
|
||||
|
||||
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
|
||||
public static serverObjects respond(
|
||||
final RequestHeader header,
|
||||
final serverObjects post,
|
||||
final serverSwitch env) {
|
||||
final servletProperties prop = new servletProperties();
|
||||
final Switchboard sb = (Switchboard)env;
|
||||
|
||||
|
@ -131,7 +134,7 @@ public class CrawlProfileEditor_p {
|
|||
|
||||
// generate handle list: first sort by handle name
|
||||
CrawlProfile selentry;
|
||||
Map<String, String> orderdHandles = new TreeMap<String, String>();
|
||||
final Map<String, String> orderdHandles = new TreeMap<String, String>();
|
||||
for (final byte[] h : sb.crawler.getActive()) {
|
||||
selentry = sb.crawler.getActive(h);
|
||||
if (selentry != null && !ignoreNames.contains(selentry.name())) {
|
||||
|
@ -219,7 +222,8 @@ public class CrawlProfileEditor_p {
|
|||
prop.put(EDIT_ENTRIES_PREFIX + count + "_readonly_label", ee.label);
|
||||
prop.put(EDIT_ENTRIES_PREFIX + count + "_readonly_type", ee.type);
|
||||
if (ee.type == eentry.BOOLEAN) {
|
||||
prop.put(EDIT_ENTRIES_PREFIX + count + "_readonly_type_checked", Boolean.parseBoolean(val) ? "1" : "0");
|
||||
prop.put(EDIT_ENTRIES_PREFIX + count + "_readonly_type_checked",
|
||||
Boolean.parseBoolean(val) ? "1" : "0");
|
||||
} else {
|
||||
prop.put(EDIT_ENTRIES_PREFIX + count + "_readonly_type_value", val);
|
||||
}
|
||||
|
@ -231,7 +235,14 @@ public class CrawlProfileEditor_p {
|
|||
return prop;
|
||||
}
|
||||
|
||||
private static void putProfileEntry(final servletProperties prop, final CrawlStacker crawlStacker, final CrawlProfile profile, final boolean active, final boolean dark, final int count, final int domlistlength) {
|
||||
private static void putProfileEntry(
|
||||
final servletProperties prop,
|
||||
final CrawlStacker crawlStacker,
|
||||
final CrawlProfile profile,
|
||||
final boolean active,
|
||||
final boolean dark,
|
||||
final int count,
|
||||
final int domlistlength) {
|
||||
|
||||
prop.put(CRAWL_PROFILE_PREFIX + count + "_dark", dark ? "1" : "0");
|
||||
prop.put(CRAWL_PROFILE_PREFIX + count + "_name", profile.name());
|
||||
|
@ -247,13 +258,13 @@ public class CrawlProfileEditor_p {
|
|||
prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingIfOlder", (profile.recrawlIfOlder() == 0L) ? "no re-crawl" : DateFormat.getDateTimeInstance().format(profile.recrawlIfOlder()));
|
||||
prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingDomFilterDepth", "inactive");
|
||||
|
||||
// start contrib [MN]
|
||||
int i = 0;
|
||||
if (active && profile.domMaxPages() > 0 && profile.domMaxPages() != Integer.MAX_VALUE) {
|
||||
if (active && profile.domMaxPages() > 0
|
||||
&& profile.domMaxPages() != Integer.MAX_VALUE) {
|
||||
String item;
|
||||
while (i <= domlistlength && !"".equals(item = crawlStacker.domName(true, i))){
|
||||
while (i <= domlistlength && !(item = crawlStacker.domName(true, i)).isEmpty()){
|
||||
if (i == domlistlength) {
|
||||
item = item + " ...";
|
||||
item += " ...";
|
||||
}
|
||||
prop.putHTML(CRAWL_PROFILE_PREFIX + count + "_crawlingDomFilterContent_" + i + "_item", item);
|
||||
i++;
|
||||
|
|
|
@ -4,7 +4,10 @@
|
|||
// (C) by Michael Peter Christen; mc@yacy.net
|
||||
// first published on http://www.anomic.de
|
||||
// Frankfurt, Germany, 2004
|
||||
// last major change: 31.08.2010
|
||||
//
|
||||
// $LastChangedDate$
|
||||
// $LastChangedRevision$
|
||||
// $LastChangedBy$
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
|
@ -63,6 +66,26 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
|
|||
|
||||
private Pattern mustmatch = null, mustnotmatch = null;
|
||||
|
||||
/**
|
||||
* Constructor which creates CrawlProfile from parameters.
|
||||
* @param name name of the crawl profile
|
||||
* @param startURL root URL of the crawl
|
||||
* @param mustmatch URLs which do not match this regex will be ignored
|
||||
* @param mustnotmatch URLs which match this regex will be ignored
|
||||
* @param depth height of the tree which will be created by the crawler
|
||||
* @param recrawlIfOlder documents which have been indexed in the past will
|
||||
* be indexed again if they are older than the time (ms) in this parameter
|
||||
* @param domMaxPages maximum number from one domain which will be indexed
|
||||
* @param crawlingQ true if URLs containing questionmarks shall be indexed
|
||||
* @param indexText true if text content of URL shall be indexed
|
||||
* @param indexMedia true if media content of URL shall be indexed
|
||||
* @param storeHTCache true if content shall be kept in cache after indexing
|
||||
* @param remoteIndexing true if part of the crawl job shall be distributed
|
||||
* @param xsstopw true if static stop words shall be ignored
|
||||
* @param xdstopw true if dynamic stop words shall be ignored
|
||||
* @param xpstopw true if parent stop words shall be ignored
|
||||
* @param cacheStrategy determines if and how cache is used loading content
|
||||
*/
|
||||
public CrawlProfile(
|
||||
final String name,
|
||||
final DigestURI startURL,
|
||||
|
@ -81,8 +104,12 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
|
|||
final boolean xpstopw,
|
||||
final CacheStrategy cacheStrategy) {
|
||||
super(40);
|
||||
if (name == null || name.length() == 0) throw new NullPointerException("name must not be null");
|
||||
final String handle = (startURL == null) ? Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(name)).substring(0, Word.commonHashLength) : ASCII.String(startURL.hash());
|
||||
if (name == null || name.isEmpty()) {
|
||||
throw new NullPointerException("name must not be null or empty");
|
||||
}
|
||||
final String handle = (startURL == null)
|
||||
? Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(name)).substring(0, Word.commonHashLength)
|
||||
: ASCII.String(startURL.hash());
|
||||
put(HANDLE, handle);
|
||||
put(NAME, name);
|
||||
put(START_URL, (startURL == null) ? "" : startURL.toNormalform(true, false));
|
||||
|
@ -102,37 +129,75 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
|
|||
put(CACHE_STRAGEGY, cacheStrategy.toString());
|
||||
}
|
||||
|
||||
/**
 * Constructor which creates a CrawlProfile from the values in a Map.
 * @param ext map whose entries are copied into the new profile; when it
 *            is null the profile is created without any entries
 */
public CrawlProfile(final Map<String, String> ext) {
    super((ext != null) ? ext.size() : 1);
    if (ext == null) {
        return;
    }
    putAll(ext);
}
|
||||
|
||||
public void put(final String key, final boolean value) {
|
||||
|
||||
/**
|
||||
* Adds a parameter to CrawlProfile.
|
||||
* @param key name of the parameter
|
||||
* @param value value of the parameter
|
||||
*/
|
||||
public final void put(final String key, final boolean value) {
|
||||
super.put(key, Boolean.toString(value));
|
||||
}
|
||||
|
||||
public void put(final String key, final int value) {
|
||||
/**
|
||||
* Adds a parameter to CrawlProfile.
|
||||
* @param key name of the parameter
|
||||
* @param value value of the parameter
|
||||
*/
|
||||
public final void put(final String key, final int value) {
|
||||
super.put(key, Integer.toString(value));
|
||||
}
|
||||
|
||||
public void put(final String key, final long value) {
|
||||
/**
|
||||
* Adds a parameter to CrawlProfile.
|
||||
* @param key name of the parameter
|
||||
* @param value value of the parameter
|
||||
*/
|
||||
public final void put(final String key, final long value) {
|
||||
super.put(key, Long.toString(value));
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets handle of the CrawlProfile.
|
||||
* @return handle of the profile
|
||||
*/
|
||||
public String handle() {
|
||||
final String r = get(HANDLE);
|
||||
//if (r == null) return null;
|
||||
return r;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the name of the CrawlProfile.
|
||||
* @return name of the profile
|
||||
*/
|
||||
public String name() {
|
||||
final String r = get(NAME);
|
||||
if (r == null) return "";
|
||||
return r;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the root URL of the crawl job.
|
||||
* @return root URL
|
||||
*/
|
||||
public String startURL() {
|
||||
final String r = get(START_URL);
|
||||
return r;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the regex which must be matched by URLs in order to be crawled.
|
||||
* @return regex which must be matched
|
||||
*/
|
||||
public Pattern mustMatchPattern() {
|
||||
if (this.mustmatch == null) {
|
||||
String r = get(FILTER_MUSTMATCH);
|
||||
|
@ -141,6 +206,11 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
|
|||
}
|
||||
return this.mustmatch;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the regex which must not be matched by URLs in order to be crawled.
|
||||
* @return regex which must not be matched
|
||||
*/
|
||||
public Pattern mustNotMatchPattern() {
|
||||
if (this.mustnotmatch == null) {
|
||||
String r = get(FILTER_MUSTNOTMATCH);
|
||||
|
@ -149,6 +219,12 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
|
|||
}
|
||||
return this.mustnotmatch;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets depth of crawl job (or height of the tree which will be
|
||||
* created by the crawler).
|
||||
* @return depth of crawl job
|
||||
*/
|
||||
public int depth() {
|
||||
final String r = get(DEPTH);
|
||||
if (r == null) return 0;
|
||||
|
@ -159,6 +235,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
|
|||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
public CacheStrategy cacheStrategy() {
|
||||
final String r = get(CACHE_STRAGEGY);
|
||||
if (r == null) return CacheStrategy.IFEXIST;
|
||||
|
@ -169,9 +246,15 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
|
|||
return CacheStrategy.IFEXIST;
|
||||
}
|
||||
}
|
||||
|
||||
public void setCacheStrategy(final CacheStrategy newStrategy) {
|
||||
put(CACHE_STRAGEGY, newStrategy.toString());
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the minimum age that an entry must have to be re-crawled.
|
||||
* @return time in ms
|
||||
*/
|
||||
public long recrawlIfOlder() {
|
||||
// returns a long (millis) that is the minimum age that
|
||||
// an entry must have to be re-crawled
|
||||
|
@ -185,6 +268,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
|
|||
return 0L;
|
||||
}
|
||||
}
|
||||
|
||||
public int domMaxPages() {
|
||||
// this is the maximum number of pages that are crawled for a single domain
|
||||
// if -1, this means no limit
|
||||
|
@ -199,26 +283,31 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
|
|||
return Integer.MAX_VALUE;
|
||||
}
|
||||
}
|
||||
|
||||
public boolean crawlingQ() {
|
||||
final String r = get(CRAWLING_Q);
|
||||
if (r == null) return false;
|
||||
return (r.equals(Boolean.TRUE.toString()));
|
||||
}
|
||||
|
||||
public boolean pushSolr() {
|
||||
final String r = get(PUSH_SOLR);
|
||||
if (r == null) return true;
|
||||
return (r.equals(Boolean.TRUE.toString()));
|
||||
}
|
||||
|
||||
public boolean indexText() {
|
||||
final String r = get(INDEX_TEXT);
|
||||
if (r == null) return true;
|
||||
return (r.equals(Boolean.TRUE.toString()));
|
||||
}
|
||||
|
||||
public boolean indexMedia() {
|
||||
final String r = get(INDEX_MEDIA);
|
||||
if (r == null) return true;
|
||||
return (r.equals(Boolean.TRUE.toString()));
|
||||
}
|
||||
|
||||
public boolean storeHTCache() {
|
||||
final String r = get(STORE_HTCACHE);
|
||||
if (r == null) return false;
|
||||
|
@ -229,16 +318,19 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
|
|||
if (r == null) return false;
|
||||
return (r.equals(Boolean.TRUE.toString()));
|
||||
}
|
||||
|
||||
public boolean excludeStaticStopwords() {
|
||||
final String r = get(XSSTOPW);
|
||||
if (r == null) return false;
|
||||
return (r.equals(Boolean.TRUE.toString()));
|
||||
}
|
||||
|
||||
public boolean excludeDynamicStopwords() {
|
||||
final String r = get(XDSTOPW);
|
||||
if (r == null) return false;
|
||||
return (r.equals(Boolean.TRUE.toString()));
|
||||
}
|
||||
|
||||
public boolean excludeParentStopwords() {
|
||||
final String r = get(XPSTOPW);
|
||||
if (r == null) return false;
|
||||
|
|
|
@ -39,6 +39,7 @@ import net.yacy.kelondro.logging.Log;
|
|||
import net.yacy.kelondro.order.NaturalOrder;
|
||||
import net.yacy.kelondro.util.FileUtils;
|
||||
import net.yacy.kelondro.util.kelondroException;
|
||||
import net.yacy.repository.RegexHelper;
|
||||
|
||||
public final class CrawlSwitchboard {
|
||||
|
||||
|
@ -52,6 +53,7 @@ public final class CrawlSwitchboard {
|
|||
|
||||
public static final String DBFILE_ACTIVE_CRAWL_PROFILES = "crawlProfilesActive.heap";
|
||||
public static final String DBFILE_PASSIVE_CRAWL_PROFILES = "crawlProfilesPassive.heap";
|
||||
public static final String DBFILE_INVALID_CRAWL_PROFILES = "crawlProfilesInvalid.heap";
|
||||
|
||||
public static final long CRAWL_PROFILE_PROXY_RECRAWL_CYCLE = 60L * 24L;
|
||||
public static final long CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE = 60L * 24L * 30L;
|
||||
|
@ -61,7 +63,7 @@ public final class CrawlSwitchboard {
|
|||
public static final long CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE = 60L * 24L * 30L;
|
||||
|
||||
private final Log log;
|
||||
private Map<byte[], Map<String, String>> profilesActiveCrawls, profilesPassiveCrawls;
|
||||
private Map<byte[], Map<String, String>> profilesActiveCrawls, profilesPassiveCrawls, profilesInvalidCrawls;
|
||||
public CrawlProfile defaultProxyProfile;
|
||||
public CrawlProfile defaultRemoteProfile;
|
||||
public CrawlProfile defaultTextSnippetLocalProfile, defaultTextSnippetGlobalProfile;
|
||||
|
@ -87,40 +89,37 @@ public final class CrawlSwitchboard {
|
|||
this.queuesRoot.mkdirs();
|
||||
this.log.logConfig("Initializing Crawl Profiles");
|
||||
|
||||
final File profilesInvalidFile = new File(queuesRoot, DBFILE_INVALID_CRAWL_PROFILES);
|
||||
this.profilesInvalidCrawls = loadFromDB(profilesInvalidFile);
|
||||
|
||||
final File profilesActiveFile = new File(queuesRoot, DBFILE_ACTIVE_CRAWL_PROFILES);
|
||||
try {
|
||||
this.profilesActiveCrawls = new MapHeap(profilesActiveFile, Word.commonHashLength, NaturalOrder.naturalOrder, 1024 * 64, 500, ' ');
|
||||
} catch (final IOException e) {
|
||||
Log.logException(e);Log.logException(e);
|
||||
FileUtils.deletedelete(profilesActiveFile);
|
||||
try {
|
||||
this.profilesActiveCrawls = new MapHeap(profilesActiveFile, Word.commonHashLength, NaturalOrder.naturalOrder, 1024 * 64, 500, ' ');
|
||||
} catch (final IOException e1) {
|
||||
Log.logException(e1);
|
||||
this.profilesActiveCrawls = null;
|
||||
this.profilesActiveCrawls = loadFromDB(profilesActiveFile);
|
||||
for (final byte[] handle : this.profilesActiveCrawls.keySet()) {
|
||||
final CrawlProfile p;
|
||||
p = new CrawlProfile(this.profilesActiveCrawls.get(handle));
|
||||
if (!RegexHelper.isValidRegex(p.get(CrawlProfile.FILTER_MUSTMATCH))) {
|
||||
this.removeActive(handle);
|
||||
this.putInvalid(handle, p);
|
||||
Log.logWarning("CrawlProfiles", "removed Profile " + p.handle() + ": " + p.name()
|
||||
+ " from active crawls since " + CrawlProfile.FILTER_MUSTMATCH
|
||||
+ " is no valid regular expression: " + p.get(CrawlProfile.FILTER_MUSTMATCH));
|
||||
} else if (!RegexHelper.isValidRegex(p.get(CrawlProfile.FILTER_MUSTNOTMATCH))) {
|
||||
this.putInvalid(handle, p);
|
||||
this.removeActive(handle);
|
||||
Log.logWarning("CrawlProfiles", "removed Profile " + p.handle() + ": " + p.name()
|
||||
+ " from active crawls since " + CrawlProfile.FILTER_MUSTNOTMATCH
|
||||
+ " is no valid regular expression: " + p.get(CrawlProfile.FILTER_MUSTNOTMATCH));
|
||||
} else {
|
||||
Log.logInfo("CrawlProfiles", "loaded Profile " + p.handle() + ": " + p.name());
|
||||
}
|
||||
}
|
||||
for (final byte[] handle: this.profilesActiveCrawls.keySet()) {
|
||||
final CrawlProfile p = new CrawlProfile(this.profilesActiveCrawls.get(handle));
|
||||
Log.logInfo("CrawlProfiles", "loaded Profile " + p.handle() + ": " + p.name());
|
||||
|
||||
}
|
||||
initActiveCrawlProfiles();
|
||||
log.logInfo("Loaded active crawl profiles from file " + profilesActiveFile.getName() + ", " + this.profilesActiveCrawls.size() + " entries");
|
||||
|
||||
final File profilesPassiveFile = new File(queuesRoot, DBFILE_PASSIVE_CRAWL_PROFILES);
|
||||
try {
|
||||
this.profilesPassiveCrawls = new MapHeap(profilesPassiveFile, Word.commonHashLength, NaturalOrder.naturalOrder, 1024 * 64, 500, ' ');
|
||||
} catch (final IOException e) {
|
||||
Log.logException(e);Log.logException(e);
|
||||
FileUtils.deletedelete(profilesActiveFile);
|
||||
try {
|
||||
this.profilesPassiveCrawls = new MapHeap(profilesPassiveFile, Word.commonHashLength, NaturalOrder.naturalOrder, 1024 * 64, 500, ' ');
|
||||
} catch (final IOException e1) {
|
||||
Log.logException(e1);
|
||||
this.profilesPassiveCrawls = null;
|
||||
}
|
||||
}
|
||||
for (final byte[] handle: this.profilesPassiveCrawls.keySet()) {
|
||||
this.profilesPassiveCrawls = loadFromDB(profilesPassiveFile);
|
||||
for (final byte[] handle : this.profilesPassiveCrawls.keySet()) {
|
||||
final CrawlProfile p = new CrawlProfile(this.profilesPassiveCrawls.get(handle));
|
||||
Log.logInfo("CrawlProfiles", "loaded Profile " + p.handle() + ": " + p.name());
|
||||
}
|
||||
|
@ -135,6 +134,13 @@ public final class CrawlSwitchboard {
|
|||
if (m == null) return null;
|
||||
return new CrawlProfile(m);
|
||||
}
|
||||
|
||||
public CrawlProfile getInvalid(final byte[] profileKey) {
|
||||
if (profileKey == null) return null;
|
||||
final Map<String, String> m = this.profilesInvalidCrawls.get(profileKey);
|
||||
if (m == null) return null;
|
||||
return new CrawlProfile(m);
|
||||
}
|
||||
|
||||
public CrawlProfile getPassive(final byte[] profileKey) {
|
||||
if (profileKey == null) return null;
|
||||
|
@ -146,6 +152,10 @@ public final class CrawlSwitchboard {
|
|||
public Set<byte[]> getActive() {
|
||||
return this.profilesActiveCrawls.keySet();
|
||||
}
|
||||
|
||||
public Set<byte[]> getInvalid() {
|
||||
return this.profilesInvalidCrawls.keySet();
|
||||
}
|
||||
|
||||
public Set<byte[]> getPassive() {
|
||||
return this.profilesPassiveCrawls.keySet();
|
||||
|
@ -155,6 +165,11 @@ public final class CrawlSwitchboard {
|
|||
if (profileKey == null) return;
|
||||
this.profilesActiveCrawls.remove(profileKey);
|
||||
}
|
||||
|
||||
public void removeInvalid(final byte[] profileKey) {
|
||||
if (profileKey == null) return;
|
||||
this.profilesInvalidCrawls.remove(profileKey);
|
||||
}
|
||||
|
||||
public void removePassive(final byte[] profileKey) {
|
||||
if (profileKey == null) return;
|
||||
|
@ -164,6 +179,10 @@ public final class CrawlSwitchboard {
|
|||
public void putActive(final byte[] profileKey, final CrawlProfile profile) {
|
||||
this.profilesActiveCrawls.put(profileKey, profile);
|
||||
}
|
||||
|
||||
public void putInvalid(final byte[] profileKey, final CrawlProfile profile) {
|
||||
this.profilesInvalidCrawls.put(profileKey, profile);
|
||||
}
|
||||
|
||||
public void putPassive(final byte[] profileKey, final CrawlProfile profile) {
|
||||
this.profilesPassiveCrawls.put(profileKey, profile);
|
||||
|
@ -302,7 +321,31 @@ public final class CrawlSwitchboard {
|
|||
|
||||
public void close() {
|
||||
((MapHeap) this.profilesActiveCrawls).close();
|
||||
((MapHeap) this.profilesInvalidCrawls).close();
|
||||
((MapHeap) this.profilesPassiveCrawls).close();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Loads crawl profiles from a DB file.
|
||||
* @param file DB file
|
||||
* @return crawl profile data
|
||||
*/
|
||||
private Map<byte[], Map<String, String>> loadFromDB(final File file) {
|
||||
Map<byte[], Map<String, String>> ret;
|
||||
try {
|
||||
ret = new MapHeap(file, Word.commonHashLength, NaturalOrder.naturalOrder, 1024 * 64, 500, ' ');
|
||||
} catch (final IOException e) {
|
||||
Log.logException(e);Log.logException(e);
|
||||
FileUtils.deletedelete(file);
|
||||
try {
|
||||
ret = new MapHeap(file, Word.commonHashLength, NaturalOrder.naturalOrder, 1024 * 64, 500, ' ');
|
||||
} catch (final IOException e1) {
|
||||
Log.logException(e1);
|
||||
ret = null;
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,11 +1,43 @@
|
|||
// CacheStrategy.java
|
||||
// ------------------------
|
||||
// part of YaCy
|
||||
// (C) by Michael Peter Christen; mc@yacy.net
|
||||
// first published on http://www.anomic.de
|
||||
// Frankfurt, Germany, 2011
|
||||
//
|
||||
// $LastChangedDate$
|
||||
// $LastChangedRevision$
|
||||
// $LastChangedBy$
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
package net.yacy.cora.services.federated.yacy;
|
||||
|
||||
public enum CacheStrategy {
|
||||
|
||||
NOCACHE(0), // never use the cache, all content from fresh internet source
|
||||
IFFRESH(1), // use the cache if the cache exists and is fresh using the proxy-fresh rules
|
||||
IFEXIST(2), // use the cache if the cache exist. Do no check freshness. Otherwise use online source.
|
||||
CACHEONLY(3); // never go online, use all content from cache. If no cache entry exist, consider content nevertheless as available
|
||||
/** Never use the cache, all content from fresh internet source. */
|
||||
NOCACHE(0),
|
||||
/** Use the cache if the cache exists and is fresh using the
|
||||
* proxy-fresh rules. */
|
||||
IFFRESH(1),
|
||||
/** Use the cache if the cache exists. Do not check freshness. Otherwise
|
||||
* use online source. */
|
||||
IFEXIST(2),
|
||||
/** Never go online, use all content from cache. If no cache entry exist,
|
||||
* consider content nevertheless as available */
|
||||
CACHEONLY(3);
|
||||
// the fifth case may be that the CacheStrategy object is assigned NULL. That means that no snippet creation is wanted.
|
||||
|
||||
public int code;
|
||||
|
@ -14,6 +46,7 @@ public enum CacheStrategy {
|
|||
this.code = code;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return Integer.toString(this.code);
|
||||
}
|
||||
|
|
|
@ -444,7 +444,7 @@ public class Blacklist {
|
|||
path = element.substring(slashPos + 1);
|
||||
}
|
||||
|
||||
if (!allowRegex || !isValidRegex(host)) {
|
||||
if (!allowRegex || !RegexHelper.isValidRegex(host)) {
|
||||
final int i = host.indexOf('*');
|
||||
|
||||
// check whether host begins illegally
|
||||
|
@ -470,33 +470,18 @@ public class Blacklist {
|
|||
if (host.indexOf("*", i + 1) > -1) {
|
||||
return BlacklistError.TWO_WILDCARDS_IN_HOST;
|
||||
}
|
||||
} else if (allowRegex && !isValidRegex(host)) {
|
||||
} else if (allowRegex && !RegexHelper.isValidRegex(host)) {
|
||||
return BlacklistError.HOST_REGEX;
|
||||
}
|
||||
|
||||
// check for errors on regex-compiling path
|
||||
if (!isValidRegex(path) && !"*".equals(path)) {
|
||||
if (!RegexHelper.isValidRegex(path) && !"*".equals(path)) {
|
||||
return BlacklistError.PATH_REGEX;
|
||||
}
|
||||
|
||||
return BlacklistError.NO_ERROR;
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if a given expression is a valid regular expression.
|
||||
* @param expression The expression to be checked.
|
||||
* @return True if the expression is a valid regular expression, else false.
|
||||
*/
|
||||
private static boolean isValidRegex(final String expression) {
|
||||
boolean ret = true;
|
||||
try {
|
||||
Pattern.compile(expression);
|
||||
} catch (final PatternSyntaxException e) {
|
||||
ret = false;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
public static String defaultBlacklist(final File listsPath) {
|
||||
final List<String> dirlist = FileUtils.getDirListing(listsPath, Blacklist.BLACKLIST_FILENAME_FILTER);
|
||||
if (dirlist.isEmpty()) {
|
||||
|
|
|
@ -45,6 +45,7 @@ public class FilterEngine {
|
|||
this.types = types;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compareTo(FilterEntry fe) {
|
||||
return this.path.compareToIgnoreCase(fe.path);
|
||||
}
|
||||
|
@ -229,7 +230,7 @@ public class FilterEngine {
|
|||
path = element.substring(slashPos + 1);
|
||||
}
|
||||
|
||||
if (!allowRegex || !isValidRegex(host)) {
|
||||
if (!allowRegex || !RegexHelper.isValidRegex(host)) {
|
||||
final int i = host.indexOf('*');
|
||||
|
||||
// check whether host begins illegally
|
||||
|
@ -255,33 +256,16 @@ public class FilterEngine {
|
|||
if (host.indexOf("*", i + 1) > -1) {
|
||||
return ERR_TWO_WILDCARDS_IN_HOST;
|
||||
}
|
||||
} else if (allowRegex && !isValidRegex(host)) {
|
||||
} else if (allowRegex && !RegexHelper.isValidRegex(host)) {
|
||||
return ERR_HOST_REGEX;
|
||||
}
|
||||
|
||||
// check for errors on regex-compiling path
|
||||
if (!isValidRegex(path) && !path.equals("*")) {
|
||||
if (!RegexHelper.isValidRegex(path) && !path.equals("*")) {
|
||||
return ERR_PATH_REGEX;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if a given expression is a valid regular expression.
|
||||
* @param expression The expression to be checked.
|
||||
* @return True if the expression is a valid regular expression, else false.
|
||||
*/
|
||||
private static boolean isValidRegex(String expression) {
|
||||
boolean ret = true;
|
||||
try {
|
||||
Pattern.compile(expression);
|
||||
} catch (final PatternSyntaxException e) {
|
||||
|
||||
ret = false;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
52
source/net/yacy/repository/RegexHelper.java
Normal file
52
source/net/yacy/repository/RegexHelper.java
Normal file
|
@ -0,0 +1,52 @@
|
|||
// RegexHelper.java
|
||||
// ------------------------
|
||||
// part of YaCy
|
||||
// (C) by Marc Nause; marc.nause@gmx.de
|
||||
// first published on http://www.anomic.de
|
||||
// Braunchweig, Germany, 2011
|
||||
//
|
||||
// $LastChangedDate$
|
||||
// $LastChangedRevision$
|
||||
// $LastChangedBy$
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
package net.yacy.repository;
|
||||
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.regex.PatternSyntaxException;
|
||||
|
||||
|
||||
/**
 * Static helper methods for working with regular expressions.
 */
public final class RegexHelper {

    /** Private constructor to avoid instantiation of static class. */
    private RegexHelper() { }

    /**
     * Checks if a given expression is a valid regular expression.
     * A null expression is reported as invalid instead of raising a
     * NullPointerException, so that values read from a map which may lack
     * the key can be passed in directly.
     * @param expression expression to be checked, may be null
     * @return true if the expression is a valid regular expression, else false
     */
    public static boolean isValidRegex(final String expression) {
        if (expression == null) {
            return false;
        }
        try {
            Pattern.compile(expression);
            return true;
        } catch (final PatternSyntaxException e) {
            // the expression does not compile, hence it is not valid
            return false;
        }
    }
}
|
Loading…
Reference in New Issue
Block a user