mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
Self-healing of mistakenly deactivated crawl profiles. This fixes a bug
which can occur in rare cases when a crawl start and a cleanup process run at the same time.
This commit is contained in:
parent
095053a9b4
commit
91a875dff5
|
@ -394,7 +394,7 @@ public class Balancer {
|
|||
|
||||
// at this point we must check if the crawlEntry has relevance because the crawl profile still exists
|
||||
// if not: return null. A calling method must handle the null value and try again
|
||||
profileEntry = cs.getActive(UTF8.getBytes(crawlEntry.profileHandle()));
|
||||
profileEntry = cs.get(UTF8.getBytes(crawlEntry.profileHandle()));
|
||||
if (profileEntry == null) {
|
||||
ConcurrentLog.warn("Balancer", "no profile entry for handle " + crawlEntry.profileHandle());
|
||||
continue;
|
||||
|
@ -481,7 +481,7 @@ public class Balancer {
|
|||
rowEntry = this.urlFileIndex.get(urlhash, false);
|
||||
if (rowEntry == null) continue; // may have been deleted there meanwhile
|
||||
Request crawlEntry = new Request(rowEntry);
|
||||
CrawlProfile profileEntry = cs.getActive(UTF8.getBytes(crawlEntry.profileHandle()));
|
||||
CrawlProfile profileEntry = cs.get(UTF8.getBytes(crawlEntry.profileHandle()));
|
||||
if (profileEntry == null) {
|
||||
ConcurrentLog.warn("Balancer", "no profile entry for handle " + crawlEntry.profileHandle());
|
||||
continue;
|
||||
|
|
|
@ -149,7 +149,7 @@ public final class CrawlStacker {
|
|||
|
||||
// if the url was rejected we store it into the error URL db
|
||||
if (rejectReason != null && !rejectReason.startsWith("double in")) {
|
||||
final CrawlProfile profile = this.crawler.getActive(UTF8.getBytes(entry.profileHandle()));
|
||||
final CrawlProfile profile = this.crawler.get(UTF8.getBytes(entry.profileHandle()));
|
||||
this.nextQueue.errorURL.push(entry.url(), profile, FailCategory.FINAL_LOAD_CONTEXT, rejectReason, -1);
|
||||
}
|
||||
} catch (final Exception e) {
|
||||
|
@ -294,7 +294,8 @@ public final class CrawlStacker {
|
|||
public String stackCrawl(final Request entry) {
|
||||
//this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'");
|
||||
|
||||
final CrawlProfile profile = this.crawler.getActive(UTF8.getBytes(entry.profileHandle()));
|
||||
byte[] handle = UTF8.getBytes(entry.profileHandle());
|
||||
final CrawlProfile profile = this.crawler.get(handle);
|
||||
String error;
|
||||
if (profile == null) {
|
||||
error = "LOST STACKER PROFILE HANDLE '" + entry.profileHandle() + "' for URL " + entry.url();
|
||||
|
|
|
@ -166,6 +166,23 @@ public final class CrawlSwitchboard {
|
|||
/ 1024);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get a profile from the active or passive stack. Use this method to be sure not to miss old, cleaned-up profiles.
|
||||
* A profile that was discovered from the passive stack is automatically shifted back to the active stack.
|
||||
* @param profileKey
|
||||
* @return
|
||||
*/
|
||||
public CrawlProfile get(final byte[] profileKey) {
|
||||
CrawlProfile profile = getActive(profileKey);
|
||||
if (profile != null) return profile;
|
||||
profile = getPassive(profileKey);
|
||||
if (profile == null) return null;
|
||||
// clean up
|
||||
this.putActive(profileKey, profile);
|
||||
this.removePassive(profileKey);
|
||||
return profile;
|
||||
}
|
||||
|
||||
public CrawlProfile getActive(final byte[] profileKey) {
|
||||
if ( profileKey == null ) {
|
||||
return null;
|
||||
|
|
|
@ -148,7 +148,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
|
|||
}
|
||||
if (name.length() > 256) name = name.substring(256);
|
||||
this.doms = new ConcurrentHashMap<String, AtomicInteger>();
|
||||
final String handle = Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(name + crawlerUrlMustMatch + depth + crawlerUrlMustNotMatch + domMaxPages)).substring(0, Word.commonHashLength);
|
||||
final String handle = Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(name + crawlerUrlMustMatch + depth + crawlerUrlMustNotMatch + domMaxPages + collections)).substring(0, Word.commonHashLength);
|
||||
put(HANDLE, handle);
|
||||
put(NAME, name);
|
||||
put(AGENT_NAME, userAgentName);
|
||||
|
|
|
@ -255,7 +255,7 @@ public class CrawlQueues {
|
|||
this.log.severe(stats + ": NULL PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url());
|
||||
return true;
|
||||
}
|
||||
final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(profileHandle));
|
||||
final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(profileHandle));
|
||||
if (profile == null) {
|
||||
this.log.severe(stats + ": NULL PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url());
|
||||
return true;
|
||||
|
@ -297,7 +297,7 @@ public class CrawlQueues {
|
|||
* @return
|
||||
*/
|
||||
private void load(final Request urlEntry, final String stats, final String profileHandle) {
|
||||
final CrawlProfile profile = this.sb.crawler.getActive(UTF8.getBytes(profileHandle));
|
||||
final CrawlProfile profile = this.sb.crawler.get(UTF8.getBytes(profileHandle));
|
||||
if (profile != null) {
|
||||
|
||||
// check if the protocol is supported
|
||||
|
@ -606,7 +606,7 @@ public class CrawlQueues {
|
|||
this.request.setStatus("worker-initialized", WorkflowJob.STATUS_INITIATED);
|
||||
this.code = Integer.valueOf(entry.hashCode());
|
||||
this.setPriority(Thread.MIN_PRIORITY); // http requests from the crawler should not cause that other functions work worse
|
||||
this.profile = CrawlQueues.this.sb.crawler.getActive(UTF8.getBytes(this.request.profileHandle()));
|
||||
this.profile = CrawlQueues.this.sb.crawler.get(UTF8.getBytes(this.request.profileHandle()));
|
||||
}
|
||||
|
||||
private long age() {
|
||||
|
|
|
@ -101,7 +101,7 @@ public class FTPLoader {
|
|||
// create new ftp client
|
||||
final FTPClient ftpClient = new FTPClient();
|
||||
|
||||
final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
|
||||
final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
|
||||
// get a connection
|
||||
if (openConnection(ftpClient, entryUrl)) {
|
||||
// test if the specified file is a directory
|
||||
|
@ -249,7 +249,7 @@ public class FTPLoader {
|
|||
|
||||
// create response with metadata only
|
||||
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
|
||||
final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
|
||||
final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
|
||||
final Response response = new Response(
|
||||
request,
|
||||
requestHeader,
|
||||
|
@ -264,7 +264,7 @@ public class FTPLoader {
|
|||
final byte[] b = ftpClient.get(path);
|
||||
|
||||
// create a response
|
||||
final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
|
||||
final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
|
||||
final Response response = new Response(
|
||||
request,
|
||||
requestHeader,
|
||||
|
|
|
@ -83,7 +83,7 @@ public class FileLoader {
|
|||
ResponseHeader responseHeader = new ResponseHeader(200);
|
||||
responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date()));
|
||||
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
|
||||
final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
|
||||
final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
|
||||
Response response = new Response(
|
||||
request,
|
||||
requestHeader,
|
||||
|
@ -123,7 +123,7 @@ public class FileLoader {
|
|||
|
||||
// create response with metadata only
|
||||
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
|
||||
final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
|
||||
final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
|
||||
Response response = new Response(
|
||||
request,
|
||||
requestHeader,
|
||||
|
@ -140,7 +140,7 @@ public class FileLoader {
|
|||
is.close();
|
||||
|
||||
// create response with loaded content
|
||||
final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
|
||||
final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
|
||||
Response response = new Response(
|
||||
request,
|
||||
requestHeader,
|
||||
|
|
|
@ -101,7 +101,7 @@ public class SMBLoader {
|
|||
ResponseHeader responseHeader = new ResponseHeader(200);
|
||||
responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date()));
|
||||
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
|
||||
final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
|
||||
final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
|
||||
Response response = new Response(
|
||||
request,
|
||||
requestHeader,
|
||||
|
@ -141,7 +141,7 @@ public class SMBLoader {
|
|||
|
||||
// create response with metadata only
|
||||
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
|
||||
final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes());
|
||||
final CrawlProfile profile = this.sb.crawler.get(request.profileHandle().getBytes());
|
||||
Response response = new Response(
|
||||
request,
|
||||
requestHeader,
|
||||
|
@ -158,7 +158,7 @@ public class SMBLoader {
|
|||
is.close();
|
||||
|
||||
// create response with loaded content
|
||||
final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes());
|
||||
final CrawlProfile profile = this.sb.crawler.get(request.profileHandle().getBytes());
|
||||
Response response = new Response(
|
||||
request,
|
||||
requestHeader,
|
||||
|
|
|
@ -187,7 +187,7 @@ public final class LoaderDispatcher {
|
|||
if (url.isFile() || url.isSMB()) cacheStrategy = CacheStrategy.NOCACHE; // load just from the file system
|
||||
final String protocol = url.getProtocol();
|
||||
final String host = url.getHost();
|
||||
final CrawlProfile crawlProfile = request.profileHandle() == null ? null : this.sb.crawler.getActive(UTF8.getBytes(request.profileHandle()));
|
||||
final CrawlProfile crawlProfile = request.profileHandle() == null ? null : this.sb.crawler.get(UTF8.getBytes(request.profileHandle()));
|
||||
|
||||
// check if url is in blacklist
|
||||
if (blacklistType != null && host != null && Switchboard.urlBlacklist.isListed(blacklistType, host.toLowerCase(), url.getFile())) {
|
||||
|
|
|
@ -2514,7 +2514,7 @@ public final class Switchboard extends serverSwitch {
|
|||
} else {
|
||||
// we consider this as fail urls to have a tracking of the problem
|
||||
if (rejectReason != null && !rejectReason.startsWith("double in")) {
|
||||
final CrawlProfile profile = this.crawler.getActive(UTF8.getBytes(response.profile().handle()));
|
||||
final CrawlProfile profile = this.crawler.get(UTF8.getBytes(response.profile().handle()));
|
||||
this.crawlStacker.nextQueue.errorURL.push(response.url(), profile, FailCategory.FINAL_LOAD_CONTEXT, rejectReason, -1);
|
||||
}
|
||||
}
|
||||
|
@ -3002,7 +3002,7 @@ public final class Switchboard extends serverSwitch {
|
|||
continue;
|
||||
}
|
||||
final Request request = this.loader.request(e.getValue(), true, true);
|
||||
final CrawlProfile profile = this.crawler.getActive(ASCII.getBytes(request.profileHandle()));
|
||||
final CrawlProfile profile = this.crawler.get(ASCII.getBytes(request.profileHandle()));
|
||||
final String acceptedError = this.crawlStacker.checkAcceptance(e.getValue(), profile, 0);
|
||||
if (acceptedError != null) {
|
||||
this.log.warn("addToIndex: cannot load " + urlName + ": " + acceptedError);
|
||||
|
@ -3032,7 +3032,7 @@ public final class Switchboard extends serverSwitch {
|
|||
final Document[] documents = response.parse();
|
||||
if (documents != null) {
|
||||
for (final Document document: documents) {
|
||||
final CrawlProfile profile = crawler.getActive(ASCII.getBytes(request.profileHandle()));
|
||||
final CrawlProfile profile = crawler.get(ASCII.getBytes(request.profileHandle()));
|
||||
if (document.indexingDenied() && (profile == null || profile.obeyHtmlRobotsNoindex())) {
|
||||
throw new Parser.Failure("indexing is denied", url);
|
||||
}
|
||||
|
@ -3075,7 +3075,7 @@ public final class Switchboard extends serverSwitch {
|
|||
if (existingids.contains(e.getKey())) continue; // double
|
||||
DigestURL url = e.getValue();
|
||||
final Request request = this.loader.request(url, true, true);
|
||||
final CrawlProfile profile = this.crawler.getActive(ASCII.getBytes(request.profileHandle()));
|
||||
final CrawlProfile profile = this.crawler.get(ASCII.getBytes(request.profileHandle()));
|
||||
final String acceptedError = this.crawlStacker.checkAcceptance(url, profile, 0);
|
||||
if (acceptedError != null) {
|
||||
this.log.info("addToCrawler: cannot load " + url.toNormalform(true) + ": " + acceptedError);
|
||||
|
|
Loading…
Reference in New Issue
Block a user