Self-healing of mistakenly deactivated crawl profiles.

This fixes a bug which can happen in rare cases when a crawl start and a
cleanup process run at the same time.
Michael Peter Christen 2013-09-25 18:27:54 +02:00
parent 095053a9b4
commit 91a875dff5
10 changed files with 40 additions and 22 deletions
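For context, the race being fixed: the cleanup process can deactivate a profile (moving it from the active to the passive stack) while a crawl that is just starting still holds that profile's handle, so a lookup against the active stack alone returns null and the queued entry is skipped or rejected. The new CrawlSwitchboard.get() introduced below heals this by falling back to the passive stack and shifting a found profile back to the active one. A minimal, self-contained sketch of that lookup pattern follows; the ProfileStacks class and its in-memory maps are illustrative stand-ins only, not the real CrawlSwitchboard, which keeps its profile stacks in persistent maps.

import java.util.concurrent.ConcurrentHashMap;

// Illustrative stand-in for the active/passive profile stacks of CrawlSwitchboard.
public class ProfileStacks {
    private final ConcurrentHashMap<String, String> active = new ConcurrentHashMap<>();
    private final ConcurrentHashMap<String, String> passive = new ConcurrentHashMap<>();

    // Self-healing lookup: prefer the active stack; if a cleanup has moved the
    // profile to the passive stack in the meantime, promote it back to active.
    public String get(final String profileKey) {
        String profile = this.active.get(profileKey);
        if (profile != null) return profile;
        profile = this.passive.get(profileKey);
        if (profile == null) return null;
        this.active.put(profileKey, profile);  // re-activate the profile
        this.passive.remove(profileKey);       // remove the stale passive copy
        return profile;
    }

    public static void main(final String[] args) {
        final ProfileStacks stacks = new ProfileStacks();
        stacks.passive.put("h1", "profile-1");        // simulate a cleaned-up profile
        System.out.println(stacks.get("h1"));         // prints profile-1 (recovered)
        System.out.println(stacks.active.get("h1"));  // prints profile-1 (re-activated)
    }
}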

Balancer.java

@@ -394,7 +394,7 @@ public class Balancer {
// at this point we must check if the crawlEntry has relevance because the crawl profile still exists
// if not: return null. A calling method must handle the null value and try again
- profileEntry = cs.getActive(UTF8.getBytes(crawlEntry.profileHandle()));
+ profileEntry = cs.get(UTF8.getBytes(crawlEntry.profileHandle()));
if (profileEntry == null) {
ConcurrentLog.warn("Balancer", "no profile entry for handle " + crawlEntry.profileHandle());
continue;
@@ -481,7 +481,7 @@ public class Balancer {
rowEntry = this.urlFileIndex.get(urlhash, false);
if (rowEntry == null) continue; // may have been deleted there manwhile
Request crawlEntry = new Request(rowEntry);
- CrawlProfile profileEntry = cs.getActive(UTF8.getBytes(crawlEntry.profileHandle()));
+ CrawlProfile profileEntry = cs.get(UTF8.getBytes(crawlEntry.profileHandle()));
if (profileEntry == null) {
ConcurrentLog.warn("Balancer", "no profile entry for handle " + crawlEntry.profileHandle());
continue;

CrawlStacker.java

@@ -149,7 +149,7 @@ public final class CrawlStacker {
// if the url was rejected we store it into the error URL db
if (rejectReason != null && !rejectReason.startsWith("double in")) {
- final CrawlProfile profile = this.crawler.getActive(UTF8.getBytes(entry.profileHandle()));
+ final CrawlProfile profile = this.crawler.get(UTF8.getBytes(entry.profileHandle()));
this.nextQueue.errorURL.push(entry.url(), profile, FailCategory.FINAL_LOAD_CONTEXT, rejectReason, -1);
}
} catch (final Exception e) {
@@ -294,7 +294,8 @@ public final class CrawlStacker {
public String stackCrawl(final Request entry) {
//this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'");
- final CrawlProfile profile = this.crawler.getActive(UTF8.getBytes(entry.profileHandle()));
+ byte[] handle = UTF8.getBytes(entry.profileHandle());
+ final CrawlProfile profile = this.crawler.get(handle);
String error;
if (profile == null) {
error = "LOST STACKER PROFILE HANDLE '" + entry.profileHandle() + "' for URL " + entry.url();

CrawlSwitchboard.java

@@ -166,6 +166,23 @@ public final class CrawlSwitchboard {
/ 1024);
}
+ /**
+  * Get a profile from active or passive stack. Should be used to be sure not to miss old, cleaned profiles.
+  * A profile that was discovered from the passive stack is automatically shifted back to the active stack.
+  * @param profileKey
+  * @return
+  */
+ public CrawlProfile get(final byte[] profileKey) {
+     CrawlProfile profile = getActive(profileKey);
+     if (profile != null) return profile;
+     profile = getPassive(profileKey);
+     if (profile == null) return null;
+     // clean up
+     this.putActive(profileKey, profile);
+     this.removePassive(profileKey);
+     return profile;
+ }
public CrawlProfile getActive(final byte[] profileKey) {
if ( profileKey == null ) {
return null;

CrawlProfile.java

@@ -148,7 +148,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
}
if (name.length() > 256) name = name.substring(256);
this.doms = new ConcurrentHashMap<String, AtomicInteger>();
- final String handle = Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(name + crawlerUrlMustMatch + depth + crawlerUrlMustNotMatch + domMaxPages)).substring(0, Word.commonHashLength);
+ final String handle = Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(name + crawlerUrlMustMatch + depth + crawlerUrlMustNotMatch + domMaxPages + collections)).substring(0, Word.commonHashLength);
put(HANDLE, handle);
put(NAME, name);
put(AGENT_NAME, userAgentName);

CrawlQueues.java

@@ -255,7 +255,7 @@ public class CrawlQueues {
this.log.severe(stats + ": NULL PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url());
return true;
}
- final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(profileHandle));
+ final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(profileHandle));
if (profile == null) {
this.log.severe(stats + ": NULL PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url());
return true;
@@ -297,7 +297,7 @@ public class CrawlQueues {
* @return
*/
private void load(final Request urlEntry, final String stats, final String profileHandle) {
- final CrawlProfile profile = this.sb.crawler.getActive(UTF8.getBytes(profileHandle));
+ final CrawlProfile profile = this.sb.crawler.get(UTF8.getBytes(profileHandle));
if (profile != null) {
// check if the protocol is supported
@@ -606,7 +606,7 @@ public class CrawlQueues {
this.request.setStatus("worker-initialized", WorkflowJob.STATUS_INITIATED);
this.code = Integer.valueOf(entry.hashCode());
this.setPriority(Thread.MIN_PRIORITY); // http requests from the crawler should not cause that other functions work worse
- this.profile = CrawlQueues.this.sb.crawler.getActive(UTF8.getBytes(this.request.profileHandle()));
+ this.profile = CrawlQueues.this.sb.crawler.get(UTF8.getBytes(this.request.profileHandle()));
}
private long age() {

FTPLoader.java

@@ -101,7 +101,7 @@ public class FTPLoader {
// create new ftp client
final FTPClient ftpClient = new FTPClient();
- final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
+ final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
// get a connection
if (openConnection(ftpClient, entryUrl)) {
// test if the specified file is a directory
@@ -249,7 +249,7 @@ public class FTPLoader {
// create response with metadata only
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
- final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
+ final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
final Response response = new Response(
request,
requestHeader,
@@ -264,7 +264,7 @@ public class FTPLoader {
final byte[] b = ftpClient.get(path);
// create a response
- final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
+ final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
final Response response = new Response(
request,
requestHeader,

FileLoader.java

@@ -83,7 +83,7 @@ public class FileLoader {
ResponseHeader responseHeader = new ResponseHeader(200);
responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date()));
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
- final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
+ final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
Response response = new Response(
request,
requestHeader,
@@ -123,7 +123,7 @@ public class FileLoader {
// create response with metadata only
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
- final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
+ final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
Response response = new Response(
request,
requestHeader,
@@ -140,7 +140,7 @@ public class FileLoader {
is.close();
// create response with loaded content
- final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
+ final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
Response response = new Response(
request,
requestHeader,

SMBLoader.java

@@ -101,7 +101,7 @@ public class SMBLoader {
ResponseHeader responseHeader = new ResponseHeader(200);
responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date()));
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
- final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
+ final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
Response response = new Response(
request,
requestHeader,
@@ -141,7 +141,7 @@ public class SMBLoader {
// create response with metadata only
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
- final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes());
+ final CrawlProfile profile = this.sb.crawler.get(request.profileHandle().getBytes());
Response response = new Response(
request,
requestHeader,
@@ -158,7 +158,7 @@ public class SMBLoader {
is.close();
// create response with loaded content
- final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes());
+ final CrawlProfile profile = this.sb.crawler.get(request.profileHandle().getBytes());
Response response = new Response(
request,
requestHeader,

LoaderDispatcher.java

@@ -187,7 +187,7 @@ public final class LoaderDispatcher {
if (url.isFile() || url.isSMB()) cacheStrategy = CacheStrategy.NOCACHE; // load just from the file system
final String protocol = url.getProtocol();
final String host = url.getHost();
- final CrawlProfile crawlProfile = request.profileHandle() == null ? null : this.sb.crawler.getActive(UTF8.getBytes(request.profileHandle()));
+ final CrawlProfile crawlProfile = request.profileHandle() == null ? null : this.sb.crawler.get(UTF8.getBytes(request.profileHandle()));
// check if url is in blacklist
if (blacklistType != null && host != null && Switchboard.urlBlacklist.isListed(blacklistType, host.toLowerCase(), url.getFile())) {

Switchboard.java

@@ -2514,7 +2514,7 @@ public final class Switchboard extends serverSwitch {
} else {
// we consider this as fail urls to have a tracking of the problem
if (rejectReason != null && !rejectReason.startsWith("double in")) {
- final CrawlProfile profile = this.crawler.getActive(UTF8.getBytes(response.profile().handle()));
+ final CrawlProfile profile = this.crawler.get(UTF8.getBytes(response.profile().handle()));
this.crawlStacker.nextQueue.errorURL.push(response.url(), profile, FailCategory.FINAL_LOAD_CONTEXT, rejectReason, -1);
}
}
@@ -3002,7 +3002,7 @@ public final class Switchboard extends serverSwitch {
continue;
}
final Request request = this.loader.request(e.getValue(), true, true);
- final CrawlProfile profile = this.crawler.getActive(ASCII.getBytes(request.profileHandle()));
+ final CrawlProfile profile = this.crawler.get(ASCII.getBytes(request.profileHandle()));
final String acceptedError = this.crawlStacker.checkAcceptance(e.getValue(), profile, 0);
if (acceptedError != null) {
this.log.warn("addToIndex: cannot load " + urlName + ": " + acceptedError);
@@ -3032,7 +3032,7 @@ public final class Switchboard extends serverSwitch {
final Document[] documents = response.parse();
if (documents != null) {
for (final Document document: documents) {
- final CrawlProfile profile = crawler.getActive(ASCII.getBytes(request.profileHandle()));
+ final CrawlProfile profile = crawler.get(ASCII.getBytes(request.profileHandle()));
if (document.indexingDenied() && (profile == null || profile.obeyHtmlRobotsNoindex())) {
throw new Parser.Failure("indexing is denied", url);
}
@@ -3075,7 +3075,7 @@ public final class Switchboard extends serverSwitch {
if (existingids.contains(e.getKey())) continue; // double
DigestURL url = e.getValue();
final Request request = this.loader.request(url, true, true);
- final CrawlProfile profile = this.crawler.getActive(ASCII.getBytes(request.profileHandle()));
+ final CrawlProfile profile = this.crawler.get(ASCII.getBytes(request.profileHandle()));
final String acceptedError = this.crawlStacker.checkAcceptance(url, profile, 0);
if (acceptedError != null) {
this.log.info("addToCrawler: cannot load " + url.toNormalform(true) + ": " + acceptedError);