Self-healing of mistakenly deactivated crawl profiles.

This fixes a bug which can happen in rare cases when a crawl start and a
cleanup process run at the same time.
Michael Peter Christen 2013-09-25 18:27:54 +02:00
parent 095053a9b4
commit 91a875dff5
10 changed files with 40 additions and 22 deletions
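For context, the race being fixed: the cleanup process can deactivate a profile (moving it from the active to the passive stack) while a crawl that is just starting still holds that profile's handle, so a lookup against the active stack alone returns null and the queued entry is skipped or rejected. The new CrawlSwitchboard.get() introduced below heals this by falling back to the passive stack and shifting a found profile back to the active one. A minimal, self-contained sketch of that lookup pattern follows; the ProfileStacks class and its in-memory maps are illustrative stand-ins only, not the real CrawlSwitchboard, which keeps its profile stacks in persistent maps.

import java.util.concurrent.ConcurrentHashMap;

// Illustrative stand-in for the active/passive profile stacks of CrawlSwitchboard.
public class ProfileStacks {
    private final ConcurrentHashMap<String, String> active = new ConcurrentHashMap<>();
    private final ConcurrentHashMap<String, String> passive = new ConcurrentHashMap<>();

    // Self-healing lookup: prefer the active stack; if a cleanup has moved the
    // profile to the passive stack in the meantime, promote it back to active.
    public String get(final String profileKey) {
        String profile = this.active.get(profileKey);
        if (profile != null) return profile;
        profile = this.passive.get(profileKey);
        if (profile == null) return null;
        this.active.put(profileKey, profile);  // re-activate the profile
        this.passive.remove(profileKey);       // remove the stale passive copy
        return profile;
    }

    public static void main(final String[] args) {
        final ProfileStacks stacks = new ProfileStacks();
        stacks.passive.put("h1", "profile-1");        // simulate a cleaned-up profile
        System.out.println(stacks.get("h1"));         // prints profile-1 (recovered)
        System.out.println(stacks.active.get("h1"));  // prints profile-1 (re-activated)
    }
}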

Balancer.java

@@ -394,7 +394,7 @@ public class Balancer {
// at this point we must check if the crawlEntry has relevance because the crawl profile still exists
// if not: return null. A calling method must handle the null value and try again
- profileEntry = cs.getActive(UTF8.getBytes(crawlEntry.profileHandle()));
+ profileEntry = cs.get(UTF8.getBytes(crawlEntry.profileHandle()));
if (profileEntry == null) {
ConcurrentLog.warn("Balancer", "no profile entry for handle " + crawlEntry.profileHandle());
continue;
@@ -481,7 +481,7 @@ public class Balancer {
rowEntry = this.urlFileIndex.get(urlhash, false);
if (rowEntry == null) continue; // may have been deleted there manwhile
Request crawlEntry = new Request(rowEntry);
- CrawlProfile profileEntry = cs.getActive(UTF8.getBytes(crawlEntry.profileHandle()));
+ CrawlProfile profileEntry = cs.get(UTF8.getBytes(crawlEntry.profileHandle()));
if (profileEntry == null) {
ConcurrentLog.warn("Balancer", "no profile entry for handle " + crawlEntry.profileHandle());
continue;

CrawlStacker.java

@@ -149,7 +149,7 @@ public final class CrawlStacker {
// if the url was rejected we store it into the error URL db
if (rejectReason != null && !rejectReason.startsWith("double in")) {
- final CrawlProfile profile = this.crawler.getActive(UTF8.getBytes(entry.profileHandle()));
+ final CrawlProfile profile = this.crawler.get(UTF8.getBytes(entry.profileHandle()));
this.nextQueue.errorURL.push(entry.url(), profile, FailCategory.FINAL_LOAD_CONTEXT, rejectReason, -1);
}
} catch (final Exception e) {
@@ -294,7 +294,8 @@ public final class CrawlStacker {
public String stackCrawl(final Request entry) {
//this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'");
- final CrawlProfile profile = this.crawler.getActive(UTF8.getBytes(entry.profileHandle()));
+ byte[] handle = UTF8.getBytes(entry.profileHandle());
+ final CrawlProfile profile = this.crawler.get(handle);
String error;
if (profile == null) {
error = "LOST STACKER PROFILE HANDLE '" + entry.profileHandle() + "' for URL " + entry.url();

CrawlSwitchboard.java

@@ -166,6 +166,23 @@ public final class CrawlSwitchboard {
/ 1024);
}
+ /**
+  * Get a profile from active or passive stack. Should be used to be sure not to miss old, cleaned profiles.
+  * A profile that was discovered from the passive stack is automatically shifted back to the active stack.
+  * @param profileKey
+  * @return
+  */
+ public CrawlProfile get(final byte[] profileKey) {
+     CrawlProfile profile = getActive(profileKey);
+     if (profile != null) return profile;
+     profile = getPassive(profileKey);
+     if (profile == null) return null;
+     // clean up
+     this.putActive(profileKey, profile);
+     this.removePassive(profileKey);
+     return profile;
+ }
public CrawlProfile getActive(final byte[] profileKey) {
if ( profileKey == null ) {
return null;

CrawlProfile.java

@@ -148,7 +148,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
}
if (name.length() > 256) name = name.substring(256);
this.doms = new ConcurrentHashMap<String, AtomicInteger>();
- final String handle = Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(name + crawlerUrlMustMatch + depth + crawlerUrlMustNotMatch + domMaxPages)).substring(0, Word.commonHashLength);
+ final String handle = Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(name + crawlerUrlMustMatch + depth + crawlerUrlMustNotMatch + domMaxPages + collections)).substring(0, Word.commonHashLength);
put(HANDLE, handle);
put(NAME, name);
put(AGENT_NAME, userAgentName);

CrawlQueues.java

@@ -255,7 +255,7 @@ public class CrawlQueues {
this.log.severe(stats + ": NULL PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url());
return true;
}
- final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(profileHandle));
+ final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(profileHandle));
if (profile == null) {
this.log.severe(stats + ": NULL PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url());
return true;
@@ -297,7 +297,7 @@ public class CrawlQueues {
* @return
*/
private void load(final Request urlEntry, final String stats, final String profileHandle) {
- final CrawlProfile profile = this.sb.crawler.getActive(UTF8.getBytes(profileHandle));
+ final CrawlProfile profile = this.sb.crawler.get(UTF8.getBytes(profileHandle));
if (profile != null) {
// check if the protocol is supported
@@ -606,7 +606,7 @@ public class CrawlQueues {
this.request.setStatus("worker-initialized", WorkflowJob.STATUS_INITIATED);
this.code = Integer.valueOf(entry.hashCode());
this.setPriority(Thread.MIN_PRIORITY); // http requests from the crawler should not cause that other functions work worse
- this.profile = CrawlQueues.this.sb.crawler.getActive(UTF8.getBytes(this.request.profileHandle()));
+ this.profile = CrawlQueues.this.sb.crawler.get(UTF8.getBytes(this.request.profileHandle()));
}
private long age() {

FTPLoader.java

@@ -101,7 +101,7 @@ public class FTPLoader {
// create new ftp client
final FTPClient ftpClient = new FTPClient();
- final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
+ final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
// get a connection
if (openConnection(ftpClient, entryUrl)) {
// test if the specified file is a directory
@@ -249,7 +249,7 @@ public class FTPLoader {
// create response with metadata only
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
- final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
+ final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
final Response response = new Response(
request,
requestHeader,
@@ -264,7 +264,7 @@ public class FTPLoader {
final byte[] b = ftpClient.get(path);
// create a response
- final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
+ final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
final Response response = new Response(
request,
requestHeader,

FileLoader.java

@@ -83,7 +83,7 @@ public class FileLoader {
ResponseHeader responseHeader = new ResponseHeader(200);
responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date()));
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
- final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
+ final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
Response response = new Response(
request,
requestHeader,
@@ -123,7 +123,7 @@ public class FileLoader {
// create response with metadata only
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
- final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
+ final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
Response response = new Response(
request,
requestHeader,
@@ -140,7 +140,7 @@ public class FileLoader {
is.close();
// create response with loaded content
- final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
+ final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
Response response = new Response(
request,
requestHeader,

SMBLoader.java

@@ -101,7 +101,7 @@ public class SMBLoader {
ResponseHeader responseHeader = new ResponseHeader(200);
responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date()));
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
- final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
+ final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
Response response = new Response(
request,
requestHeader,
@@ -141,7 +141,7 @@ public class SMBLoader {
// create response with metadata only
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
- final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes());
+ final CrawlProfile profile = this.sb.crawler.get(request.profileHandle().getBytes());
Response response = new Response(
request,
requestHeader,
@@ -158,7 +158,7 @@ public class SMBLoader {
is.close();
// create response with loaded content
- final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes());
+ final CrawlProfile profile = this.sb.crawler.get(request.profileHandle().getBytes());
Response response = new Response(
request,
requestHeader,

LoaderDispatcher.java

@@ -187,7 +187,7 @@ public final class LoaderDispatcher {
if (url.isFile() || url.isSMB()) cacheStrategy = CacheStrategy.NOCACHE; // load just from the file system
final String protocol = url.getProtocol();
final String host = url.getHost();
- final CrawlProfile crawlProfile = request.profileHandle() == null ? null : this.sb.crawler.getActive(UTF8.getBytes(request.profileHandle()));
+ final CrawlProfile crawlProfile = request.profileHandle() == null ? null : this.sb.crawler.get(UTF8.getBytes(request.profileHandle()));
// check if url is in blacklist
if (blacklistType != null && host != null && Switchboard.urlBlacklist.isListed(blacklistType, host.toLowerCase(), url.getFile())) {

Switchboard.java

@@ -2514,7 +2514,7 @@ public final class Switchboard extends serverSwitch {
} else {
// we consider this as fail urls to have a tracking of the problem
if (rejectReason != null && !rejectReason.startsWith("double in")) {
- final CrawlProfile profile = this.crawler.getActive(UTF8.getBytes(response.profile().handle()));
+ final CrawlProfile profile = this.crawler.get(UTF8.getBytes(response.profile().handle()));
this.crawlStacker.nextQueue.errorURL.push(response.url(), profile, FailCategory.FINAL_LOAD_CONTEXT, rejectReason, -1);
}
}
@@ -3002,7 +3002,7 @@ public final class Switchboard extends serverSwitch {
continue;
}
final Request request = this.loader.request(e.getValue(), true, true);
- final CrawlProfile profile = this.crawler.getActive(ASCII.getBytes(request.profileHandle()));
+ final CrawlProfile profile = this.crawler.get(ASCII.getBytes(request.profileHandle()));
final String acceptedError = this.crawlStacker.checkAcceptance(e.getValue(), profile, 0);
if (acceptedError != null) {
this.log.warn("addToIndex: cannot load " + urlName + ": " + acceptedError);
@@ -3032,7 +3032,7 @@ public final class Switchboard extends serverSwitch {
final Document[] documents = response.parse();
if (documents != null) {
for (final Document document: documents) {
- final CrawlProfile profile = crawler.getActive(ASCII.getBytes(request.profileHandle()));
+ final CrawlProfile profile = crawler.get(ASCII.getBytes(request.profileHandle()));
if (document.indexingDenied() && (profile == null || profile.obeyHtmlRobotsNoindex())) {
throw new Parser.Failure("indexing is denied", url);
}
@@ -3075,7 +3075,7 @@ public final class Switchboard extends serverSwitch {
if (existingids.contains(e.getKey())) continue; // double
DigestURL url = e.getValue();
final Request request = this.loader.request(url, true, true);
- final CrawlProfile profile = this.crawler.getActive(ASCII.getBytes(request.profileHandle()));
+ final CrawlProfile profile = this.crawler.get(ASCII.getBytes(request.profileHandle()));
final String acceptedError = this.crawlStacker.checkAcceptance(url, profile, 0);
if (acceptedError != null) {
this.log.info("addToCrawler: cannot load " + url.toNormalform(true) + ": " + acceptedError);