fix for wrong status codes of error pages

Michael Peter Christen 2014-04-10 09:08:59 +02:00
parent 9e503b3376
commit 6bd8c6f195
5 changed files with 38 additions and 24 deletions


@@ -224,6 +224,7 @@ public class Crawler_p {
             sb.robots.delete(ru);
             try {Cache.delete(RobotsTxt.robotsURL(RobotsTxt.getHostPort(ru)).hash());} catch (IOException e) {}
         }
+        try {sb.robots.clear();} catch (IOException e) {} // to be safe: clear all.
         // set the crawl filter
         String ipMustMatch = post.get("ipMustmatch", CrawlProfile.MATCH_ALL_STRING);


@@ -639,7 +639,7 @@ public class CrawlQueues {
             } else {
                 // starting a load from the internet
                 request.setStatus("worker-loading", WorkflowJob.STATUS_RUNNING);
-                String result = null;
+                String error = null;
                 // load a resource and push queue entry to switchboard queue
                 // returns null if everything went fine, a fail reason string if a problem occurred
@@ -651,23 +651,29 @@ public class CrawlQueues {
                     if (CrawlQueues.log.isFine()) {
                         CrawlQueues.log.fine("problem loading " + request.url().toString() + ": no content (possibly caused by cache policy)");
                     }
-                    result = "no content (possibly caused by cache policy)";
+                    error = "no content (possibly caused by cache policy)";
                 } else {
                     request.setStatus("loaded", WorkflowJob.STATUS_RUNNING);
                     final String storedFailMessage = CrawlQueues.this.sb.toIndexer(response);
                     request.setStatus("enqueued-" + ((storedFailMessage == null) ? "ok" : "fail"), WorkflowJob.STATUS_FINISHED);
-                    result = (storedFailMessage == null) ? null : "not enqueued to indexer: " + storedFailMessage;
+                    error = (storedFailMessage == null) ? null : "not enqueued to indexer: " + storedFailMessage;
                 }
             } catch (final IOException e) {
                 request.setStatus("error", WorkflowJob.STATUS_FINISHED);
                 if (CrawlQueues.log.isFine()) {
                     CrawlQueues.log.fine("problem loading " + request.url().toString() + ": " + e.getMessage());
                 }
-                result = "load error - " + e.getMessage();
+                error = "load error - " + e.getMessage();
             }
-            if (result != null) {
-                CrawlQueues.this.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "cannot load: " + result, -1);
+            if (error != null) {
+                if (error.endsWith("$")) {
+                    // the "$" mark at the end of the error message means, that the error was already pushed to the error-db by the reporting method
+                    // thus we only push this message if we don't have that mark
+                    error = error.substring(0, error.length() - 1).trim();
+                } else {
+                    CrawlQueues.this.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "cannot load: " + error, -1);
+                }
                 request.setStatus("worker-error", WorkflowJob.STATUS_FINISHED);
             } else {
                 request.setStatus("worker-processed", WorkflowJob.STATUS_FINISHED);


@@ -78,7 +78,7 @@ public final class HTTPLoader {
         if (retryCount < 0) {
             this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "retry counter exceeded", -1);
-            throw new IOException("retry counter exceeded for URL " + request.url().toString() + ". Processing aborted.");
+            throw new IOException("retry counter exceeded for URL " + request.url().toString() + ". Processing aborted.$");
         }
         DigestURL url = request.url();
@@ -94,7 +94,7 @@ public final class HTTPLoader {
         final String hostlow = host.toLowerCase();
         if (blacklistType != null && Switchboard.urlBlacklist.isListed(blacklistType, hostlow, path)) {
             this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
-            throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.");
+            throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.$");
         }
         // resolve yacy and yacyh domains
@@ -141,7 +141,7 @@ public final class HTTPLoader {
             if (redirectionUrlString.isEmpty()) {
                 this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "no redirection url provided, field '" + HeaderFramework.LOCATION + "' is empty", statusCode);
-                throw new IOException("REJECTED EMTPY REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString);
+                throw new IOException("REJECTED EMTPY REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL '" + requestURLString + "'$");
             }
             // normalize URL
@@ -161,7 +161,7 @@ public final class HTTPLoader {
                 // if we are already doing a shutdown we don't need to retry crawling
                 if (Thread.currentThread().isInterrupted()) {
                     this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.FINAL_LOAD_CONTEXT, "server shutdown", statusCode);
-                    throw new IOException("CRAWLER Retry of URL=" + requestURLString + " aborted because of server shutdown.");
+                    throw new IOException("CRAWLER Retry of URL=" + requestURLString + " aborted because of server shutdown.$");
                 }
                 // retry crawling with new url
@@ -170,11 +170,11 @@ public final class HTTPLoader {
                 }
                 // we don't want to follow redirects
                 this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode);
-                throw new IOException("REJECTED UNWANTED REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString);
+                throw new IOException("REJECTED UNWANTED REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL '" + requestURLString + "'$");
             } else if (responseBody == null) {
                 // no response, reject file
                 this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "no response body", statusCode);
-                throw new IOException("REJECTED EMPTY RESPONSE BODY '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString);
+                throw new IOException("REJECTED EMPTY RESPONSE BODY '" + client.getHttpResponse().getStatusLine() + "' for URL '" + requestURLString + "'$");
             } else if (statusCode == 200 || statusCode == 203) {
                 // the transfer is ok
@@ -185,7 +185,7 @@ public final class HTTPLoader {
                 // check length again in case it was not possible to get the length before loading
                 if (maxFileSize >= 0 && contentLength > maxFileSize) {
                     this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "file size limit exceeded", statusCode);
-                    throw new IOException("REJECTED URL " + request.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes. (GET)");
+                    throw new IOException("REJECTED URL " + request.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes. (GET)$");
                 }
                 // create a new cache entry
@@ -202,7 +202,7 @@ public final class HTTPLoader {
             } else {
                 // if the response has not the right response type then reject file
                 this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", statusCode);
-                throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString);
+                throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL '" + requestURLString + "'$");
             }
         }
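
Each rejection in this file now follows the same two-step pattern: push the failure to errorURL with the real HTTP status code, then throw an IOException whose message ends in "$" so that CrawlQueues will not push a duplicate entry with status -1. A hypothetical helper (not part of YaCy; ErrorSink and reportAndWrap are invented for illustration, and the real push signature takes the request, profile and a FailCategory) that centralizes this pattern might look like this:

import java.io.IOException;

final class ErrorReporting {
    // minimal stand-in for the errorURL sink
    interface ErrorSink { void push(String url, String reason, int httpStatusCode); }

    // push the authoritative report first, then build the marked exception
    static IOException reportAndWrap(ErrorSink sink, String url, String reason, int statusCode) {
        sink.push(url, reason, statusCode);                         // carries the real status code
        return new IOException(reason + " for URL '" + url + "'$"); // "$" = already reported
    }

    public static void main(String[] args) throws IOException {
        ErrorSink sink = (u, r, c) -> System.out.println("error-db: " + u + " -> " + r + " (" + c + ")");
        throw reportAndWrap(sink, "http://example.com/", "wrong http status code", 404);
    }
}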


@@ -163,12 +163,13 @@ public final class LoaderDispatcher {
             check = this.loaderSteering.remove(request.url());
             if (check != null) check.release(1000);
             return response;
-        } catch (final IOException e) {
+        } catch (final IOException e) {
+            throw new IOException(e);
+        } finally {
             // release the semaphore anyway
             check = this.loaderSteering.remove(request.url());
-            if (check != null) check.release(1000);
-            // Very noisy: ConcurrentLog.logException(e);
-            throw new IOException(e);
+            if (check != null) check.release(1000);
+            // Very noisy: ConcurrentLog.logException(e);
         }
     }
@@ -190,7 +191,7 @@ public final class LoaderDispatcher {
         // check if url is in blacklist
         if (blacklistType != null && host != null && Switchboard.urlBlacklist.isListed(blacklistType, host.toLowerCase(), url.getFile())) {
             this.sb.crawlQueues.errorURL.push(request.url(), crawlProfile, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
-            throw new IOException("DISPATCHER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.");
+            throw new IOException("DISPATCHER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.$");
         }
         // check if we have the page in the cache
@@ -244,13 +245,13 @@ public final class LoaderDispatcher {
             }
         }
-        // check case where we want results from the cache exclusively, and never from the internet (offline mode)
+        // check case where we want results from the cache exclusively, and never from the Internet (offline mode)
         if (cacheStrategy == CacheStrategy.CACHEONLY) {
             // we had a chance to get the content from the cache .. its over. We don't have it.
             throw new IOException("cache only strategy");
         }
-        // now forget about the cache, nothing there. Try to load the content from the internet
+        // now forget about the cache, nothing there. Try to load the content from the Internet
         // check access time: this is a double-check (we checked possibly already in the balancer)
         // to make sure that we don't DoS the target by mistake
@@ -302,7 +303,7 @@ public final class LoaderDispatcher {
             // no caching wanted. Thats ok, do not write any message
             return response;
         }
-        // second check tells us if the protocoll tells us something about caching
+        // second check tells us if the protocol tells us something about caching
         final String storeError = response.shallStoreCacheForCrawler();
         if (storeError == null) {
             try {
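
The first hunk in this file moves the semaphore release from the catch block into a finally block, so the permit is returned on every exit path: the normal return, the rethrown IOException, and any unchecked exception that the old code would have leaked past the release. A sketch of the pattern, with invented names (steering, loadInternal) in place of the actual YaCy fields:

import java.io.IOException;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.Semaphore;

public class SteeringSketch {
    private final ConcurrentMap<String, Semaphore> steering = new ConcurrentHashMap<>();

    public String load(String url) throws IOException {
        steering.put(url, new Semaphore(0)); // other threads wait on this while we load
        try {
            return loadInternal(url);        // may throw
        } catch (final IOException e) {
            throw new IOException(e);
        } finally {
            // release the semaphore anyway, whatever happened above
            Semaphore check = steering.remove(url);
            if (check != null) check.release(1000);
        }
    }

    private String loadInternal(String url) throws IOException {
        if (url.isEmpty()) throw new IOException("empty url");
        return "content of " + url;
    }

    public static void main(String[] args) throws IOException {
        System.out.println(new SteeringSketch().load("http://example.com/"));
    }
}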


@@ -114,8 +114,14 @@ public class ErrorCache {
         if (this.fulltext.getDefaultConnector() != null && failCategory.store) {
             // send the error to solr
             try {
-                SolrInputDocument errorDoc = failDoc.toSolr(this.fulltext.getDefaultConfiguration());
-                this.fulltext.getDefaultConnector().add(errorDoc);
+                // do not overwrite error reports with error reports
+                SolrDocument olddoc = this.fulltext.getDefaultConnector().getDocumentById(ASCII.String(failDoc.getDigestURL().hash()), CollectionSchema.httpstatus_i.getSolrFieldName());
+                if (olddoc == null ||
+                    olddoc.getFieldValue(CollectionSchema.httpstatus_i.getSolrFieldName()) == null ||
+                    ((Integer) olddoc.getFieldValue(CollectionSchema.httpstatus_i.getSolrFieldName())) == 200) {
+                    SolrInputDocument errorDoc = failDoc.toSolr(this.fulltext.getDefaultConfiguration());
+                    this.fulltext.getDefaultConnector().add(errorDoc);
+                }
             } catch (final IOException e) {
                 ConcurrentLog.warn("SOLR", "failed to send error " + url.toNormalform(true) + " to solr: " + e.getMessage());
             }
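
The guard above keeps the first real error status for a URL: a new error document is only written when Solr has no document for the URL hash yet, the stored document carries no httpstatus_i value, or the stored status is a plain 200. A sketch of the same decision with a plain map standing in for the Solr connector (INDEX and pushError are invented names; the real code reads CollectionSchema.httpstatus_i via getDocumentById):

import java.util.HashMap;
import java.util.Map;

public class ErrorCacheSketch {
    // urlHash -> httpstatus_i of the stored document (null value = field missing)
    static final Map<String, Integer> INDEX = new HashMap<>();

    static void pushError(String urlHash, int failStatus) {
        Integer oldStatus = INDEX.get(urlHash);
        // write only if there is no document yet, the old document has no status,
        // or the old status was a plain 200; never overwrite an existing error report
        if (!INDEX.containsKey(urlHash) || oldStatus == null || oldStatus == 200) {
            INDEX.put(urlHash, failStatus);
        }
    }

    public static void main(String[] args) {
        pushError("abc", 404);  // stored: no previous document for this hash
        pushError("abc", 503);  // ignored: a 404 error report is already present
        System.out.println(INDEX.get("abc")); // prints 404
    }
}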