fix for wrong status codes of error pages

Michael Peter Christen 2014-04-10 09:08:59 +02:00
parent 9e503b3376
commit 6bd8c6f195
5 changed files with 38 additions and 24 deletions


@@ -224,6 +224,7 @@ public class Crawler_p {
             sb.robots.delete(ru);
             try {Cache.delete(RobotsTxt.robotsURL(RobotsTxt.getHostPort(ru)).hash());} catch (IOException e) {}
         }
+        try {sb.robots.clear();} catch (IOException e) {} // to be safe: clear all.
         // set the crawl filter
         String ipMustMatch = post.get("ipMustmatch", CrawlProfile.MATCH_ALL_STRING);


@@ -639,7 +639,7 @@ public class CrawlQueues {
             } else {
                 // starting a load from the internet
                 request.setStatus("worker-loading", WorkflowJob.STATUS_RUNNING);
-                String result = null;
+                String error = null;
                 // load a resource and push queue entry to switchboard queue
                 // returns null if everything went fine, a fail reason string if a problem occurred
@@ -651,23 +651,29 @@ public class CrawlQueues {
                     if (CrawlQueues.log.isFine()) {
                         CrawlQueues.log.fine("problem loading " + request.url().toString() + ": no content (possibly caused by cache policy)");
                     }
-                    result = "no content (possibly caused by cache policy)";
+                    error = "no content (possibly caused by cache policy)";
                 } else {
                     request.setStatus("loaded", WorkflowJob.STATUS_RUNNING);
                     final String storedFailMessage = CrawlQueues.this.sb.toIndexer(response);
                     request.setStatus("enqueued-" + ((storedFailMessage == null) ? "ok" : "fail"), WorkflowJob.STATUS_FINISHED);
-                    result = (storedFailMessage == null) ? null : "not enqueued to indexer: " + storedFailMessage;
+                    error = (storedFailMessage == null) ? null : "not enqueued to indexer: " + storedFailMessage;
                 }
             } catch (final IOException e) {
                 request.setStatus("error", WorkflowJob.STATUS_FINISHED);
                 if (CrawlQueues.log.isFine()) {
                     CrawlQueues.log.fine("problem loading " + request.url().toString() + ": " + e.getMessage());
                 }
-                result = "load error - " + e.getMessage();
+                error = "load error - " + e.getMessage();
             }
-            if (result != null) {
-                CrawlQueues.this.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "cannot load: " + result, -1);
+            if (error != null) {
+                if (error.endsWith("$")) {
+                    // the "$" mark at the end of the error message means, that the error was already pushed to the error-db by the reporting method
+                    // thus we only push this message if we don't have that mark
+                    error = error.substring(0, error.length() - 1).trim();
+                } else {
+                    CrawlQueues.this.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "cannot load: " + error, -1);
+                }
                 request.setStatus("worker-error", WorkflowJob.STATUS_FINISHED);
             } else {
                 request.setStatus("worker-processed", WorkflowJob.STATUS_FINISHED);


@@ -78,7 +78,7 @@ public final class HTTPLoader {
         if (retryCount < 0) {
             this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "retry counter exceeded", -1);
-            throw new IOException("retry counter exceeded for URL " + request.url().toString() + ". Processing aborted.");
+            throw new IOException("retry counter exceeded for URL " + request.url().toString() + ". Processing aborted.$");
         }
         DigestURL url = request.url();
@@ -94,7 +94,7 @@ public final class HTTPLoader {
         final String hostlow = host.toLowerCase();
         if (blacklistType != null && Switchboard.urlBlacklist.isListed(blacklistType, hostlow, path)) {
             this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
-            throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.");
+            throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.$");
         }
         // resolve yacy and yacyh domains
@@ -141,7 +141,7 @@ public final class HTTPLoader {
             if (redirectionUrlString.isEmpty()) {
                 this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "no redirection url provided, field '" + HeaderFramework.LOCATION + "' is empty", statusCode);
-                throw new IOException("REJECTED EMTPY REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString);
+                throw new IOException("REJECTED EMTPY REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL '" + requestURLString + "'$");
             }
             // normalize URL
@@ -161,7 +161,7 @@ public final class HTTPLoader {
                 // if we are already doing a shutdown we don't need to retry crawling
                 if (Thread.currentThread().isInterrupted()) {
                     this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.FINAL_LOAD_CONTEXT, "server shutdown", statusCode);
-                    throw new IOException("CRAWLER Retry of URL=" + requestURLString + " aborted because of server shutdown.");
+                    throw new IOException("CRAWLER Retry of URL=" + requestURLString + " aborted because of server shutdown.$");
                 }
                 // retry crawling with new url
@@ -170,11 +170,11 @@ public final class HTTPLoader {
                 }
                 // we don't want to follow redirects
                 this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode);
-                throw new IOException("REJECTED UNWANTED REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString);
+                throw new IOException("REJECTED UNWANTED REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL '" + requestURLString + "'$");
             } else if (responseBody == null) {
                 // no response, reject file
                 this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "no response body", statusCode);
-                throw new IOException("REJECTED EMPTY RESPONSE BODY '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString);
+                throw new IOException("REJECTED EMPTY RESPONSE BODY '" + client.getHttpResponse().getStatusLine() + "' for URL '" + requestURLString + "'$");
             } else if (statusCode == 200 || statusCode == 203) {
                 // the transfer is ok
@@ -185,7 +185,7 @@ public final class HTTPLoader {
                 // check length again in case it was not possible to get the length before loading
                 if (maxFileSize >= 0 && contentLength > maxFileSize) {
                     this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "file size limit exceeded", statusCode);
-                    throw new IOException("REJECTED URL " + request.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes. (GET)");
+                    throw new IOException("REJECTED URL " + request.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes. (GET)$");
                 }
                 // create a new cache entry
@@ -202,7 +202,7 @@ public final class HTTPLoader {
             } else {
                 // if the response has not the right response type then reject file
                 this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", statusCode);
-                throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString);
+                throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL '" + requestURLString + "'$");
             }
         }
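
Each rejection in this file now follows the same two-step pattern: push the failure to errorURL with the real HTTP status code, then throw an IOException whose message ends in "$" so that CrawlQueues will not push a duplicate entry with status -1. A hypothetical helper (not part of YaCy; ErrorSink and reportAndWrap are invented for illustration, and the real push signature takes the request, profile and a FailCategory) that centralizes this pattern might look like this:

import java.io.IOException;

final class ErrorReporting {
    // minimal stand-in for the errorURL sink
    interface ErrorSink { void push(String url, String reason, int httpStatusCode); }

    // push the authoritative report first, then build the marked exception
    static IOException reportAndWrap(ErrorSink sink, String url, String reason, int statusCode) {
        sink.push(url, reason, statusCode);                         // carries the real status code
        return new IOException(reason + " for URL '" + url + "'$"); // "$" = already reported
    }

    public static void main(String[] args) throws IOException {
        ErrorSink sink = (u, r, c) -> System.out.println("error-db: " + u + " -> " + r + " (" + c + ")");
        throw reportAndWrap(sink, "http://example.com/", "wrong http status code", 404);
    }
}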


@@ -163,12 +163,13 @@ public final class LoaderDispatcher {
             check = this.loaderSteering.remove(request.url());
             if (check != null) check.release(1000);
             return response;
-        } catch (final IOException e) {
+        } catch (final IOException e) {
+            throw new IOException(e);
+        } finally {
             // release the semaphore anyway
             check = this.loaderSteering.remove(request.url());
-            if (check != null) check.release(1000);
-            // Very noisy: ConcurrentLog.logException(e);
-            throw new IOException(e);
+            if (check != null) check.release(1000);
+            // Very noisy: ConcurrentLog.logException(e);
         }
     }
@@ -190,7 +191,7 @@ public final class LoaderDispatcher {
         // check if url is in blacklist
         if (blacklistType != null && host != null && Switchboard.urlBlacklist.isListed(blacklistType, host.toLowerCase(), url.getFile())) {
             this.sb.crawlQueues.errorURL.push(request.url(), crawlProfile, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
-            throw new IOException("DISPATCHER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.");
+            throw new IOException("DISPATCHER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.$");
         }
         // check if we have the page in the cache
@@ -244,13 +245,13 @@ public final class LoaderDispatcher {
             }
         }
-        // check case where we want results from the cache exclusively, and never from the internet (offline mode)
+        // check case where we want results from the cache exclusively, and never from the Internet (offline mode)
         if (cacheStrategy == CacheStrategy.CACHEONLY) {
             // we had a chance to get the content from the cache .. its over. We don't have it.
             throw new IOException("cache only strategy");
         }
-        // now forget about the cache, nothing there. Try to load the content from the internet
+        // now forget about the cache, nothing there. Try to load the content from the Internet
         // check access time: this is a double-check (we checked possibly already in the balancer)
         // to make sure that we don't DoS the target by mistake
@@ -302,7 +303,7 @@ public final class LoaderDispatcher {
             // no caching wanted. Thats ok, do not write any message
             return response;
         }
-        // second check tells us if the protocoll tells us something about caching
+        // second check tells us if the protocol tells us something about caching
         final String storeError = response.shallStoreCacheForCrawler();
         if (storeError == null) {
             try {
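
The first hunk in this file moves the semaphore release from the catch block into a finally block, so the permit is returned on every exit path: the normal return, the rethrown IOException, and any unchecked exception that the old code would have leaked past the release. A sketch of the pattern, with invented names (steering, loadInternal) in place of the actual YaCy fields:

import java.io.IOException;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.Semaphore;

public class SteeringSketch {
    private final ConcurrentMap<String, Semaphore> steering = new ConcurrentHashMap<>();

    public String load(String url) throws IOException {
        steering.put(url, new Semaphore(0)); // other threads wait on this while we load
        try {
            return loadInternal(url);        // may throw
        } catch (final IOException e) {
            throw new IOException(e);
        } finally {
            // release the semaphore anyway, whatever happened above
            Semaphore check = steering.remove(url);
            if (check != null) check.release(1000);
        }
    }

    private String loadInternal(String url) throws IOException {
        if (url.isEmpty()) throw new IOException("empty url");
        return "content of " + url;
    }

    public static void main(String[] args) throws IOException {
        System.out.println(new SteeringSketch().load("http://example.com/"));
    }
}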


@@ -114,8 +114,14 @@ public class ErrorCache {
         if (this.fulltext.getDefaultConnector() != null && failCategory.store) {
             // send the error to solr
             try {
-                SolrInputDocument errorDoc = failDoc.toSolr(this.fulltext.getDefaultConfiguration());
-                this.fulltext.getDefaultConnector().add(errorDoc);
+                // do not overwrite error reports with error reports
+                SolrDocument olddoc = this.fulltext.getDefaultConnector().getDocumentById(ASCII.String(failDoc.getDigestURL().hash()), CollectionSchema.httpstatus_i.getSolrFieldName());
+                if (olddoc == null ||
+                    olddoc.getFieldValue(CollectionSchema.httpstatus_i.getSolrFieldName()) == null ||
+                    ((Integer) olddoc.getFieldValue(CollectionSchema.httpstatus_i.getSolrFieldName())) == 200) {
+                    SolrInputDocument errorDoc = failDoc.toSolr(this.fulltext.getDefaultConfiguration());
+                    this.fulltext.getDefaultConnector().add(errorDoc);
+                }
             } catch (final IOException e) {
                 ConcurrentLog.warn("SOLR", "failed to send error " + url.toNormalform(true) + " to solr: " + e.getMessage());
             }
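
The guard above keeps the first real error status for a URL: a new error document is only written when Solr has no document for the URL hash yet, the stored document carries no httpstatus_i value, or the stored status is a plain 200. A sketch of the same decision with a plain map standing in for the Solr connector (INDEX and pushError are invented names; the real code reads CollectionSchema.httpstatus_i via getDocumentById):

import java.util.HashMap;
import java.util.Map;

public class ErrorCacheSketch {
    // urlHash -> httpstatus_i of the stored document (null value = field missing)
    static final Map<String, Integer> INDEX = new HashMap<>();

    static void pushError(String urlHash, int failStatus) {
        Integer oldStatus = INDEX.get(urlHash);
        // write only if there is no document yet, the old document has no status,
        // or the old status was a plain 200; never overwrite an existing error report
        if (!INDEX.containsKey(urlHash) || oldStatus == null || oldStatus == 200) {
            INDEX.put(urlHash, failStatus);
        }
    }

    public static void main(String[] args) {
        pushError("abc", 404);  // stored: no previous document for this hash
        pushError("abc", 503);  // ignored: a 404 error report is already present
        System.out.println(INDEX.get("abc")); // prints 404
    }
}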