mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
fix for wrong status codes of error pages
This commit is contained in:
parent
9e503b3376
commit
6bd8c6f195
|
@ -224,6 +224,7 @@ public class Crawler_p {
|
|||
sb.robots.delete(ru);
|
||||
try {Cache.delete(RobotsTxt.robotsURL(RobotsTxt.getHostPort(ru)).hash());} catch (IOException e) {}
|
||||
}
|
||||
try {sb.robots.clear();} catch (IOException e) {} // to be safe: clear all.
|
||||
|
||||
// set the crawl filter
|
||||
String ipMustMatch = post.get("ipMustmatch", CrawlProfile.MATCH_ALL_STRING);
|
||||
|
|
|
@ -639,7 +639,7 @@ public class CrawlQueues {
|
|||
} else {
|
||||
// starting a load from the internet
|
||||
request.setStatus("worker-loading", WorkflowJob.STATUS_RUNNING);
|
||||
String result = null;
|
||||
String error = null;
|
||||
|
||||
// load a resource and push queue entry to switchboard queue
|
||||
// returns null if everything went fine, a fail reason string if a problem occurred
|
||||
|
@ -651,23 +651,29 @@ public class CrawlQueues {
|
|||
if (CrawlQueues.log.isFine()) {
|
||||
CrawlQueues.log.fine("problem loading " + request.url().toString() + ": no content (possibly caused by cache policy)");
|
||||
}
|
||||
result = "no content (possibly caused by cache policy)";
|
||||
error = "no content (possibly caused by cache policy)";
|
||||
} else {
|
||||
request.setStatus("loaded", WorkflowJob.STATUS_RUNNING);
|
||||
final String storedFailMessage = CrawlQueues.this.sb.toIndexer(response);
|
||||
request.setStatus("enqueued-" + ((storedFailMessage == null) ? "ok" : "fail"), WorkflowJob.STATUS_FINISHED);
|
||||
result = (storedFailMessage == null) ? null : "not enqueued to indexer: " + storedFailMessage;
|
||||
error = (storedFailMessage == null) ? null : "not enqueued to indexer: " + storedFailMessage;
|
||||
}
|
||||
} catch (final IOException e) {
|
||||
request.setStatus("error", WorkflowJob.STATUS_FINISHED);
|
||||
if (CrawlQueues.log.isFine()) {
|
||||
CrawlQueues.log.fine("problem loading " + request.url().toString() + ": " + e.getMessage());
|
||||
}
|
||||
result = "load error - " + e.getMessage();
|
||||
error = "load error - " + e.getMessage();
|
||||
}
|
||||
|
||||
if (result != null) {
|
||||
CrawlQueues.this.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "cannot load: " + result, -1);
|
||||
if (error != null) {
|
||||
if (error.endsWith("$")) {
|
||||
// the "$" mark at the end of the error message means that the error was already pushed to the error-db by the reporting method
|
||||
// thus we only push this message if we don't have that mark
|
||||
error = error.substring(0, error.length() - 1).trim();
|
||||
} else {
|
||||
CrawlQueues.this.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "cannot load: " + error, -1);
|
||||
}
|
||||
request.setStatus("worker-error", WorkflowJob.STATUS_FINISHED);
|
||||
} else {
|
||||
request.setStatus("worker-processed", WorkflowJob.STATUS_FINISHED);
|
||||
|
|
|
@ -78,7 +78,7 @@ public final class HTTPLoader {
|
|||
|
||||
if (retryCount < 0) {
|
||||
this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "retry counter exceeded", -1);
|
||||
throw new IOException("retry counter exceeded for URL " + request.url().toString() + ". Processing aborted.");
|
||||
throw new IOException("retry counter exceeded for URL " + request.url().toString() + ". Processing aborted.$");
|
||||
}
|
||||
|
||||
DigestURL url = request.url();
|
||||
|
@ -94,7 +94,7 @@ public final class HTTPLoader {
|
|||
final String hostlow = host.toLowerCase();
|
||||
if (blacklistType != null && Switchboard.urlBlacklist.isListed(blacklistType, hostlow, path)) {
|
||||
this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
|
||||
throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.");
|
||||
throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.$");
|
||||
}
|
||||
|
||||
// resolve yacy and yacyh domains
|
||||
|
@ -141,7 +141,7 @@ public final class HTTPLoader {
|
|||
|
||||
if (redirectionUrlString.isEmpty()) {
|
||||
this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "no redirection url provided, field '" + HeaderFramework.LOCATION + "' is empty", statusCode);
|
||||
throw new IOException("REJECTED EMTPY REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString);
|
||||
throw new IOException("REJECTED EMTPY REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL '" + requestURLString + "'$");
|
||||
}
|
||||
|
||||
// normalize URL
|
||||
|
@ -161,7 +161,7 @@ public final class HTTPLoader {
|
|||
// if we are already doing a shutdown we don't need to retry crawling
|
||||
if (Thread.currentThread().isInterrupted()) {
|
||||
this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.FINAL_LOAD_CONTEXT, "server shutdown", statusCode);
|
||||
throw new IOException("CRAWLER Retry of URL=" + requestURLString + " aborted because of server shutdown.");
|
||||
throw new IOException("CRAWLER Retry of URL=" + requestURLString + " aborted because of server shutdown.$");
|
||||
}
|
||||
|
||||
// retry crawling with new url
|
||||
|
@ -170,11 +170,11 @@ public final class HTTPLoader {
|
|||
}
|
||||
// we don't want to follow redirects
|
||||
this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode);
|
||||
throw new IOException("REJECTED UNWANTED REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString);
|
||||
throw new IOException("REJECTED UNWANTED REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL '" + requestURLString + "'$");
|
||||
} else if (responseBody == null) {
|
||||
// no response, reject file
|
||||
this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "no response body", statusCode);
|
||||
throw new IOException("REJECTED EMPTY RESPONSE BODY '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString);
|
||||
throw new IOException("REJECTED EMPTY RESPONSE BODY '" + client.getHttpResponse().getStatusLine() + "' for URL '" + requestURLString + "'$");
|
||||
} else if (statusCode == 200 || statusCode == 203) {
|
||||
// the transfer is ok
|
||||
|
||||
|
@ -185,7 +185,7 @@ public final class HTTPLoader {
|
|||
// check length again in case it was not possible to get the length before loading
|
||||
if (maxFileSize >= 0 && contentLength > maxFileSize) {
|
||||
this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "file size limit exceeded", statusCode);
|
||||
throw new IOException("REJECTED URL " + request.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes. (GET)");
|
||||
throw new IOException("REJECTED URL " + request.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes. (GET)$");
|
||||
}
|
||||
|
||||
// create a new cache entry
|
||||
|
@ -202,7 +202,7 @@ public final class HTTPLoader {
|
|||
} else {
|
||||
// if the response does not have the right status type, reject the file
|
||||
this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", statusCode);
|
||||
throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString);
|
||||
throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL '" + requestURLString + "'$");
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -163,12 +163,13 @@ public final class LoaderDispatcher {
|
|||
check = this.loaderSteering.remove(request.url());
|
||||
if (check != null) check.release(1000);
|
||||
return response;
|
||||
} catch (final IOException e) {
|
||||
} catch (final IOException e) {
|
||||
throw new IOException(e);
|
||||
} finally {
|
||||
// release the semaphore anyway
|
||||
check = this.loaderSteering.remove(request.url());
|
||||
if (check != null) check.release(1000);
|
||||
// Very noisy: ConcurrentLog.logException(e);
|
||||
throw new IOException(e);
|
||||
if (check != null) check.release(1000);
|
||||
// Very noisy: ConcurrentLog.logException(e);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -190,7 +191,7 @@ public final class LoaderDispatcher {
|
|||
// check if url is in blacklist
|
||||
if (blacklistType != null && host != null && Switchboard.urlBlacklist.isListed(blacklistType, host.toLowerCase(), url.getFile())) {
|
||||
this.sb.crawlQueues.errorURL.push(request.url(), crawlProfile, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
|
||||
throw new IOException("DISPATCHER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.");
|
||||
throw new IOException("DISPATCHER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.$");
|
||||
}
|
||||
|
||||
// check if we have the page in the cache
|
||||
|
@ -244,13 +245,13 @@ public final class LoaderDispatcher {
|
|||
}
|
||||
}
|
||||
|
||||
// check case where we want results from the cache exclusively, and never from the internet (offline mode)
|
||||
// check case where we want results from the cache exclusively, and never from the Internet (offline mode)
|
||||
if (cacheStrategy == CacheStrategy.CACHEONLY) {
|
||||
// we had a chance to get the content from the cache .. it's over. We don't have it.
|
||||
throw new IOException("cache only strategy");
|
||||
}
|
||||
|
||||
// now forget about the cache, nothing there. Try to load the content from the internet
|
||||
// now forget about the cache, nothing there. Try to load the content from the Internet
|
||||
|
||||
// check access time: this is a double-check (we checked possibly already in the balancer)
|
||||
// to make sure that we don't DoS the target by mistake
|
||||
|
@ -302,7 +303,7 @@ public final class LoaderDispatcher {
|
|||
// no caching wanted. That's ok, do not write any message
|
||||
return response;
|
||||
}
|
||||
// second check tells us if the protocoll tells us something about caching
|
||||
// second check tells us if the protocol tells us something about caching
|
||||
final String storeError = response.shallStoreCacheForCrawler();
|
||||
if (storeError == null) {
|
||||
try {
|
||||
|
|
|
@ -114,8 +114,14 @@ public class ErrorCache {
|
|||
if (this.fulltext.getDefaultConnector() != null && failCategory.store) {
|
||||
// send the error to solr
|
||||
try {
|
||||
SolrInputDocument errorDoc = failDoc.toSolr(this.fulltext.getDefaultConfiguration());
|
||||
this.fulltext.getDefaultConnector().add(errorDoc);
|
||||
// do not overwrite error reports with error reports
|
||||
SolrDocument olddoc = this.fulltext.getDefaultConnector().getDocumentById(ASCII.String(failDoc.getDigestURL().hash()), CollectionSchema.httpstatus_i.getSolrFieldName());
|
||||
if (olddoc == null ||
|
||||
olddoc.getFieldValue(CollectionSchema.httpstatus_i.getSolrFieldName()) == null ||
|
||||
((Integer) olddoc.getFieldValue(CollectionSchema.httpstatus_i.getSolrFieldName())) == 200) {
|
||||
SolrInputDocument errorDoc = failDoc.toSolr(this.fulltext.getDefaultConfiguration());
|
||||
this.fulltext.getDefaultConnector().add(errorDoc);
|
||||
}
|
||||
} catch (final IOException e) {
|
||||
ConcurrentLog.warn("SOLR", "failed to send error " + url.toNormalform(true) + " to solr: " + e.getMessage());
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue
Block a user