added collection_sxt to error documents

Michael Peter Christen 2013-07-17 15:20:56 +02:00
parent 0df5195cb0
commit 89c0aa0e74
12 changed files with 47 additions and 28 deletions
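
The changes all follow one pattern: every call site that reports a failed or delegated URL now hands the originating CrawlProfile to ZURL.push, which forwards the profile's collection names into the Solr error document (the collection_sxt field). A rough sketch of the new call shape, with sb, request and profile as stand-ins for the real objects at a call site and the failure reason purely illustrative:

// resolve the profile from the request's profile handle, as the changed callers do
final CrawlProfile profile = sb.crawler.getActive(UTF8.getBytes(request.profileHandle()));

// errorURL.push(...) now takes the profile as its second argument; callers that
// have no profile at hand pass null instead
sb.crawlQueues.errorURL.push(
        request,
        profile,
        ASCII.getBytes(sb.peers.mySeed().hash),
        new Date(),
        1,
        FailCategory.FINAL_LOAD_CONTEXT,
        "example fail reason",   // hypothetical reason text
        -1);

Inside ZURL.push the profile is passed on as profile == null ? null : profile.collections() to the extended err(...) method further down.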

@ -444,6 +444,7 @@ public class Crawler_p {
0,
0,
0),
+ null,
sb.peers.mySeed().hash.getBytes(),
new Date(),
1,

@ -163,6 +163,7 @@ public final class crawlReceipt {
sb.crawlQueues.delegatedURL.remove(entry.hash()); // the delegated work is transformed into an error case
sb.crawlQueues.errorURL.push(
entry.toBalancerEntry(iam),
+ null,
youare.getBytes(),
null,
0,

@ -82,6 +82,7 @@ public class urls {
// place url to notice-url db
sb.crawlQueues.delegatedURL.push(
entry,
+ null,
sb.peers.mySeed().hash.getBytes(),
new Date(),
0,

@ -149,7 +149,8 @@ public final class CrawlStacker {
// if the url was rejected we store it into the error URL db
if (rejectReason != null) {
- this.nextQueue.errorURL.push(entry, ASCII.getBytes(this.peers.mySeed().hash), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, rejectReason, -1);
+ final CrawlProfile profile = this.crawler.getActive(UTF8.getBytes(entry.profileHandle()));
+ this.nextQueue.errorURL.push(entry, profile, ASCII.getBytes(this.peers.mySeed().hash), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, rejectReason, -1);
}
} catch (final Exception e) {
CrawlStacker.this.log.warn("Error while processing stackCrawl entry.\n" + "Entry: " + entry.toString() + "Error: " + e.toString(), e);

@ -612,6 +612,7 @@ public class CrawlQueues {
private Request request;
private final Integer code;
private final long start;
+ private final CrawlProfile profile;
private Loader(final Request entry) {
this.start = System.currentTimeMillis();
@ -619,6 +620,7 @@ public class CrawlQueues {
this.request.setStatus("worker-initialized", WorkflowJob.STATUS_INITIATED);
this.code = Integer.valueOf(entry.hashCode());
this.setPriority(Thread.MIN_PRIORITY); // http requests from the crawler should not cause that other functions work worse
+ this.profile = CrawlQueues.this.sb.crawler.getActive(UTF8.getBytes(this.request.profileHandle()));
}
private long age() {
@ -637,6 +639,7 @@ public class CrawlQueues {
//if (log.isFine()) log.logFine("Crawling of URL '" + request.url().toString() + "' disallowed by robots.txt.");
CrawlQueues.this.errorURL.push(
this.request,
+ profile,
ASCII.getBytes(CrawlQueues.this.sb.peers.mySeed().hash),
new Date(),
1,
@ -652,8 +655,7 @@ public class CrawlQueues {
// returns null if everything went fine, a fail reason string if a problem occurred
try {
this.request.setStatus("loading", WorkflowJob.STATUS_RUNNING);
- final CrawlProfile e = CrawlQueues.this.sb.crawler.getActive(UTF8.getBytes(this.request.profileHandle()));
- final Response response = CrawlQueues.this.sb.loader.load(this.request, e == null ? CacheStrategy.IFEXIST : e.cacheStrategy(), BlacklistType.CRAWLER, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
+ final Response response = CrawlQueues.this.sb.loader.load(this.request, profile == null ? CacheStrategy.IFEXIST : profile.cacheStrategy(), BlacklistType.CRAWLER, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
if (response == null) {
this.request.setStatus("error", WorkflowJob.STATUS_FINISHED);
if (CrawlQueues.this.log.isFine()) {
@ -677,6 +679,7 @@ public class CrawlQueues {
if (result != null) {
CrawlQueues.this.errorURL.push(
this.request,
+ profile,
ASCII.getBytes(CrawlQueues.this.sb.peers.mySeed().hash),
new Date(),
1,
@ -690,6 +693,7 @@ public class CrawlQueues {
} catch (final Exception e) {
CrawlQueues.this.errorURL.push(
this.request,
+ profile,
ASCII.getBytes(CrawlQueues.this.sb.peers.mySeed().hash),
new Date(),
1,

@ -172,6 +172,7 @@ public class ZURL implements Iterable<ZURL.Entry> {
public void push(
final Request bentry,
+ final CrawlProfile profile,
final byte[] executor,
final Date workdate,
final int workcount,
@ -190,7 +191,7 @@ public class ZURL implements Iterable<ZURL.Entry> {
if (this.fulltext.getDefaultConnector() != null && failCategory.store) {
// send the error to solr
try {
- SolrInputDocument errorDoc = this.fulltext.getDefaultConfiguration().err(bentry.url(), failCategory.name() + " " + reason, failCategory.failType, httpcode);
+ SolrInputDocument errorDoc = this.fulltext.getDefaultConfiguration().err(bentry.url(), profile == null ? null : profile.collections(), failCategory.name() + " " + reason, failCategory.failType, httpcode);
this.fulltext.getDefaultConnector().add(errorDoc);
} catch (final IOException e) {
ConcurrentLog.warn("SOLR", "failed to send error " + bentry.url().toNormalform(true) + " to solr: " + e.getMessage());

@ -101,6 +101,7 @@ public class FTPLoader {
// create new ftp client
final FTPClient ftpClient = new FTPClient();
+ final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
// get a connection
if (openConnection(ftpClient, entryUrl)) {
// test if the specified file is a directory
@ -130,7 +131,6 @@ public class FTPLoader {
final ResponseHeader responseHeader = new ResponseHeader(200);
responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date()));
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
- final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
response = new Response(
request,
requestHeader,
@ -156,7 +156,7 @@ public class FTPLoader {
if (berr.size() > 0 || response == null) {
// some error logging
final String detail = (berr.size() > 0) ? "Errorlog: " + berr.toString() : "";
- this.sb.crawlQueues.errorURL.push(request, ASCII.getBytes(this.sb.peers.mySeed().hash), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, " ftp server download, " + detail, -1);
+ this.sb.crawlQueues.errorURL.push(request, profile, ASCII.getBytes(this.sb.peers.mySeed().hash), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, " ftp server download, " + detail, -1);
throw new IOException("FTPLoader: Unable to download URL '" + request.url().toString() + "': " + detail);
}

@ -70,20 +70,20 @@ public final class HTTPLoader {
this.socketTimeout = (int) sb.getConfigLong("crawler.clientTimeout", 30000);
}
- public Response load(final Request entry, final int maxFileSize, final BlacklistType blacklistType, int timeout) throws IOException {
+ public Response load(final Request entry, CrawlProfile profile, final int maxFileSize, final BlacklistType blacklistType, int timeout) throws IOException {
Latency.updateBeforeLoad(entry.url());
final long start = System.currentTimeMillis();
- final Response doc = load(entry, DEFAULT_CRAWLING_RETRY_COUNT, maxFileSize, blacklistType, timeout);
+ final Response doc = load(entry, profile, DEFAULT_CRAWLING_RETRY_COUNT, maxFileSize, blacklistType, timeout);
Latency.updateAfterLoad(entry.url(), System.currentTimeMillis() - start);
return doc;
}
- private Response load(final Request request, final int retryCount, final int maxFileSize, final BlacklistType blacklistType, int timeout) throws IOException {
+ private Response load(final Request request, CrawlProfile profile, final int retryCount, final int maxFileSize, final BlacklistType blacklistType, int timeout) throws IOException {
byte[] myHash = ASCII.getBytes(this.sb.peers.mySeed().hash);
if (retryCount < 0) {
- this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "retry counter exceeded", -1);
+ this.sb.crawlQueues.errorURL.push(request, profile, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "retry counter exceeded", -1);
throw new IOException("retry counter exceeded for URL " + request.url().toString() + ". Processing aborted.");
}
@ -99,7 +99,7 @@ public final class HTTPLoader {
// check if url is in blacklist
final String hostlow = host.toLowerCase();
if (blacklistType != null && Switchboard.urlBlacklist.isListed(blacklistType, hostlow, path)) {
- this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
+ this.sb.crawlQueues.errorURL.push(request, profile, myHash, new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.");
}
@ -146,7 +146,7 @@ public final class HTTPLoader {
redirectionUrlString = redirectionUrlString == null ? "" : redirectionUrlString.trim();
if (redirectionUrlString.isEmpty()) {
- this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "no redirection url provided, field '" + HeaderFramework.LOCATION + "' is empty", statusCode);
+ this.sb.crawlQueues.errorURL.push(request, profile, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "no redirection url provided, field '" + HeaderFramework.LOCATION + "' is empty", statusCode);
throw new IOException("REJECTED EMTPY REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString);
}
@ -160,32 +160,32 @@ public final class HTTPLoader {
this.sb.webStructure.generateCitationReference(url, redirectionUrl);
if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_RECORD_REDIRECTS, true)) {
- this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.FINAL_REDIRECT_RULE, "redirect to " + redirectionUrlString, statusCode);
+ this.sb.crawlQueues.errorURL.push(request, profile, myHash, new Date(), 1, FailCategory.FINAL_REDIRECT_RULE, "redirect to " + redirectionUrlString, statusCode);
}
if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true)) {
// if we are already doing a shutdown we don't need to retry crawling
if (Thread.currentThread().isInterrupted()) {
- this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "server shutdown", statusCode);
+ this.sb.crawlQueues.errorURL.push(request, profile, myHash, new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "server shutdown", statusCode);
throw new IOException("CRAWLER Retry of URL=" + requestURLString + " aborted because of server shutdown.");
}
// check if the url was already loaded
if (Cache.has(redirectionUrl.hash())) { // customer request
- this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "redirection to double content", statusCode);
+ this.sb.crawlQueues.errorURL.push(request, profile, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "redirection to double content", statusCode);
throw new IOException("CRAWLER Redirection of URL=" + requestURLString + " ignored. The url appears already in htcache");
}
// retry crawling with new url
request.redirectURL(redirectionUrl);
- return load(request, retryCount - 1, maxFileSize, blacklistType, timeout);
+ return load(request, profile, retryCount - 1, maxFileSize, blacklistType, timeout);
}
// we don't want to follow redirects
- this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode);
+ this.sb.crawlQueues.errorURL.push(request, profile, myHash, new Date(), 1, FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode);
throw new IOException("REJECTED UNWANTED REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString);
} else if (responseBody == null) {
// no response, reject file
- this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "no response body", statusCode);
+ this.sb.crawlQueues.errorURL.push(request, profile, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "no response body", statusCode);
throw new IOException("REJECTED EMPTY RESPONSE BODY '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString);
} else if (statusCode == 200 || statusCode == 203) {
// the transfer is ok
@ -196,12 +196,11 @@ public final class HTTPLoader {
// check length again in case it was not possible to get the length before loading
if (maxFileSize >= 0 && contentLength > maxFileSize) {
- this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.FINAL_PROCESS_CONTEXT, "file size limit exceeded", statusCode);
+ this.sb.crawlQueues.errorURL.push(request, profile, myHash, new Date(), 1, FailCategory.FINAL_PROCESS_CONTEXT, "file size limit exceeded", statusCode);
throw new IOException("REJECTED URL " + request.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes. (GET)");
}
// create a new cache entry
- final CrawlProfile profile = request.profileHandle() == null ? null : this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
response = new Response(
request,
requestHeader,
@ -214,7 +213,7 @@ public final class HTTPLoader {
return response;
} else {
// if the response has not the right response type then reject file
- this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", statusCode);
+ this.sb.crawlQueues.errorURL.push(request, profile, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", statusCode);
throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString);
}
}

@ -186,15 +186,15 @@ public final class LoaderDispatcher {
if (url.isFile() || url.isSMB()) cacheStrategy = CacheStrategy.NOCACHE; // load just from the file system
final String protocol = url.getProtocol();
final String host = url.getHost();
+ final CrawlProfile crawlProfile = request.profileHandle() == null ? null : this.sb.crawler.getActive(UTF8.getBytes(request.profileHandle()));
// check if url is in blacklist
if (blacklistType != null && host != null && Switchboard.urlBlacklist.isListed(blacklistType, host.toLowerCase(), url.getFile())) {
- this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
+ this.sb.crawlQueues.errorURL.push(request, crawlProfile, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
throw new IOException("DISPATCHER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.");
}
// check if we have the page in the cache
- final CrawlProfile crawlProfile = request.profileHandle() == null ? null : this.sb.crawler.getActive(UTF8.getBytes(request.profileHandle()));
if (cacheStrategy != CacheStrategy.NOCACHE && crawlProfile != null) {
// we have passed a first test if caching is allowed
// now see if there is a cache entry
@ -280,7 +280,7 @@ public final class LoaderDispatcher {
// load resource from the internet
Response response = null;
if (protocol.equals("http") || protocol.equals("https")) {
- response = this.httpLoader.load(request, maxFileSize, blacklistType, timeout);
+ response = this.httpLoader.load(request, crawlProfile, maxFileSize, blacklistType, timeout);
} else if (protocol.equals("ftp")) {
response = this.ftpLoader.load(request, true);
} else if (protocol.equals("smb")) {
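
Together with the HTTPLoader changes above, the profile is now resolved once per load request and handed down, rather than being looked up from the profile handle inside each loader and again on every redirect retry. A simplified sketch of the handoff, with sb, request and httpLoader as stand-ins for the surrounding objects:

// LoaderDispatcher: resolve the profile before the blacklist check, so it is
// available both for error reporting and for the cache-strategy decision
final CrawlProfile crawlProfile = request.profileHandle() == null
        ? null
        : sb.crawler.getActive(UTF8.getBytes(request.profileHandle()));

// HTTPLoader: the same object is reused for every errorURL.push(...) and is
// carried through the redirect retry via load(request, profile, retryCount - 1, ...)
final Response response = httpLoader.load(request, crawlProfile, maxFileSize, blacklistType, timeout);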

@ -1803,6 +1803,7 @@ public final class Switchboard extends serverSwitch {
//if (log.isFine()) log.logFine("deQueue: not indexed any word in URL " + response.url() + "; cause: " + noIndexReason);
addURLtoErrorDB(
response.url(),
+ response.profile(),
(referrerURL == null) ? null : referrerURL.hash(),
response.initiator(),
response.name(),
@ -2474,6 +2475,7 @@ public final class Switchboard extends serverSwitch {
this.log.warn("the resource '" + response.url() + "' is missing in the cache.");
addURLtoErrorDB(
response.url(),
+ response.profile(),
response.referrerHash(),
response.initiator(),
response.name(),
@ -2498,6 +2500,7 @@ public final class Switchboard extends serverSwitch {
this.log.warn("Unable to parse the resource '" + response.url() + "'. " + e.getMessage());
addURLtoErrorDB(
response.url(),
+ response.profile(),
response.referrerHash(),
response.initiator(),
response.name(),
@ -2597,6 +2600,7 @@ public final class Switchboard extends serverSwitch {
if (this.log.isInfo()) this.log.info("Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + profile.indexUrlMustMatchPattern().pattern() + ", indexUrlMustNotMatchPattern = " + profile.indexUrlMustNotMatchPattern().pattern());
addURLtoErrorDB(
in.queueEntry.url(),
+ profile,
in.queueEntry.referrerHash(),
in.queueEntry.initiator(),
in.queueEntry.name(),
@ -2612,6 +2616,7 @@ public final class Switchboard extends serverSwitch {
if (this.log.isInfo()) this.log.info("Not Condensed Resource '" + urls + "': denied by document-attached noindexing rule");
addURLtoErrorDB(
in.queueEntry.url(),
+ profile,
in.queueEntry.referrerHash(),
in.queueEntry.initiator(),
in.queueEntry.name(),
@ -2624,6 +2629,7 @@ public final class Switchboard extends serverSwitch {
if (this.log.isInfo()) this.log.info("Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustMatchPattern = " + profile.indexContentMustMatchPattern().pattern() + ", indexContentMustNotMatchPattern = " + profile.indexContentMustNotMatchPattern().pattern());
addURLtoErrorDB(
in.queueEntry.url(),
+ profile,
in.queueEntry.referrerHash(),
in.queueEntry.initiator(),
in.queueEntry.name(),
@ -2707,6 +2713,7 @@ public final class Switchboard extends serverSwitch {
//if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by rule in document, process case=" + processCase);
addURLtoErrorDB(
url,
+ profile,
(referrerURL == null) ? null : referrerURL.hash(),
queueEntry.initiator(),
dc_title,
@ -2719,6 +2726,7 @@ public final class Switchboard extends serverSwitch {
//if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by profile rule, process case=" + processCase + ", profile name = " + queueEntry.profile().name());
addURLtoErrorDB(
url,
+ profile,
(referrerURL == null) ? null : referrerURL.hash(),
queueEntry.initiator(),
dc_title,
@ -3361,6 +3369,7 @@ public final class Switchboard extends serverSwitch {
private void addURLtoErrorDB(
final DigestURI url,
+ final CrawlProfile profile,
final byte[] referrerHash,
final byte[] initiator,
final String name,
@ -3380,7 +3389,7 @@ public final class Switchboard extends serverSwitch {
0,
0,
0);
- this.crawlQueues.errorURL.push(bentry, initiator, new Date(), 0, failCategory, failreason, -1);
+ this.crawlQueues.errorURL.push(bentry, profile, initiator, new Date(), 0, failCategory, failreason, -1);
}
public final void heuristicSite(final SearchEvent searchEvent, final String host) {

@ -64,6 +64,7 @@ import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.SpaceExceededException;
+ import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.retrieval.Response;
import net.yacy.document.Condenser;
import net.yacy.document.Document;
@ -1191,7 +1192,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
* @param httpstatus
* @throws IOException
*/
- public SolrInputDocument err(final DigestURI digestURI, final String failReason, final FailType failType, final int httpstatus) throws IOException {
+ public SolrInputDocument err(final DigestURI digestURI, String[] collections, final String failReason, final FailType failType, final int httpstatus) throws IOException {
final SolrInputDocument solrdoc = new SolrInputDocument();
add(solrdoc, CollectionSchema.id, ASCII.String(digestURI.hash()));
add(solrdoc, CollectionSchema.sku, digestURI.toNormalform(true));
@ -1212,6 +1213,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
if (contains(CollectionSchema.failreason_s)) add(solrdoc, CollectionSchema.failreason_s, failReason);
if (contains(CollectionSchema.failtype_s)) add(solrdoc, CollectionSchema.failtype_s, failType.name());
if (contains(CollectionSchema.httpstatus_i)) add(solrdoc, CollectionSchema.httpstatus_i, httpstatus);
+ if (contains(CollectionSchema.collection_sxt)) add(solrdoc, CollectionSchema.collection_sxt, collections);
return solrdoc;
}
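
The extended err(...) is where collection_sxt finally reaches the error document. A hedged sketch of a direct call, reusing the argument pattern from ZURL.push above (configuration and connector stand in for the Fulltext accessors, and the argument values are illustrative):

final SolrInputDocument errorDoc = configuration.err(
        url,                                              // DigestURI of the failed document
        profile == null ? null : profile.collections(),   // collection names written to collection_sxt
        failCategory.name() + " " + reason,
        failCategory.failType,
        httpcode);
connector.add(errorDoc);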

@ -259,7 +259,7 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaS
// check if url is in blacklist
if (Switchboard.urlBlacklist.isListed(blacklistType, url.getHost().toLowerCase(), url.getFile())) {
- Switchboard.getSwitchboard().crawlQueues.errorURL.push(new Request(url, null), ASCII.getBytes(Switchboard.getSwitchboard().peers.mySeed().hash), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
+ Switchboard.getSwitchboard().crawlQueues.errorURL.push(new Request(url, null), null, ASCII.getBytes(Switchboard.getSwitchboard().peers.mySeed().hash), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
ConcurrentLog.fine("snippet fetch", "MEDIA-SNIPPET Rejecting URL '" + url.toString() + "'. URL is in blacklist.");
isBlacklisted = true;
}