Repository: https://github.com/yacy/yacy_search_server.git
Commit 89c0aa0e74 (parent 0df5195cb0)

added collection_sxt to error documents

In effect, the CrawlProfile of a failed URL is now passed through the whole error-reporting path so that the profile's collection names end up in the collection_sxt field of the Solr error documents.

@@ -444,6 +444,7 @@ public class Crawler_p {
                     0,
                     0,
                     0),
+                null,
                 sb.peers.mySeed().hash.getBytes(),
                 new Date(),
                 1,

@@ -163,6 +163,7 @@ public final class crawlReceipt {
         sb.crawlQueues.delegatedURL.remove(entry.hash()); // the delegated work is transformed into an error case
         sb.crawlQueues.errorURL.push(
                 entry.toBalancerEntry(iam),
+                null,
                 youare.getBytes(),
                 null,
                 0,

@@ -82,6 +82,7 @@ public class urls {
         // place url to notice-url db
         sb.crawlQueues.delegatedURL.push(
                 entry,
+                null,
                 sb.peers.mySeed().hash.getBytes(),
                 new Date(),
                 0,

@@ -149,7 +149,8 @@ public final class CrawlStacker {
 
             // if the url was rejected we store it into the error URL db
             if (rejectReason != null) {
-                this.nextQueue.errorURL.push(entry, ASCII.getBytes(this.peers.mySeed().hash), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, rejectReason, -1);
+                final CrawlProfile profile = this.crawler.getActive(UTF8.getBytes(entry.profileHandle()));
+                this.nextQueue.errorURL.push(entry, profile, ASCII.getBytes(this.peers.mySeed().hash), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, rejectReason, -1);
             }
         } catch (final Exception e) {
             CrawlStacker.this.log.warn("Error while processing stackCrawl entry.\n" + "Entry: " + entry.toString() + "Error: " + e.toString(), e);

@@ -612,6 +612,7 @@ public class CrawlQueues {
         private Request request;
         private final Integer code;
         private final long start;
+        private final CrawlProfile profile;
 
         private Loader(final Request entry) {
             this.start = System.currentTimeMillis();
@@ -619,6 +620,7 @@ public class CrawlQueues {
             this.request.setStatus("worker-initialized", WorkflowJob.STATUS_INITIATED);
             this.code = Integer.valueOf(entry.hashCode());
             this.setPriority(Thread.MIN_PRIORITY); // http requests from the crawler should not cause that other functions work worse
+            this.profile = CrawlQueues.this.sb.crawler.getActive(UTF8.getBytes(this.request.profileHandle()));
         }
 
         private long age() {
@@ -637,6 +639,7 @@ public class CrawlQueues {
                     //if (log.isFine()) log.logFine("Crawling of URL '" + request.url().toString() + "' disallowed by robots.txt.");
                     CrawlQueues.this.errorURL.push(
                             this.request,
+                            profile,
                             ASCII.getBytes(CrawlQueues.this.sb.peers.mySeed().hash),
                             new Date(),
                             1,
@@ -652,8 +655,7 @@ public class CrawlQueues {
                 // returns null if everything went fine, a fail reason string if a problem occurred
                 try {
                     this.request.setStatus("loading", WorkflowJob.STATUS_RUNNING);
-                    final CrawlProfile e = CrawlQueues.this.sb.crawler.getActive(UTF8.getBytes(this.request.profileHandle()));
-                    final Response response = CrawlQueues.this.sb.loader.load(this.request, e == null ? CacheStrategy.IFEXIST : e.cacheStrategy(), BlacklistType.CRAWLER, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
+                    final Response response = CrawlQueues.this.sb.loader.load(this.request, profile == null ? CacheStrategy.IFEXIST : profile.cacheStrategy(), BlacklistType.CRAWLER, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
                     if (response == null) {
                         this.request.setStatus("error", WorkflowJob.STATUS_FINISHED);
                         if (CrawlQueues.this.log.isFine()) {
@@ -677,6 +679,7 @@ public class CrawlQueues {
                     if (result != null) {
                         CrawlQueues.this.errorURL.push(
                                 this.request,
+                                profile,
                                 ASCII.getBytes(CrawlQueues.this.sb.peers.mySeed().hash),
                                 new Date(),
                                 1,
@@ -690,6 +693,7 @@ public class CrawlQueues {
                 } catch (final Exception e) {
                     CrawlQueues.this.errorURL.push(
                             this.request,
+                            profile,
                             ASCII.getBytes(CrawlQueues.this.sb.peers.mySeed().hash),
                             new Date(),
                             1,

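A note on the CrawlQueues hunks: the Loader worker previously looked the profile up ad hoc inside its load routine (the removed local "final CrawlProfile e = ..."); it now resolves the profile once in the constructor and keeps it in a final field, so the same object drives both the cache-strategy choice and every errorURL.push() call. A minimal sketch of that pattern, with unrelated fields and error handling elided (YaCy's classes assumed on the classpath):

    private final class Loader extends Thread {
        private Request request;
        private final CrawlProfile profile; // resolved once, reused everywhere

        private Loader(final Request entry) {
            this.request = entry;
            // one getActive() lookup instead of repeating it at each use site
            this.profile = sb.crawler.getActive(UTF8.getBytes(entry.profileHandle()));
        }
    }
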
@@ -172,6 +172,7 @@ public class ZURL implements Iterable<ZURL.Entry> {
 
     public void push(
             final Request bentry,
+            final CrawlProfile profile,
             final byte[] executor,
             final Date workdate,
             final int workcount,
@@ -190,7 +191,7 @@ public class ZURL implements Iterable<ZURL.Entry> {
         if (this.fulltext.getDefaultConnector() != null && failCategory.store) {
             // send the error to solr
             try {
-                SolrInputDocument errorDoc = this.fulltext.getDefaultConfiguration().err(bentry.url(), failCategory.name() + " " + reason, failCategory.failType, httpcode);
+                SolrInputDocument errorDoc = this.fulltext.getDefaultConfiguration().err(bentry.url(), profile == null ? null : profile.collections(), failCategory.name() + " " + reason, failCategory.failType, httpcode);
                 this.fulltext.getDefaultConnector().add(errorDoc);
             } catch (final IOException e) {
                 ConcurrentLog.warn("SOLR", "failed to send error " + bentry.url().toNormalform(true) + " to solr: " + e.getMessage());

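These ZURL hunks are the heart of the commit: push() gains a CrawlProfile parameter (several callers, e.g. crawlReceipt and MediaSnippet, simply pass null), and the profile's collection names are forwarded into the Solr error document. Reconstructed from the call sites in this diff, the new signature reads roughly as follows; the parameter names after workcount are inferred from the arguments (a FailCategory, a reason string, an HTTP status) and are therefore approximate:

    public void push(
            final Request bentry,        // the request that failed
            final CrawlProfile profile,  // new; may be null when no profile is known
            final byte[] executor,       // hash of the executing peer
            final Date workdate,
            final int workcount,
            final FailCategory failCategory,
            final String reason,
            final int httpcode) {
        // ... inside, the collections reach the error document via
        // err(bentry.url(), profile == null ? null : profile.collections(), ...)
    }
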
@@ -101,6 +101,7 @@ public class FTPLoader {
         // create new ftp client
         final FTPClient ftpClient = new FTPClient();
 
+        final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
         // get a connection
         if (openConnection(ftpClient, entryUrl)) {
             // test if the specified file is a directory
@@ -130,7 +131,6 @@ public class FTPLoader {
                     final ResponseHeader responseHeader = new ResponseHeader(200);
                     responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date()));
                     responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
-                    final CrawlProfile profile = this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
                     response = new Response(
                             request,
                             requestHeader,
@@ -156,7 +156,7 @@ public class FTPLoader {
         if (berr.size() > 0 || response == null) {
             // some error logging
             final String detail = (berr.size() > 0) ? "Errorlog: " + berr.toString() : "";
-            this.sb.crawlQueues.errorURL.push(request, ASCII.getBytes(this.sb.peers.mySeed().hash), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, " ftp server download, " + detail, -1);
+            this.sb.crawlQueues.errorURL.push(request, profile, ASCII.getBytes(this.sb.peers.mySeed().hash), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, " ftp server download, " + detail, -1);
             throw new IOException("FTPLoader: Unable to download URL '" + request.url().toString() + "': " + detail);
         }
 

@@ -70,20 +70,20 @@ public final class HTTPLoader {
         this.socketTimeout = (int) sb.getConfigLong("crawler.clientTimeout", 30000);
     }
 
-    public Response load(final Request entry, final int maxFileSize, final BlacklistType blacklistType, int timeout) throws IOException {
+    public Response load(final Request entry, CrawlProfile profile, final int maxFileSize, final BlacklistType blacklistType, int timeout) throws IOException {
         Latency.updateBeforeLoad(entry.url());
         final long start = System.currentTimeMillis();
-        final Response doc = load(entry, DEFAULT_CRAWLING_RETRY_COUNT, maxFileSize, blacklistType, timeout);
+        final Response doc = load(entry, profile, DEFAULT_CRAWLING_RETRY_COUNT, maxFileSize, blacklistType, timeout);
         Latency.updateAfterLoad(entry.url(), System.currentTimeMillis() - start);
         return doc;
     }
 
-    private Response load(final Request request, final int retryCount, final int maxFileSize, final BlacklistType blacklistType, int timeout) throws IOException {
+    private Response load(final Request request, CrawlProfile profile, final int retryCount, final int maxFileSize, final BlacklistType blacklistType, int timeout) throws IOException {
 
         byte[] myHash = ASCII.getBytes(this.sb.peers.mySeed().hash);
 
         if (retryCount < 0) {
-            this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "retry counter exceeded", -1);
+            this.sb.crawlQueues.errorURL.push(request, profile, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "retry counter exceeded", -1);
             throw new IOException("retry counter exceeded for URL " + request.url().toString() + ". Processing aborted.");
         }
 
@@ -99,7 +99,7 @@ public final class HTTPLoader {
         // check if url is in blacklist
         final String hostlow = host.toLowerCase();
         if (blacklistType != null && Switchboard.urlBlacklist.isListed(blacklistType, hostlow, path)) {
-            this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
+            this.sb.crawlQueues.errorURL.push(request, profile, myHash, new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
             throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.");
         }
 
@@ -146,7 +146,7 @@ public final class HTTPLoader {
             redirectionUrlString = redirectionUrlString == null ? "" : redirectionUrlString.trim();
 
             if (redirectionUrlString.isEmpty()) {
-                this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "no redirection url provided, field '" + HeaderFramework.LOCATION + "' is empty", statusCode);
+                this.sb.crawlQueues.errorURL.push(request, profile, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "no redirection url provided, field '" + HeaderFramework.LOCATION + "' is empty", statusCode);
                 throw new IOException("REJECTED EMTPY REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString);
             }
 
@@ -160,32 +160,32 @@ public final class HTTPLoader {
             this.sb.webStructure.generateCitationReference(url, redirectionUrl);
 
             if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_RECORD_REDIRECTS, true)) {
-                this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.FINAL_REDIRECT_RULE, "redirect to " + redirectionUrlString, statusCode);
+                this.sb.crawlQueues.errorURL.push(request, profile, myHash, new Date(), 1, FailCategory.FINAL_REDIRECT_RULE, "redirect to " + redirectionUrlString, statusCode);
             }
 
             if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true)) {
                 // if we are already doing a shutdown we don't need to retry crawling
                 if (Thread.currentThread().isInterrupted()) {
-                    this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "server shutdown", statusCode);
+                    this.sb.crawlQueues.errorURL.push(request, profile, myHash, new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "server shutdown", statusCode);
                     throw new IOException("CRAWLER Retry of URL=" + requestURLString + " aborted because of server shutdown.");
                 }
 
                 // check if the url was already loaded
                 if (Cache.has(redirectionUrl.hash())) { // customer request
-                    this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "redirection to double content", statusCode);
+                    this.sb.crawlQueues.errorURL.push(request, profile, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "redirection to double content", statusCode);
                     throw new IOException("CRAWLER Redirection of URL=" + requestURLString + " ignored. The url appears already in htcache");
                 }
 
                 // retry crawling with new url
                 request.redirectURL(redirectionUrl);
-                return load(request, retryCount - 1, maxFileSize, blacklistType, timeout);
+                return load(request, profile, retryCount - 1, maxFileSize, blacklistType, timeout);
             }
             // we don't want to follow redirects
-            this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode);
+            this.sb.crawlQueues.errorURL.push(request, profile, myHash, new Date(), 1, FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode);
             throw new IOException("REJECTED UNWANTED REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString);
         } else if (responseBody == null) {
             // no response, reject file
-            this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "no response body", statusCode);
+            this.sb.crawlQueues.errorURL.push(request, profile, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "no response body", statusCode);
             throw new IOException("REJECTED EMPTY RESPONSE BODY '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString);
         } else if (statusCode == 200 || statusCode == 203) {
             // the transfer is ok
 
@@ -196,12 +196,11 @@ public final class HTTPLoader {
 
             // check length again in case it was not possible to get the length before loading
             if (maxFileSize >= 0 && contentLength > maxFileSize) {
-                this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.FINAL_PROCESS_CONTEXT, "file size limit exceeded", statusCode);
+                this.sb.crawlQueues.errorURL.push(request, profile, myHash, new Date(), 1, FailCategory.FINAL_PROCESS_CONTEXT, "file size limit exceeded", statusCode);
                 throw new IOException("REJECTED URL " + request.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes. (GET)");
             }
 
             // create a new cache entry
-            final CrawlProfile profile = request.profileHandle() == null ? null : this.sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
             response = new Response(
                     request,
                     requestHeader,
@@ -214,7 +213,7 @@ public final class HTTPLoader {
             return response;
         } else {
             // if the response has not the right response type then reject file
-            this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", statusCode);
+            this.sb.crawlQueues.errorURL.push(request, profile, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", statusCode);
             throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString);
         }
     }

@@ -186,15 +186,15 @@ public final class LoaderDispatcher {
         if (url.isFile() || url.isSMB()) cacheStrategy = CacheStrategy.NOCACHE; // load just from the file system
         final String protocol = url.getProtocol();
         final String host = url.getHost();
 
+        final CrawlProfile crawlProfile = request.profileHandle() == null ? null : this.sb.crawler.getActive(UTF8.getBytes(request.profileHandle()));
         // check if url is in blacklist
         if (blacklistType != null && host != null && Switchboard.urlBlacklist.isListed(blacklistType, host.toLowerCase(), url.getFile())) {
-            this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
+            this.sb.crawlQueues.errorURL.push(request, crawlProfile, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
             throw new IOException("DISPATCHER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.");
         }
 
         // check if we have the page in the cache
-        final CrawlProfile crawlProfile = request.profileHandle() == null ? null : this.sb.crawler.getActive(UTF8.getBytes(request.profileHandle()));
         if (cacheStrategy != CacheStrategy.NOCACHE && crawlProfile != null) {
             // we have passed a first test if caching is allowed
             // now see if there is a cache entry
@@ -280,7 +280,7 @@ public final class LoaderDispatcher {
         // load resource from the internet
         Response response = null;
         if (protocol.equals("http") || protocol.equals("https")) {
-            response = this.httpLoader.load(request, maxFileSize, blacklistType, timeout);
+            response = this.httpLoader.load(request, crawlProfile, maxFileSize, blacklistType, timeout);
         } else if (protocol.equals("ftp")) {
             response = this.ftpLoader.load(request, true);
         } else if (protocol.equals("smb")) {

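HTTPLoader and LoaderDispatcher together show the threading pattern used throughout the commit: the dispatcher resolves the profile from the request's profile handle once, reuses it for the blacklist error report and the cache check, and hands it to httpLoader.load(), whose public and private overloads forward it to every errorURL.push() and into the recursive redirect retry. A condensed sketch of the call chain, with names as in the diff:

    // LoaderDispatcher: resolve the profile once ...
    final CrawlProfile crawlProfile = request.profileHandle() == null
            ? null
            : this.sb.crawler.getActive(UTF8.getBytes(request.profileHandle()));
    // ... and hand it down to the protocol loader:
    response = this.httpLoader.load(request, crawlProfile, maxFileSize, blacklistType, timeout);

    // HTTPLoader: the profile also travels with the redirect retry recursion:
    // return load(request, profile, retryCount - 1, maxFileSize, blacklistType, timeout);
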
@@ -1803,6 +1803,7 @@ public final class Switchboard extends serverSwitch {
                 //if (log.isFine()) log.logFine("deQueue: not indexed any word in URL " + response.url() + "; cause: " + noIndexReason);
                 addURLtoErrorDB(
                         response.url(),
+                        response.profile(),
                         (referrerURL == null) ? null : referrerURL.hash(),
                         response.initiator(),
                         response.name(),
@@ -2474,6 +2475,7 @@ public final class Switchboard extends serverSwitch {
             this.log.warn("the resource '" + response.url() + "' is missing in the cache.");
             addURLtoErrorDB(
                     response.url(),
+                    response.profile(),
                     response.referrerHash(),
                     response.initiator(),
                     response.name(),
@@ -2498,6 +2500,7 @@ public final class Switchboard extends serverSwitch {
             this.log.warn("Unable to parse the resource '" + response.url() + "'. " + e.getMessage());
             addURLtoErrorDB(
                     response.url(),
+                    response.profile(),
                     response.referrerHash(),
                     response.initiator(),
                     response.name(),
@@ -2597,6 +2600,7 @@ public final class Switchboard extends serverSwitch {
             if (this.log.isInfo()) this.log.info("Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + profile.indexUrlMustMatchPattern().pattern() + ", indexUrlMustNotMatchPattern = " + profile.indexUrlMustNotMatchPattern().pattern());
             addURLtoErrorDB(
                     in.queueEntry.url(),
+                    profile,
                     in.queueEntry.referrerHash(),
                     in.queueEntry.initiator(),
                     in.queueEntry.name(),
@@ -2612,6 +2616,7 @@ public final class Switchboard extends serverSwitch {
             if (this.log.isInfo()) this.log.info("Not Condensed Resource '" + urls + "': denied by document-attached noindexing rule");
             addURLtoErrorDB(
                     in.queueEntry.url(),
+                    profile,
                     in.queueEntry.referrerHash(),
                     in.queueEntry.initiator(),
                     in.queueEntry.name(),
@@ -2624,6 +2629,7 @@ public final class Switchboard extends serverSwitch {
             if (this.log.isInfo()) this.log.info("Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustMatchPattern = " + profile.indexContentMustMatchPattern().pattern() + ", indexContentMustNotMatchPattern = " + profile.indexContentMustNotMatchPattern().pattern());
             addURLtoErrorDB(
                     in.queueEntry.url(),
+                    profile,
                     in.queueEntry.referrerHash(),
                     in.queueEntry.initiator(),
                     in.queueEntry.name(),
@@ -2707,6 +2713,7 @@ public final class Switchboard extends serverSwitch {
             //if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by rule in document, process case=" + processCase);
             addURLtoErrorDB(
                     url,
+                    profile,
                     (referrerURL == null) ? null : referrerURL.hash(),
                     queueEntry.initiator(),
                     dc_title,
@@ -2719,6 +2726,7 @@ public final class Switchboard extends serverSwitch {
             //if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by profile rule, process case=" + processCase + ", profile name = " + queueEntry.profile().name());
             addURLtoErrorDB(
                     url,
+                    profile,
                     (referrerURL == null) ? null : referrerURL.hash(),
                     queueEntry.initiator(),
                     dc_title,
@@ -3361,6 +3369,7 @@ public final class Switchboard extends serverSwitch {
 
     private void addURLtoErrorDB(
             final DigestURI url,
+            final CrawlProfile profile,
             final byte[] referrerHash,
             final byte[] initiator,
             final String name,
@@ -3380,7 +3389,7 @@ public final class Switchboard extends serverSwitch {
                 0,
                 0,
                 0);
-        this.crawlQueues.errorURL.push(bentry, initiator, new Date(), 0, failCategory, failreason, -1);
+        this.crawlQueues.errorURL.push(bentry, profile, initiator, new Date(), 0, failCategory, failreason, -1);
     }
 
     public final void heuristicSite(final SearchEvent searchEvent, final String host) {

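For completeness, the private Switchboard helper that funnels indexing-time failures (parse errors, cache misses, profile rejections) into the error queue now carries the profile too. Its signature, reconstructed from the hunks above; the trailing failCategory and failreason parameters are inferred from the push() call and may not match the source exactly:

    private void addURLtoErrorDB(
            final DigestURI url,
            final CrawlProfile profile,   // new parameter
            final byte[] referrerHash,
            final byte[] initiator,
            final String name,
            final FailCategory failCategory,
            final String failreason) {
        // builds a Request bentry from the arguments, then:
        // this.crawlQueues.errorURL.push(bentry, profile, initiator, new Date(), 0, failCategory, failreason, -1);
    }
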
@@ -64,6 +64,7 @@ import net.yacy.cora.storage.HandleSet;
 import net.yacy.cora.util.CommonPattern;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.cora.util.SpaceExceededException;
+import net.yacy.crawler.data.CrawlProfile;
 import net.yacy.crawler.retrieval.Response;
 import net.yacy.document.Condenser;
 import net.yacy.document.Document;
@@ -1191,7 +1192,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Serializable {
      * @param httpstatus
      * @throws IOException
      */
-    public SolrInputDocument err(final DigestURI digestURI, final String failReason, final FailType failType, final int httpstatus) throws IOException {
+    public SolrInputDocument err(final DigestURI digestURI, String[] collections, final String failReason, final FailType failType, final int httpstatus) throws IOException {
         final SolrInputDocument solrdoc = new SolrInputDocument();
         add(solrdoc, CollectionSchema.id, ASCII.String(digestURI.hash()));
         add(solrdoc, CollectionSchema.sku, digestURI.toNormalform(true));
@@ -1212,6 +1213,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Serializable {
         if (contains(CollectionSchema.failreason_s)) add(solrdoc, CollectionSchema.failreason_s, failReason);
         if (contains(CollectionSchema.failtype_s)) add(solrdoc, CollectionSchema.failtype_s, failType.name());
         if (contains(CollectionSchema.httpstatus_i)) add(solrdoc, CollectionSchema.httpstatus_i, httpstatus);
+        if (contains(CollectionSchema.collection_sxt)) add(solrdoc, CollectionSchema.collection_sxt, collections);
         return solrdoc;
     }
 

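On the schema side, err() now accepts the collection names and writes them to collection_sxt whenever that field is enabled in the schema configuration. A hypothetical call, assuming a CollectionConfiguration config, a DigestURI url, a CrawlProfile profile, a FailCategory failCategory, a String reason and a FailType value ft are in scope (concrete enum constants elided):

    final SolrInputDocument doc = config.err(
            url,
            profile == null ? null : profile.collections(), // new argument -> collection_sxt
            failCategory.name() + " " + reason,             // failReason, as ZURL builds it
            ft,                                             // failType
            -1);                                            // httpstatus when none is known
    // Besides id, sku, failreason_s, failtype_s and httpstatus_i, the error
    // document now carries collection_sxt, so error entries can be queried
    // or cleaned up per crawl collection.
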
@@ -259,7 +259,7 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaSnippet> {
 
         // check if url is in blacklist
         if (Switchboard.urlBlacklist.isListed(blacklistType, url.getHost().toLowerCase(), url.getFile())) {
-            Switchboard.getSwitchboard().crawlQueues.errorURL.push(new Request(url, null), ASCII.getBytes(Switchboard.getSwitchboard().peers.mySeed().hash), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
+            Switchboard.getSwitchboard().crawlQueues.errorURL.push(new Request(url, null), null, ASCII.getBytes(Switchboard.getSwitchboard().peers.mySeed().hash), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
             ConcurrentLog.fine("snippet fetch", "MEDIA-SNIPPET Rejecting URL '" + url.toString() + "'. URL is in blacklist.");
             isBlacklisted = true;
         }