- removed the ZURL data structure and the ZURL data file

- replaced load-failure logging with failure information that is stored
in Solr (the simplified push call is sketched below)
- fixed a bug in feed crawling: the must-match pattern is now applied to
feed URLs to filter out URLs that do not belong to a wanted domain
- delegatedURLs, which also used ZURL, are now temporary in-memory
objects
Michael Peter Christen 2013-09-17 15:27:02 +02:00
parent 31920385f7
commit 2602be8d1e
20 changed files with 401 additions and 649 deletions
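
In short, error reporting no longer builds a ZURL row for the urlError4.db table; callers now hand the failed URL directly to the new ErrorCache, which keeps a bounded in-memory map and mirrors storable failures to Solr. A minimal before/after sketch of the call shape, paraphrased from the hunks below (url, profile and reason are stand-in variables):

// before this commit: wrap the URL in a Request row and push it into the ZURL table
sb.crawlQueues.errorURL.push(
        new Request(sb.peers.mySeed().hash.getBytes(), url, null, "", new Date(),
                    profile.handle(), 0, 0, 0, 0),
        null, sb.peers.mySeed().hash.getBytes(), new Date(), 1,
        FailCategory.FINAL_LOAD_CONTEXT, reason, -1);

// after this commit: pass URL, profile and fail category straight to the ErrorCache
sb.crawlQueues.errorURL.push(url, profile, FailCategory.FINAL_LOAD_CONTEXT, reason, -1);

// delegated URLs are no longer persisted either; they live in a plain in-memory map
sb.crawlQueues.delegatedURL.put(ASCII.String(url.hash()), url);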

View File

@ -37,6 +37,7 @@ import java.util.regex.PatternSyntaxException;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.solr.FailCategory;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
@ -44,8 +45,6 @@ import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.CrawlSwitchboard;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.ZURL.FailCategory;
import net.yacy.crawler.retrieval.Request;
import net.yacy.crawler.retrieval.SitemapImporter;
import net.yacy.data.WorkTables;
import net.yacy.document.Document;
@ -392,7 +391,7 @@ public class Crawler_p {
for (DigestURL u: rootURLs) {
hosthashes.add(ASCII.getBytes(u.hosthash()));
}
sb.crawlQueues.errorURL.removeHosts(hosthashes, false);
sb.crawlQueues.errorURL.removeHosts(hosthashes);
for (byte[] hosthash: hosthashes) {
try {
String deletequery = CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + ASCII.String(hosthash) + "\" AND " + CollectionSchema.failreason_s.getSolrFieldName() + ":[* TO *]";
@ -440,24 +439,7 @@ public class Crawler_p {
} else {
StringBuilder fr = new StringBuilder();
for (Map.Entry<DigestURL, String> failure: failurls.entrySet()) {
sb.crawlQueues.errorURL.push(
new Request(
sb.peers.mySeed().hash.getBytes(),
failure.getKey(),
null,
"",
new Date(),
profile.handle(),
0,
0,
0,
0),
null,
sb.peers.mySeed().hash.getBytes(),
new Date(),
1,
FailCategory.FINAL_LOAD_CONTEXT,
failure.getValue(), -1);
sb.crawlQueues.errorURL.push(failure.getKey(), null, FailCategory.FINAL_LOAD_CONTEXT, failure.getValue(), -1);
fr.append(failure.getValue()).append('/');
}

View File

@ -439,7 +439,7 @@ public class HostBrowser {
FailType failType = errorDocs.get(entry.getKey());
if (failType == null) {
// maybe this is only in the errorURL
prop.put("files_list_" + c + "_type_stored_error", process == HarvestProcess.ERRORS ? sb.crawlQueues.errorURL.get(uri.hash()).anycause() : "unknown error");
prop.put("files_list_" + c + "_type_stored_error", process == HarvestProcess.ERRORS ? sb.crawlQueues.errorURL.get(ASCII.String(uri.hash())).getFailReason() : "unknown error");
} else {
prop.put("files_list_" + c + "_type_stored_error", failType == FailType.excl ? "excluded from indexing" : "load fail");
}

View File

@ -32,16 +32,12 @@
</colgroup>
<tr class="TableHeader">
<th>Time</th>
<th>Initiator</th>
<th>Executor</th>
<th>URL</th>
<th>Fail-Reason</th>
</tr>
#{list}#
<tr class="TableCell#(dark)#Light::Dark#(/dark)#">
<td>#[time]#</td>
<td>#[initiator]#</td>
<td>#[executor]#</td>
<td><a href="#[url]#">#[url]#</a></td>
<td>#[failreason]#</td>
</tr>

View File

@ -24,15 +24,14 @@
import java.util.ArrayList;
import java.util.Date;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.crawler.CrawlStacker;
import net.yacy.crawler.data.ZURL;
import net.yacy.peers.Seed;
import net.yacy.search.Switchboard;
import net.yacy.search.schema.CollectionConfiguration;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
@ -73,27 +72,19 @@ public class IndexCreateParserErrors_p {
}
dark = true;
DigestURL url;
byte[] initiatorHash, executorHash;
Seed initiatorSeed, executorSeed;
int j=0;
ArrayList<ZURL.Entry> l = sb.crawlQueues.errorURL.list(showRejectedCount);
ZURL.Entry entry;
ArrayList<CollectionConfiguration.FailDoc> l = sb.crawlQueues.errorURL.list(showRejectedCount);
CollectionConfiguration.FailDoc entry;
for (int i = l.size() - 1; i >= 0; i--) {
entry = l.get(i);
if (entry == null) continue;
url = entry.url();
url = entry.getDigestURL();
if (url == null) continue;
initiatorHash = entry.initiator();
executorHash = entry.executor();
initiatorSeed = (initiatorHash == null) ? null : sb.peers.getConnected(ASCII.String(initiatorHash));
executorSeed = (executorHash == null) ? null : sb.peers.getConnected(ASCII.String(executorHash));
prop.putHTML("rejected_list_"+j+"_time", GenericFormatter.SIMPLE_FORMATTER.format(entry.workdate()));
prop.putHTML("rejected_list_"+j+"_initiator", ((initiatorSeed == null) ? "proxy" : initiatorSeed.getName()));
prop.putHTML("rejected_list_"+j+"_executor", ((executorSeed == null) ? "proxy" : executorSeed.getName()));
prop.putHTML("rejected_list_"+j+"_time", GenericFormatter.SIMPLE_FORMATTER.format(new Date()));
prop.putHTML("rejected_list_"+j+"_url", url.toNormalform(false));
String cause = entry.anycause();
String cause = entry.getFailReason();
if (cause.startsWith(CrawlStacker.ERROR_NO_MATCH_MUST_MATCH_FILTER)) {
prop.put("rejected_list_"+j+"_failreason", "(<a href=\"/RegexTest.html?text=" + url.toNormalform(false) +
"&regex=" + cause.substring(CrawlStacker.ERROR_NO_MATCH_MUST_MATCH_FILTER.length()) + "\">test</a>) " + cause);

View File

@ -32,6 +32,7 @@
import java.net.MalformedURLException;
import java.util.Date;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.yacy.CacheStrategy;
@ -127,7 +128,7 @@ public class QuickCrawlLink_p {
final byte[] urlhash = crawlingStartURL.hash();
indexSegment.fulltext().remove(urlhash);
sb.crawlQueues.noticeURL.removeByURLHash(urlhash);
sb.crawlQueues.errorURL.remove(urlhash);
sb.crawlQueues.errorURL.remove(ASCII.String(urlhash));
// create crawling profile
CrawlProfile pe = null;

View File

@ -30,11 +30,11 @@
import java.io.IOException;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.federate.solr.FailCategory;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.data.ResultURLs;
import net.yacy.crawler.data.ResultURLs.EventOrigin;
import net.yacy.crawler.data.ZURL.FailCategory;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.peers.Protocol;
import net.yacy.peers.Seed;
@ -161,14 +161,7 @@ public final class crawlReceipt {
}
sb.crawlQueues.delegatedURL.remove(entry.hash()); // the delegated work is transformed into an error case
sb.crawlQueues.errorURL.push(
entry.toBalancerEntry(iam),
null,
youare.getBytes(),
null,
0,
FailCategory.FINAL_LOAD_CONTEXT,
result + ":" + reason, -1);
sb.crawlQueues.errorURL.push(entry.url(), null, FailCategory.FINAL_LOAD_CONTEXT, result + ":" + reason, -1);
//switchboard.noticeURL.remove(receivedUrlhash);
prop.put("delay", "3600");
return prop;

View File

@ -246,7 +246,7 @@ public final class search {
false,
indexSegment,
rankingProfile,
header.get(RequestHeader.USER_AGENT, ""),
header.get(HeaderFramework.USER_AGENT, ""),
false,
false,
0.0d,
@ -310,7 +310,7 @@ public final class search {
false,
sb.index,
rankingProfile,
header.get(RequestHeader.USER_AGENT, ""),
header.get(HeaderFramework.USER_AGENT, ""),
false,
false,
0.0d,

View File

@ -25,14 +25,11 @@
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import java.io.IOException;
import java.util.Date;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.crawler.data.NoticedURL;
import net.yacy.crawler.data.ZURL.FailCategory;
import net.yacy.crawler.retrieval.Request;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.peers.Protocol;
@ -80,15 +77,7 @@ public class urls {
referrer = sb.getURL(entry.referrerhash());
// place url to notice-url db
sb.crawlQueues.delegatedURL.push(
entry,
null,
sb.peers.mySeed().hash.getBytes(),
new Date(),
0,
FailCategory.FINAL_PROCESS_CONTEXT,
"client=____________",
-1);
sb.crawlQueues.delegatedURL.put(ASCII.String(entry.url().hash()), entry.url());
// create RSS entry
prop.put("item_" + c + "_title", "");

View File

@ -663,7 +663,7 @@ public class yacysearch {
authenticated,
indexSegment,
ranking,
header.get(RequestHeader.USER_AGENT, ""),
header.get(HeaderFramework.USER_AGENT, ""),
sb.getConfigBool(SwitchboardConstants.SEARCH_VERIFY_DELETE, false)
&& sb.getConfigBool(SwitchboardConstants.NETWORK_SEARCHVERIFY, false)
&& sb.peers.mySeed().getFlagAcceptRemoteIndex(),

View File

@ -0,0 +1,39 @@
/**
* FailCategory
* Copyright 2013 by Michael Peter Christen
* First released 17.10.2013 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.federate.solr;
public enum FailCategory {
// TEMPORARY categories are such failure cases that should be tried again
// FINAL categories are such failure cases that are final and should not be tried again
TEMPORARY_NETWORK_FAILURE(true, FailType.fail), // an entity could not been loaded
FINAL_PROCESS_CONTEXT(false, FailType.excl), // because of a processing context we do not want that url again (i.e. remote crawling)
FINAL_LOAD_CONTEXT(false, FailType.excl), // the crawler configuration does not want to load the entity
FINAL_ROBOTS_RULE(true, FailType.excl), // a remote server denies indexing or loading
FINAL_REDIRECT_RULE(true, FailType.excl); // the remote server redirects this page, thus disallowing reading of content
public final boolean store;
public final FailType failType;
private FailCategory(boolean store, FailType failType) {
this.store = store;
this.failType = failType;
}
}
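
As a hedged illustration of how the two enum fields are meant to be read (this mirrors the logic of ErrorCache.push() added further down in this commit; errorCache, url and profile are placeholder names):

// TEMPORARY_* categories mark failures that may be retried later,
// FINAL_* categories mark URLs that should not be fetched again
FailCategory category = FailCategory.TEMPORARY_NETWORK_FAILURE;

// only categories with store == true produce a fail document in Solr;
// the failType (fail vs. excl) is recorded together with the failure reason
errorCache.push(url, profile, category, "cannot load: connection refused", -1);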

View File

@ -41,6 +41,7 @@ import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.solr.FailCategory;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.ftp.FTPClient;
@ -49,9 +50,7 @@ import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.CrawlQueues;
import net.yacy.crawler.data.NoticedURL;
import net.yacy.crawler.data.ResultURLs;
import net.yacy.crawler.data.ZURL;
import net.yacy.crawler.data.ResultURLs.EventOrigin;
import net.yacy.crawler.data.ZURL.FailCategory;
import net.yacy.crawler.retrieval.FTPLoader;
import net.yacy.crawler.retrieval.HTTPLoader;
import net.yacy.crawler.retrieval.Request;
@ -65,6 +64,7 @@ import net.yacy.repository.Blacklist.BlacklistType;
import net.yacy.repository.FilterEngine;
import net.yacy.search.Switchboard;
import net.yacy.search.index.Segment;
import net.yacy.search.schema.CollectionConfiguration;
public final class CrawlStacker {
@ -75,7 +75,7 @@ public final class CrawlStacker {
private final ConcurrentLog log = new ConcurrentLog("STACKCRAWL");
private final RobotsTxt robots;
private final WorkflowProcessor<Request> requestQueue;
private final CrawlQueues nextQueue;
public final CrawlQueues nextQueue;
private final CrawlSwitchboard crawler;
private final Segment indexSegment;
private final SeedDB peers;
@ -151,7 +151,7 @@ public final class CrawlStacker {
// if the url was rejected we store it into the error URL db
if (rejectReason != null && !rejectReason.startsWith("double in")) {
final CrawlProfile profile = this.crawler.getActive(UTF8.getBytes(entry.profileHandle()));
this.nextQueue.errorURL.push(entry, profile, ASCII.getBytes(this.peers.mySeed().hash), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, rejectReason, -1);
this.nextQueue.errorURL.push(entry.url(), profile, FailCategory.FINAL_LOAD_CONTEXT, rejectReason, -1);
}
} catch (final Exception e) {
CrawlStacker.this.log.warn("Error while processing stackCrawl entry.\n" + "Entry: " + entry.toString() + "Error: " + e.toString(), e);
@ -186,7 +186,7 @@ public final class CrawlStacker {
this.indexSegment.fulltext().remove(urlhash);
byte[] hosthash = new byte[6]; System.arraycopy(urlhash, 6, hosthash, 0, 6);
List<byte[]> hosthashes = new ArrayList<byte[]>(); hosthashes.add(hosthash);
this.nextQueue.errorURL.removeHosts(hosthashes, false);
this.nextQueue.errorURL.removeHosts(hosthashes);
this.nextQueue.removeURL(urlhash);
String u = url.toNormalform(true);
if (u.endsWith("/")) {
@ -198,7 +198,7 @@ public final class CrawlStacker {
final byte[] uh = new DigestURL(u).hash();
this.indexSegment.fulltext().remove(uh);
this.nextQueue.noticeURL.removeByURLHash(uh);
this.nextQueue.errorURL.remove(uh);
this.nextQueue.errorURL.remove(ASCII.String(uh));
} catch (final MalformedURLException e1) {}
}
@ -246,7 +246,7 @@ public final class CrawlStacker {
if (replace) {
CrawlStacker.this.indexSegment.fulltext().remove(urlhash);
cq.noticeURL.removeByURLHash(urlhash);
cq.errorURL.remove(urlhash);
cq.errorURL.remove(ASCII.String(urlhash));
}
// put entry on crawl stack
@ -425,8 +425,8 @@ public final class CrawlStacker {
if (dbocc != null) {
// do double-check
if (dbocc == HarvestProcess.ERRORS) {
final ZURL.Entry errorEntry = this.nextQueue.errorURL.get(url.hash());
return "double in: errors (" + errorEntry.anycause() + ")";
final CollectionConfiguration.FailDoc errorEntry = this.nextQueue.errorURL.get(ASCII.String(url.hash()));
return "double in: errors (" + errorEntry.getFailReason() + ")";
}
return "double in: " + dbocc.toString();
}
@ -441,9 +441,9 @@ public final class CrawlStacker {
return "double in: LURL-DB, oldDate = " + oldDate.toString();
}
if (dbocc == HarvestProcess.ERRORS) {
final ZURL.Entry errorEntry = this.nextQueue.errorURL.get(url.hash());
if (this.log.isInfo()) this.log.info("URL '" + urlstring + "' is double registered in '" + dbocc.toString() + "', previous cause: " + errorEntry.anycause());
return "double in: errors (" + errorEntry.anycause() + "), oldDate = " + oldDate.toString();
final CollectionConfiguration.FailDoc errorEntry = this.nextQueue.errorURL.get(ASCII.String(url.hash()));
if (this.log.isInfo()) this.log.info("URL '" + urlstring + "' is double registered in '" + dbocc.toString() + "', previous cause: " + errorEntry.getFailReason());
return "double in: errors (" + errorEntry.getFailReason() + "), oldDate = " + oldDate.toString();
}
if (this.log.isInfo()) this.log.info("URL '" + urlstring + "' is double registered in '" + dbocc.toString() + "'. ");
return "double in: " + dbocc.toString() + ", oldDate = " + oldDate.toString();

View File

@ -40,17 +40,16 @@ import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.feed.Hit;
import net.yacy.cora.document.feed.RSSFeed;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.solr.FailCategory;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.protocol.ConnectionInfo;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.HarvestProcess;
import net.yacy.crawler.data.NoticedURL.StackType;
import net.yacy.crawler.data.ZURL.FailCategory;
import net.yacy.crawler.retrieval.Request;
import net.yacy.crawler.retrieval.Response;
import net.yacy.crawler.robots.RobotsTxtEntry;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.workflow.WorkflowJob;
import net.yacy.peers.DHTSelection;
import net.yacy.peers.Protocol;
@ -59,19 +58,19 @@ import net.yacy.repository.Blacklist.BlacklistType;
import net.yacy.search.IndexingQueueEntry;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
import net.yacy.search.index.ErrorCache;
import net.yacy.search.schema.CollectionConfiguration;
public class CrawlQueues {
private static final String ERROR_DB_FILENAME = "urlError4.db";
private static final String DELEGATED_DB_FILENAME = "urlDelegated4.db";
private Switchboard sb;
private ConcurrentLog log;
private Map<Integer, Loader> workers; // mapping from url hash to Worker thread object
private final ArrayList<String> remoteCrawlProviderHashes;
public NoticedURL noticeURL;
public ZURL errorURL, delegatedURL;
public ErrorCache errorURL;
public Map<String, DigestURL> delegatedURL;
public CrawlQueues(final Switchboard sb, final File queuePath) {
this.sb = sb;
@ -82,10 +81,8 @@ public class CrawlQueues {
// start crawling management
this.log.config("Starting Crawling Management");
this.noticeURL = new NoticedURL(queuePath, sb.useTailCache, sb.exceed134217727);
FileUtils.deletedelete(new File(queuePath, ERROR_DB_FILENAME));
this.errorURL = new ZURL(sb.index.fulltext(), queuePath, ERROR_DB_FILENAME, false, sb.useTailCache, sb.exceed134217727);
this.delegatedURL = new ZURL(sb.index.fulltext(), queuePath, DELEGATED_DB_FILENAME, true, sb.useTailCache, sb.exceed134217727);
try {this.errorURL.clear();} catch (IOException e) {} // start with empty errors each time
this.errorURL = new ErrorCache(sb.index.fulltext());
this.delegatedURL = new ConcurrentHashMap<String, DigestURL>();
}
public void relocate(final File newQueuePath) {
@ -95,10 +92,8 @@ public class CrawlQueues {
this.remoteCrawlProviderHashes.clear();
this.noticeURL = new NoticedURL(newQueuePath, this.sb.useTailCache, this.sb.exceed134217727);
FileUtils.deletedelete(new File(newQueuePath, ERROR_DB_FILENAME));
this.errorURL = new ZURL(this.sb.index.fulltext(), newQueuePath, ERROR_DB_FILENAME, false, this.sb.useTailCache, this.sb.exceed134217727);
this.delegatedURL = new ZURL(this.sb.index.fulltext(), newQueuePath, DELEGATED_DB_FILENAME, true, this.sb.useTailCache, this.sb.exceed134217727);
try {this.errorURL.clear();} catch (IOException e) {} // start with empty errors each time
this.errorURL = new ErrorCache(this.sb.index.fulltext());
this.delegatedURL = new ConcurrentHashMap<String, DigestURL>();
}
public synchronized void close() {
@ -114,8 +109,7 @@ public class CrawlQueues {
}
}
this.noticeURL.close();
this.errorURL.close();
this.delegatedURL.close();
this.delegatedURL.clear();
}
public void clear() {
@ -130,11 +124,7 @@ public class CrawlQueues {
} catch (final IOException e) {
ConcurrentLog.logException(e);
}
try {
this.delegatedURL.clear();
} catch (final IOException e) {
ConcurrentLog.logException(e);
}
this.delegatedURL.clear();
}
/**
@ -143,7 +133,7 @@ public class CrawlQueues {
* @return if the hash exists, the name of the database is returned, otherwise null is returned
*/
public HarvestProcess exists(final byte[] hash) {
if (this.delegatedURL.exists(hash)) {
if (this.delegatedURL.containsKey(ASCII.String(hash))) {
return HarvestProcess.DELEGATED;
}
if (this.errorURL.exists(hash)) {
@ -164,7 +154,7 @@ public class CrawlQueues {
assert hash != null && hash.length == 12;
this.noticeURL.removeByURLHash(hash);
this.delegatedURL.remove(hash);
this.errorURL.remove(hash);
this.errorURL.remove(ASCII.String(hash));
}
public DigestURL getURL(final byte[] urlhash) {
@ -172,13 +162,13 @@ public class CrawlQueues {
if (urlhash == null || urlhash.length == 0) {
return null;
}
ZURL.Entry ee = this.delegatedURL.get(urlhash);
if (ee != null) {
return ee.url();
DigestURL u = this.delegatedURL.get(ASCII.String(urlhash));
if (u != null) {
return u;
}
ee = this.errorURL.get(urlhash);
CollectionConfiguration.FailDoc ee = this.errorURL.get(ASCII.String(urlhash));
if (ee != null) {
return ee.url();
return ee.getDigestURL();
}
for (final Loader w: this.workers.values()) {
if (Base64Order.enhancedCoder.equal(w.request.url().hash(), urlhash)) {
@ -639,14 +629,7 @@ public class CrawlQueues {
(robotsEntry = CrawlQueues.this.sb.robots.getEntry(this.request.url(), this.profile.getAgent())) != null &&
robotsEntry.isDisallowed(this.request.url())) {
//if (log.isFine()) log.logFine("Crawling of URL '" + request.url().toString() + "' disallowed by robots.txt.");
CrawlQueues.this.errorURL.push(
this.request,
profile,
ASCII.getBytes(CrawlQueues.this.sb.peers.mySeed().hash),
new Date(),
1,
FailCategory.FINAL_ROBOTS_RULE,
"denied by robots.txt", -1);
CrawlQueues.this.errorURL.push(this.request.url(), profile, FailCategory.FINAL_ROBOTS_RULE, "denied by robots.txt", -1);
this.request.setStatus("worker-disallowed", WorkflowJob.STATUS_FINISHED);
} else {
// starting a load from the internet
@ -679,28 +662,14 @@ public class CrawlQueues {
}
if (result != null) {
CrawlQueues.this.errorURL.push(
this.request,
profile,
ASCII.getBytes(CrawlQueues.this.sb.peers.mySeed().hash),
new Date(),
1,
FailCategory.TEMPORARY_NETWORK_FAILURE,
"cannot load: " + result, -1);
CrawlQueues.this.errorURL.push(this.request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "cannot load: " + result, -1);
this.request.setStatus("worker-error", WorkflowJob.STATUS_FINISHED);
} else {
this.request.setStatus("worker-processed", WorkflowJob.STATUS_FINISHED);
}
}
} catch (final Exception e) {
CrawlQueues.this.errorURL.push(
this.request,
profile,
ASCII.getBytes(CrawlQueues.this.sb.peers.mySeed().hash),
new Date(),
1,
FailCategory.TEMPORARY_NETWORK_FAILURE,
e.getMessage() + " - in worker", -1);
CrawlQueues.this.errorURL.push(this.request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, e.getMessage() + " - in worker", -1);
ConcurrentLog.logException(e);
this.request.setStatus("worker-exception", WorkflowJob.STATUS_FINISHED);
} finally {

View File

@ -1,365 +0,0 @@
// plasmaCrawlZURL.java
// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 15.03.2007 on http://www.anomic.de
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.crawler.data;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.Queue;
import java.util.concurrent.LinkedBlockingQueue;
import org.apache.solr.common.SolrInputDocument;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.solr.FailType;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.order.NaturalOrder;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.retrieval.Request;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.index.Index;
import net.yacy.kelondro.index.Row;
import net.yacy.kelondro.table.SplitTable;
import net.yacy.kelondro.table.Table;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.search.index.Fulltext;
public class ZURL implements Iterable<ZURL.Entry> {
private static ConcurrentLog log = new ConcurrentLog("REJECTED");
private static final int EcoFSBufferSize = 2000;
private static final int maxStackSize = 1000;
public enum FailCategory {
// TEMPORARY categories are such failure cases that should be tried again
// FINAL categories are such failure cases that are final and should not be tried again
TEMPORARY_NETWORK_FAILURE(true, FailType.fail), // an entity could not been loaded
FINAL_PROCESS_CONTEXT(false, FailType.excl), // because of a processing context we do not want that url again (i.e. remote crawling)
FINAL_LOAD_CONTEXT(false, FailType.excl), // the crawler configuration does not want to load the entity
FINAL_ROBOTS_RULE(true, FailType.excl), // a remote server denies indexing or loading
FINAL_REDIRECT_RULE(true, FailType.excl); // the remote server redirects this page, thus disallowing reading of content
public final boolean store;
public final FailType failType;
private FailCategory(boolean store, FailType failType) {
this.store = store;
this.failType = failType;
}
}
private final static Row rowdef = new Row(
"String urlhash-" + Word.commonHashLength + ", " + // the url's hash
"String executor-" + Word.commonHashLength + ", " + // the crawling executor
"Cardinal workdate-8 {b256}, " + // the time when the url was last time tried to load
"Cardinal workcount-4 {b256}, " + // number of load retries
"String anycause-132, " + // string describing load failure
"byte[] entry-" + Request.rowdef.objectsize, // extra space
Base64Order.enhancedCoder
);
// the class object
private Index urlIndex;
private final Queue<byte[]> stack;
private final Fulltext fulltext;
protected ZURL(
final Fulltext fulltext,
final File cachePath,
final String tablename,
final boolean startWithEmptyFile,
final boolean useTailCache,
final boolean exceed134217727) {
this.fulltext = fulltext;
// creates a new ZURL in a file
cachePath.mkdirs();
final File f = new File(cachePath, tablename);
if (startWithEmptyFile) {
if (f.exists()) {
if (f.isDirectory()) SplitTable.delete(cachePath, tablename); else FileUtils.deletedelete(f);
}
}
try {
this.urlIndex = new Table(f, rowdef, EcoFSBufferSize, 0, useTailCache, exceed134217727, true);
} catch (final SpaceExceededException e) {
try {
this.urlIndex = new Table(f, rowdef, 0, 0, false, exceed134217727, true);
} catch (final SpaceExceededException e1) {
ConcurrentLog.logException(e1);
}
}
//urlIndex = new kelondroFlexTable(cachePath, tablename, -1, rowdef, 0, true);
this.stack = new LinkedBlockingQueue<byte[]>();
}
protected void clear() throws IOException {
if (this.urlIndex != null) this.urlIndex.clear();
if (this.stack != null) this.stack.clear();
}
protected void close() {
try {clear();} catch (final IOException e) {}
if (this.urlIndex != null) this.urlIndex.close();
}
public boolean remove(final byte[] hash) {
if (hash == null) return false;
//System.out.println("*** DEBUG ZURL " + this.urlIndex.filename() + " remove " + hash);
try {
Iterator<byte[]> i = ZURL.this.stack.iterator();
while (i.hasNext()) {
byte[] b = i.next();
if (NaturalOrder.naturalOrder.equal(hash, b)) i.remove();
}
return this.urlIndex.delete(hash);
} catch (final IOException e) {
return false;
}
}
public void removeHosts(final Iterable<byte[]> hosthashes, final boolean concurrent) {
if (hosthashes == null) return;
Thread t = new Thread() {
public void run() {
try {
Iterator<byte[]> i = ZURL.this.urlIndex.keys(true, null);
List<byte[]> r = new ArrayList<byte[]>();
while (i.hasNext()) {
byte[] b = i.next();
for (byte[] hosthash: hosthashes) {
if (NaturalOrder.naturalOrder.equal(hosthash, 0, b, 6, 6)) r.add(b);
}
}
for (byte[] b: r) ZURL.this.urlIndex.remove(b);
i = ZURL.this.stack.iterator();
while (i.hasNext()) {
byte[] b = i.next();
for (byte[] hosthash: hosthashes) {
if (NaturalOrder.naturalOrder.equal(hosthash, 0, b, 6, 6)) i.remove();
}
}
} catch (final IOException e) {}
}
};
if (concurrent) t.start(); else t.run();
}
public void push(
final Request bentry,
final CrawlProfile profile,
final byte[] executor,
final Date workdate,
final int workcount,
final FailCategory failCategory,
String anycause,
final int httpcode) {
// assert executor != null; // null == proxy !
assert failCategory.store || httpcode == -1 : "failCategory=" + failCategory.name();
if (exists(bentry.url().hash())) return; // don't insert double causes
if (anycause == null) anycause = "unknown";
final String reason = anycause + ((httpcode >= 0) ? " (http return code = " + httpcode + ")" : "");
final Entry entry = new Entry(bentry, executor, workdate, workcount, reason);
put(entry);
this.stack.add(entry.hash());
if (!reason.startsWith("double")) log.info(bentry.url().toNormalform(true) + " - " + reason);
if (this.fulltext.getDefaultConnector() != null && failCategory.store) {
// send the error to solr
try {
SolrInputDocument errorDoc = this.fulltext.getDefaultConfiguration().err(bentry.url(), profile == null ? null : profile.collections(), failCategory.name() + " " + reason, failCategory.failType, httpcode);
this.fulltext.getDefaultConnector().add(errorDoc);
} catch (final IOException e) {
ConcurrentLog.warn("SOLR", "failed to send error " + bentry.url().toNormalform(true) + " to solr: " + e.getMessage());
}
}
while (this.stack.size() > maxStackSize) this.stack.poll();
}
@Override
public Iterator<ZURL.Entry> iterator() {
return new EntryIterator();
}
public ArrayList<ZURL.Entry> list(int max) {
final ArrayList<ZURL.Entry> l = new ArrayList<ZURL.Entry>();
DigestURL url;
for (final ZURL.Entry entry: this) {
if (entry == null) continue;
url = entry.url();
if (url == null) continue;
l.add(entry);
if (max-- <= 0) l.remove(0);
}
return l;
}
private class EntryIterator implements Iterator<ZURL.Entry> {
private final Iterator<byte[]> hi;
public EntryIterator() {
this.hi = ZURL.this.stack.iterator();
}
@Override
public boolean hasNext() {
return this.hi.hasNext();
}
@Override
public ZURL.Entry next() {
return get(this.hi.next());
}
@Override
public void remove() {
this.hi.remove();
}
}
public ZURL.Entry get(final byte[] urlhash) {
try {
if (this.urlIndex == null) return null;
// System.out.println("*** DEBUG ZURL " + this.urlIndex.filename() + " get " + urlhash);
final Row.Entry entry = this.urlIndex.get(urlhash, false);
if (entry == null) return null;
return new Entry(entry);
} catch (final IOException e) {
ConcurrentLog.logException(e);
return null;
}
}
/**
* private put (use push instead)
* @param entry
*/
private void put(final Entry entry) {
// stores the values from the object variables into the database
if (entry.stored) return;
if (entry.bentry == null) return;
final Row.Entry newrow = rowdef.newEntry();
newrow.setCol(0, entry.bentry.url().hash());
newrow.setCol(1, entry.executor);
newrow.setCol(2, entry.workdate.getTime());
newrow.setCol(3, entry.workcount);
newrow.setCol(4, UTF8.getBytes(entry.anycause));
newrow.setCol(5, entry.bentry.toRow().bytes());
try {
if (this.urlIndex != null) this.urlIndex.put(newrow);
entry.stored = true;
} catch (final Exception e) {
ConcurrentLog.logException(e);
}
}
boolean exists(final byte[] urlHash) {
return this.urlIndex.has(urlHash);
}
public void clearStack() {
this.stack.clear();
}
public int stackSize() {
return this.stack.size();
}
public class Entry {
private Request bentry; // the balancer entry
private final byte[] executor; // the crawling executor
private final Date workdate; // the time when the url was last time tried to load
private final int workcount; // number of tryings
private final String anycause; // string describing reason for load fail
private boolean stored;
private Entry(
final Request bentry,
final byte[] executor,
final Date workdate,
final int workcount,
final String anycause) {
// create new entry
assert bentry != null;
// assert executor != null; // null == proxy !
this.bentry = bentry;
this.executor = executor;
this.workdate = (workdate == null) ? new Date() : workdate;
this.workcount = workcount;
this.anycause = (anycause == null) ? "" : anycause;
this.stored = false;
}
private Entry(final Row.Entry entry) throws IOException {
assert (entry != null);
this.executor = entry.getColBytes(1, true);
this.workdate = new Date(entry.getColLong(2));
this.workcount = (int) entry.getColLong(3);
this.anycause = entry.getColUTF8(4);
this.bentry = new Request(Request.rowdef.newEntry(entry.getColBytes(5, false)));
assert (Base64Order.enhancedCoder.equal(entry.getPrimaryKeyBytes(), this.bentry.url().hash()));
this.stored = true;
return;
}
public DigestURL url() {
return this.bentry.url();
}
public byte[] initiator() {
return this.bentry.initiator();
}
private byte[] hash() {
// return a url-hash, based on the md5 algorithm
// the result is a String of 12 bytes within a 72-bit space
// (each byte has an 6-bit range)
// that should be enough for all web pages on the world
return this.bentry.url().hash();
}
public Date workdate() {
return this.workdate;
}
public byte[] executor() {
// return the creator's hash
return this.executor;
}
public String anycause() {
return this.anycause;
}
}
}

View File

@ -36,6 +36,7 @@ import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.solr.FailCategory;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.ResponseHeader;
@ -43,7 +44,6 @@ import net.yacy.cora.protocol.ftp.FTPClient;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.Latency;
import net.yacy.crawler.data.ZURL.FailCategory;
import net.yacy.document.TextParser;
import net.yacy.search.Switchboard;
@ -156,7 +156,7 @@ public class FTPLoader {
if (berr.size() > 0 || response == null) {
// some error logging
final String detail = (berr.size() > 0) ? "Errorlog: " + berr.toString() : "";
this.sb.crawlQueues.errorURL.push(request, profile, ASCII.getBytes(this.sb.peers.mySeed().hash), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, " ftp server download, " + detail, -1);
this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, " ftp server download, " + detail, -1);
throw new IOException("FTPLoader: Unable to download URL '" + request.url().toString() + "': " + detail);
}

View File

@ -25,10 +25,9 @@
package net.yacy.crawler.retrieval;
import java.io.IOException;
import java.util.Date;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.solr.FailCategory;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
@ -37,7 +36,6 @@ import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.Latency;
import net.yacy.crawler.data.ZURL.FailCategory;
import net.yacy.kelondro.io.ByteCount;
import net.yacy.repository.Blacklist.BlacklistType;
import net.yacy.search.Switchboard;
@ -79,10 +77,8 @@ public final class HTTPLoader {
private Response load(final Request request, CrawlProfile profile, final int retryCount, final int maxFileSize, final BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
byte[] myHash = ASCII.getBytes(this.sb.peers.mySeed().hash);
if (retryCount < 0) {
this.sb.crawlQueues.errorURL.push(request, profile, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "retry counter exceeded", -1);
this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "retry counter exceeded", -1);
throw new IOException("retry counter exceeded for URL " + request.url().toString() + ". Processing aborted.");
}
@ -98,7 +94,7 @@ public final class HTTPLoader {
// check if url is in blacklist
final String hostlow = host.toLowerCase();
if (blacklistType != null && Switchboard.urlBlacklist.isListed(blacklistType, hostlow, path)) {
this.sb.crawlQueues.errorURL.push(request, profile, myHash, new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.");
}
@ -145,7 +141,7 @@ public final class HTTPLoader {
redirectionUrlString = redirectionUrlString == null ? "" : redirectionUrlString.trim();
if (redirectionUrlString.isEmpty()) {
this.sb.crawlQueues.errorURL.push(request, profile, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "no redirection url provided, field '" + HeaderFramework.LOCATION + "' is empty", statusCode);
this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "no redirection url provided, field '" + HeaderFramework.LOCATION + "' is empty", statusCode);
throw new IOException("REJECTED EMTPY REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString);
}
@ -159,13 +155,13 @@ public final class HTTPLoader {
this.sb.webStructure.generateCitationReference(url, redirectionUrl);
if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_RECORD_REDIRECTS, true)) {
this.sb.crawlQueues.errorURL.push(request, profile, myHash, new Date(), 1, FailCategory.FINAL_REDIRECT_RULE, "redirect to " + redirectionUrlString, statusCode);
this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.FINAL_REDIRECT_RULE, "redirect to " + redirectionUrlString, statusCode);
}
if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true)) {
// if we are already doing a shutdown we don't need to retry crawling
if (Thread.currentThread().isInterrupted()) {
this.sb.crawlQueues.errorURL.push(request, profile, myHash, new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "server shutdown", statusCode);
this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.FINAL_LOAD_CONTEXT, "server shutdown", statusCode);
throw new IOException("CRAWLER Retry of URL=" + requestURLString + " aborted because of server shutdown.");
}
@ -174,11 +170,11 @@ public final class HTTPLoader {
return load(request, profile, retryCount - 1, maxFileSize, blacklistType, agent);
}
// we don't want to follow redirects
this.sb.crawlQueues.errorURL.push(request, profile, myHash, new Date(), 1, FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode);
this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode);
throw new IOException("REJECTED UNWANTED REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString);
} else if (responseBody == null) {
// no response, reject file
this.sb.crawlQueues.errorURL.push(request, profile, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "no response body", statusCode);
this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "no response body", statusCode);
throw new IOException("REJECTED EMPTY RESPONSE BODY '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString);
} else if (statusCode == 200 || statusCode == 203) {
// the transfer is ok
@ -189,7 +185,7 @@ public final class HTTPLoader {
// check length again in case it was not possible to get the length before loading
if (maxFileSize >= 0 && contentLength > maxFileSize) {
this.sb.crawlQueues.errorURL.push(request, profile, myHash, new Date(), 1, FailCategory.FINAL_PROCESS_CONTEXT, "file size limit exceeded", statusCode);
this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "file size limit exceeded", statusCode);
throw new IOException("REJECTED URL " + request.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes. (GET)");
}
@ -206,7 +202,7 @@ public final class HTTPLoader {
return response;
} else {
// if the response has not the right response type then reject file
this.sb.crawlQueues.errorURL.push(request, profile, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", statusCode);
this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", statusCode);
throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString);
}
}

View File

@ -42,6 +42,7 @@ import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.solr.FailCategory;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.HeaderFramework;
@ -50,7 +51,6 @@ import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.data.Cache;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.ZURL.FailCategory;
import net.yacy.crawler.retrieval.FTPLoader;
import net.yacy.crawler.retrieval.FileLoader;
import net.yacy.crawler.retrieval.HTTPLoader;
@ -191,7 +191,7 @@ public final class LoaderDispatcher {
// check if url is in blacklist
if (blacklistType != null && host != null && Switchboard.urlBlacklist.isListed(blacklistType, host.toLowerCase(), url.getFile())) {
this.sb.crawlQueues.errorURL.push(request, crawlProfile, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
this.sb.crawlQueues.errorURL.push(request.url(), crawlProfile, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
throw new IOException("DISPATCHER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.");
}

View File

@ -97,6 +97,7 @@ import net.yacy.cora.document.feed.RSSReader;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.solr.FailCategory;
import net.yacy.cora.federate.solr.Ranking;
import net.yacy.cora.federate.solr.SchemaConfiguration;
import net.yacy.cora.federate.solr.instance.RemoteInstance;
@ -127,7 +128,6 @@ import net.yacy.crawler.data.ResultImages;
import net.yacy.crawler.data.ResultURLs;
import net.yacy.crawler.data.NoticedURL.StackType;
import net.yacy.crawler.data.ResultURLs.EventOrigin;
import net.yacy.crawler.data.ZURL.FailCategory;
import net.yacy.crawler.retrieval.Request;
import net.yacy.crawler.retrieval.Response;
import net.yacy.crawler.robots.RobotsTxt;
@ -1789,16 +1789,9 @@ public final class Switchboard extends serverSwitch {
// in the noIndexReason is set, indexing is not allowed
if ( noIndexReason != null ) {
// log cause and close queue
final DigestURL referrerURL = response.referrerURL();
//if (log.isFine()) log.logFine("deQueue: not indexed any word in URL " + response.url() + "; cause: " + noIndexReason);
addURLtoErrorDB(
response.url(),
response.profile(),
(referrerURL == null) ? null : referrerURL.hash(),
response.initiator(),
response.name(),
FailCategory.FINAL_PROCESS_CONTEXT,
noIndexReason);
// create a new errorURL DB entry
this.crawlQueues.errorURL.push(response.url(), response.profile(), FailCategory.FINAL_PROCESS_CONTEXT, noIndexReason, -1);
// finish this entry
return "not allowed: " + noIndexReason;
}
@ -1991,7 +1984,7 @@ public final class Switchboard extends serverSwitch {
public int cleanupJobSize() {
int c = 1; // "es gibt immer was zu tun"
if ( (this.crawlQueues.delegatedURL.stackSize() > 1000) ) {
if ( (this.crawlQueues.delegatedURL.size() > 1000) ) {
c++;
}
if ( (this.crawlQueues.errorURL.stackSize() > 1000) ) {
@ -2101,13 +2094,13 @@ public final class Switchboard extends serverSwitch {
// clean up delegated stack
checkInterruption();
if ( (this.crawlQueues.delegatedURL.stackSize() > 1000) ) {
if ( (this.crawlQueues.delegatedURL.size() > 1000) ) {
if ( this.log.isFine() ) {
this.log.fine("Cleaning Delegated-URLs report stack, "
+ this.crawlQueues.delegatedURL.stackSize()
+ this.crawlQueues.delegatedURL.size()
+ " entries on stack");
}
this.crawlQueues.delegatedURL.clearStack();
this.crawlQueues.delegatedURL.clear();
}
// clean up error stack
@ -2428,7 +2421,6 @@ public final class Switchboard extends serverSwitch {
public IndexingQueueEntry parseDocument(final IndexingQueueEntry in) {
in.queueEntry.updateStatus(Response.QUEUE_STATE_PARSING);
Document[] documents = null;
try {
documents = parseDocument(in.queueEntry);
@ -2439,7 +2431,7 @@ public final class Switchboard extends serverSwitch {
}
if ( documents == null ) {
return null;
}
}
return new IndexingQueueEntry(in.queueEntry, documents, null);
}
@ -2465,14 +2457,8 @@ public final class Switchboard extends serverSwitch {
response.setContent(Cache.getContent(response.url().hash()));
if ( response.getContent() == null ) {
this.log.warn("the resource '" + response.url() + "' is missing in the cache.");
addURLtoErrorDB(
response.url(),
response.profile(),
response.referrerHash(),
response.initiator(),
response.name(),
FailCategory.FINAL_LOAD_CONTEXT,
"missing in cache");
// create a new errorURL DB entry
this.crawlQueues.errorURL.push(response.url(), response.profile(), FailCategory.FINAL_LOAD_CONTEXT, "missing in cache", -1);
return null;
}
}
@ -2490,20 +2476,37 @@ public final class Switchboard extends serverSwitch {
}
} catch (final Parser.Failure e ) {
this.log.warn("Unable to parse the resource '" + response.url() + "'. " + e.getMessage());
addURLtoErrorDB(
response.url(),
response.profile(),
response.referrerHash(),
response.initiator(),
response.name(),
FailCategory.FINAL_PROCESS_CONTEXT,
e.getMessage());
// create a new errorURL DB entry
this.crawlQueues.errorURL.push(response.url(), response.profile(), FailCategory.FINAL_PROCESS_CONTEXT, e.getMessage(), -1);
return null;
}
final long parsingEndTime = System.currentTimeMillis();
// put anchors on crawl stack
final long stackStartTime = System.currentTimeMillis();
// check if the documents have valid urls; this is not a bug patch; it is possible that
// i.e. the result of a feed parsing results in documents from domains which shall be filtered by the crawl profile
if (response.profile() != null) {
ArrayList<Document> newDocs = new ArrayList<Document>();
for (Document doc: documents) {
String rejectReason = this.crawlStacker.checkAcceptance(doc.dc_source(), response.profile(), 1 /*depth is irrelevant here, we just make clear its not the start url*/);
if (rejectReason == null) {
newDocs.add(doc);
} else {
// we consider this as fail urls to have a tracking of the problem
if (rejectReason != null && !rejectReason.startsWith("double in")) {
final CrawlProfile profile = this.crawler.getActive(UTF8.getBytes(response.profile().handle()));
this.crawlStacker.nextQueue.errorURL.push(response.url(), profile, FailCategory.FINAL_LOAD_CONTEXT, rejectReason, -1);
}
}
}
if (newDocs.size() != documents.length) {
documents = (Document[]) newDocs.toArray();
}
}
// collect anchors within remaining documents
if ((processCase == EventOrigin.PROXY_LOAD || processCase == EventOrigin.LOCAL_CRAWLING) &&
(
response.profile() == null ||
@ -2592,14 +2595,8 @@ public final class Switchboard extends serverSwitch {
if (!(profile.indexUrlMustMatchPattern() == CrawlProfile.MATCH_ALL_PATTERN || profile.indexUrlMustMatchPattern().matcher(urls).matches()) ||
(profile.indexUrlMustNotMatchPattern() != CrawlProfile.MATCH_NEVER_PATTERN && profile.indexUrlMustNotMatchPattern().matcher(urls).matches())) {
if (this.log.isInfo()) this.log.info("Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + profile.indexUrlMustMatchPattern().pattern() + ", indexUrlMustNotMatchPattern = " + profile.indexUrlMustNotMatchPattern().pattern());
addURLtoErrorDB(
in.queueEntry.url(),
profile,
in.queueEntry.referrerHash(),
in.queueEntry.initiator(),
in.queueEntry.name(),
FailCategory.FINAL_PROCESS_CONTEXT,
"indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + profile.indexUrlMustMatchPattern().pattern() + ", indexUrlMustNotMatchPattern = " + profile.indexUrlMustNotMatchPattern().pattern());
// create a new errorURL DB entry
this.crawlQueues.errorURL.push(in.queueEntry.url(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + profile.indexUrlMustMatchPattern().pattern() + ", indexUrlMustNotMatchPattern = " + profile.indexUrlMustNotMatchPattern().pattern(), -1);
return new IndexingQueueEntry(in.queueEntry, in.documents, null);
}
@ -2608,27 +2605,15 @@ public final class Switchboard extends serverSwitch {
docloop: for (final Document document : in.documents) {
if (document.indexingDenied() && profile.obeyHtmlRobotsNoindex()) {
if (this.log.isInfo()) this.log.info("Not Condensed Resource '" + urls + "': denied by document-attached noindexing rule");
addURLtoErrorDB(
in.queueEntry.url(),
profile,
in.queueEntry.referrerHash(),
in.queueEntry.initiator(),
in.queueEntry.name(),
FailCategory.FINAL_PROCESS_CONTEXT,
"denied by document-attached noindexing rule");
// create a new errorURL DB entry
this.crawlQueues.errorURL.push(in.queueEntry.url(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "denied by document-attached noindexing rule", -1);
continue docloop;
}
if (!(profile.indexContentMustMatchPattern() == CrawlProfile.MATCH_ALL_PATTERN || profile.indexContentMustMatchPattern().matcher(document.getTextString()).matches()) ||
(profile.indexContentMustNotMatchPattern() != CrawlProfile.MATCH_NEVER_PATTERN && profile.indexContentMustNotMatchPattern().matcher(document.getTextString()).matches())) {
if (this.log.isInfo()) this.log.info("Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustMatchPattern = " + profile.indexContentMustMatchPattern().pattern() + ", indexContentMustNotMatchPattern = " + profile.indexContentMustNotMatchPattern().pattern());
addURLtoErrorDB(
in.queueEntry.url(),
profile,
in.queueEntry.referrerHash(),
in.queueEntry.initiator(),
in.queueEntry.name(),
FailCategory.FINAL_PROCESS_CONTEXT,
"indexing prevented by regular expression on content; indexContentMustMatchPattern = " + profile.indexContentMustMatchPattern().pattern() + ", indexContentMustNotMatchPattern = " + profile.indexContentMustNotMatchPattern().pattern());
// create a new errorURL DB entry
this.crawlQueues.errorURL.push(in.queueEntry.url(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "indexing prevented by regular expression on content; indexContentMustMatchPattern = " + profile.indexContentMustMatchPattern().pattern() + ", indexContentMustNotMatchPattern = " + profile.indexContentMustNotMatchPattern().pattern(), -1);
continue docloop;
}
doclist.add(document);
@ -2705,30 +2690,18 @@ public final class Switchboard extends serverSwitch {
if (condenser == null || (document.indexingDenied() && profile.obeyHtmlRobotsNoindex())) {
//if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by rule in document, process case=" + processCase);
addURLtoErrorDB(
url,
profile,
(referrerURL == null) ? null : referrerURL.hash(),
queueEntry.initiator(),
dc_title,
FailCategory.FINAL_PROCESS_CONTEXT,
"denied by rule in document, process case=" + processCase);
// create a new errorURL DB entry
this.crawlQueues.errorURL.push(url, profile, FailCategory.FINAL_PROCESS_CONTEXT, "denied by rule in document, process case=" + processCase, -1);
return;
}
if ( profile != null && !profile.indexText() && !profile.indexMedia() ) {
//if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by profile rule, process case=" + processCase + ", profile name = " + queueEntry.profile().name());
addURLtoErrorDB(
url,
profile,
(referrerURL == null) ? null : referrerURL.hash(),
queueEntry.initiator(),
dc_title,
FailCategory.FINAL_LOAD_CONTEXT,
"denied by profile rule, process case="
+ processCase
+ ", profile name = "
+ profile.collectionName());
// create a new errorURL DB entry
this.crawlQueues.errorURL.push(url, profile, FailCategory.FINAL_LOAD_CONTEXT, "denied by profile rule, process case="
+ processCase
+ ", profile name = "
+ profile.collectionName(), -1);
return;
}
@ -2906,7 +2879,7 @@ public final class Switchboard extends serverSwitch {
// remove the document from the error-db
byte[] hosthash = new byte[6]; System.arraycopy(urlhash, 6, hosthash, 0, 6);
List<byte[]> hosthashes = new ArrayList<byte[]>(); hosthashes.add(hosthash);
this.crawlQueues.errorURL.removeHosts(hosthashes, false);
this.crawlQueues.errorURL.removeHosts(hosthashes);
this.crawlQueues.removeURL(urlhash);
// get a scraper to get the title
@ -3373,31 +3346,6 @@ public final class Switchboard extends serverSwitch {
return hasDoneSomething;
}
private void addURLtoErrorDB(
final DigestURL url,
final CrawlProfile profile,
final byte[] referrerHash,
final byte[] initiator,
final String name,
final FailCategory failCategory,
final String failreason) {
// assert initiator != null; // null == proxy
// create a new errorURL DB entry
final Request bentry =
new Request(
initiator,
url,
referrerHash,
(name == null) ? "" : name,
new Date(),
null,
0,
0,
0,
0);
this.crawlQueues.errorURL.push(bentry, profile, initiator, new Date(), 0, failCategory, failreason, -1);
}
public final void heuristicSite(final SearchEvent searchEvent, final String host) {
new Thread() {
@Override

View File

@ -0,0 +1,173 @@
/**
* ErrorCache
* Copyright 2013 by Michael Peter Christen
* First released 17.10.2013 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.search.index;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedHashMap;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrQuery.SortClause;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrInputDocument;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.solr.FailCategory;
import net.yacy.cora.order.NaturalOrder;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.search.index.Fulltext;
import net.yacy.search.schema.CollectionConfiguration;
import net.yacy.search.schema.CollectionSchema;
public class ErrorCache {
private static ConcurrentLog log = new ConcurrentLog("REJECTED");
private static final int maxStackSize = 1000;
// the class object
private final LinkedHashMap<String, CollectionConfiguration.FailDoc> stack;
private final Fulltext fulltext;
public ErrorCache(final Fulltext fulltext) {
this.fulltext = fulltext;
this.stack = new LinkedHashMap<String, CollectionConfiguration.FailDoc>();
try {
// fill stack with latest values
final SolrQuery params = new SolrQuery();
params.setParam("defType", "edismax");
params.setStart(0);
params.setRows(100);
params.setFacet(false);
params.setSort(new SortClause(CollectionSchema.last_modified.getSolrFieldName(), SolrQuery.ORDER.desc));
params.setFacet(false);
params.setQuery(CollectionSchema.failreason_s.getSolrFieldName() + ":[* TO *]");
QueryResponse rsp = fulltext.getDefaultConnector().getResponseByParams(params);
SolrDocumentList docList = rsp == null ? null : rsp.getResults();
if (docList != null) for (int i = docList.size() - 1; i >= 0; i--) {
CollectionConfiguration.FailDoc failDoc = new CollectionConfiguration.FailDoc(docList.get(i));
this.stack.put(ASCII.String(failDoc.getDigestURL().hash()), failDoc);
}
} catch (final Throwable e) {
}
}
public void clear() throws IOException {
if (this.stack != null) this.stack.clear();
this.fulltext.getDefaultConnector().deleteByQuery(CollectionSchema.failreason_s.getSolrFieldName() + ":[* TO *]");
}
public void remove(final String hash) {
if (hash == null) return;
this.stack.remove(hash);
try {
this.fulltext.getDefaultConnector().deleteByQuery(CollectionSchema.id.getSolrFieldName() + ":\"" + hash + "\" AND " + CollectionSchema.failreason_s.getSolrFieldName() + ":[* TO *]");
} catch (final IOException e) {
return;
}
}
public void removeHosts(final Iterable<byte[]> hosthashes) {
if (hosthashes == null) return;
try {
for (byte[] hosthash : hosthashes) {
this.fulltext.getDefaultConnector().deleteByQuery(CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + ASCII.String(hosthash) + "\" AND " + CollectionSchema.failreason_s.getSolrFieldName() + ":[* TO *]");
}
Iterator<String> i = ErrorCache.this.stack.keySet().iterator();
while (i.hasNext()) {
String b = i.next();
for (byte[] hosthash : hosthashes) {
if (NaturalOrder.naturalOrder.equal(hosthash, 0, ASCII.getBytes(b), 6, 6)) i.remove();
}
}
} catch (final IOException e) {
// a failed Solr delete is not fatal here; the affected entries simply remain until a later delete succeeds
}
}
public void push(final DigestURL url, final CrawlProfile profile, final FailCategory failCategory, String anycause, final int httpcode) {
assert failCategory.store || httpcode == -1 : "failCategory=" + failCategory.name();
if (exists(url.hash()))
return; // don't insert double causes
if (anycause == null) anycause = "unknown";
final String reason = anycause + ((httpcode >= 0) ? " (http return code = " + httpcode + ")" : "");
if (!reason.startsWith("double")) log.info(url.toNormalform(true) + " - " + reason);
CollectionConfiguration.FailDoc failDoc = new CollectionConfiguration.FailDoc(
url, profile == null ? null : profile.collections(),
failCategory.name() + " " + reason, failCategory.failType,
httpcode);
this.stack.put(ASCII.String(url.hash()), failDoc);
if (this.fulltext.getDefaultConnector() != null && failCategory.store) {
// send the error to solr
try {
SolrInputDocument errorDoc = failDoc.toSolr(this.fulltext.getDefaultConfiguration());
this.fulltext.getDefaultConnector().add(errorDoc);
} catch (final IOException e) {
ConcurrentLog.warn("SOLR", "failed to send error " + url.toNormalform(true) + " to solr: " + e.getMessage());
}
}
// trim the in-memory stack to maxStackSize, dropping the oldest entry first
while (this.stack.size() > maxStackSize)
this.stack.remove(this.stack.keySet().iterator().next());
}
public ArrayList<CollectionConfiguration.FailDoc> list(int max) {
final ArrayList<CollectionConfiguration.FailDoc> l = new ArrayList<CollectionConfiguration.FailDoc>();
Iterator<CollectionConfiguration.FailDoc> fdi = this.stack.values().iterator();
for (int i = 0; i < this.stack.size() - max; i++) fdi.next();
while (fdi.hasNext()) l.add(fdi.next());
return l;
}
public CollectionConfiguration.FailDoc get(final String urlhash) {
CollectionConfiguration.FailDoc fd = this.stack.get(urlhash);
if (fd != null) return fd;
try {
SolrDocument doc = this.fulltext.getDefaultConnector().getDocumentById(urlhash);
if (doc == null) return null;
return new CollectionConfiguration.FailDoc(doc);
} catch (final IOException e) {
ConcurrentLog.logException(e);
return null;
}
}
public boolean exists(final byte[] urlHash) {
try {
return this.fulltext.getDefaultConnector().existsByQuery(CollectionSchema.id.getSolrFieldName() + ":\"" + ASCII.String(urlHash) + "\" AND " + CollectionSchema.failreason_s.getSolrFieldName() + ":[* TO *]");
} catch (IOException e) {
return false;
}
}
public void clearStack() {
this.stack.clear();
}
public int stackSize() {
return this.stack.size();
}
}
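
A minimal usage sketch of the ErrorCache above, not part of the patch: the class name ErrorCacheSketch, the example URL and the way the Fulltext instance is obtained are assumptions for illustration; only the ErrorCache API itself is taken from the code above.

import java.net.MalformedURLException;

import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.solr.FailCategory;
import net.yacy.search.index.ErrorCache;
import net.yacy.search.index.Fulltext;

public class ErrorCacheSketch {
    public static void demo(final Fulltext fulltext) throws MalformedURLException {
        // the constructor warms the in-memory stack with the latest fail documents found in Solr
        final ErrorCache errorCache = new ErrorCache(fulltext);
        final DigestURL url = new DigestURL("http://example.org/blocked");
        // record a failure; httpcode -1 means "no HTTP status code available"
        errorCache.push(url, null, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
        // the entry is kept in the in-memory stack and, for stored fail categories, also sent to Solr
        System.out.println(errorCache.get(ASCII.String(url.hash())).getFailReason());
    }
}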

View File

@ -80,6 +80,7 @@ import net.yacy.kelondro.util.Bitfield;
import net.yacy.search.index.Segment;
import net.yacy.search.index.Segment.ReferenceReport;
import net.yacy.search.index.Segment.ReferenceReportCache;
import net.yacy.search.query.QueryParams;
import net.yacy.search.schema.WebgraphConfiguration.Subgraph;
import org.apache.solr.common.SolrDocument;
@ -1195,34 +1196,73 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
return il;
}
*/
/**
* register an entry as error document
* @param digestURI
* @param failReason
* @param httpstatus
* @throws IOException
*/
public SolrInputDocument err(final DigestURL digestURI, final Map<String, Pattern> collections, final String failReason, final FailType failType, final int httpstatus) throws IOException {
boolean allAttr = this.isEmpty();
assert allAttr || contains(CollectionSchema.failreason_s);
final SolrInputDocument doc = new SolrInputDocument();
String url = addURIAttributes(doc, allAttr, digestURI, Response.docType(digestURI));
if (allAttr || contains(CollectionSchema.load_date_dt)) add(doc, CollectionSchema.load_date_dt, new Date());
// fail reason and status
if (allAttr || contains(CollectionSchema.failreason_s)) add(doc, CollectionSchema.failreason_s, failReason);
if (allAttr || contains(CollectionSchema.failtype_s)) add(doc, CollectionSchema.failtype_s, failType.name());
if (allAttr || contains(CollectionSchema.httpstatus_i)) add(doc, CollectionSchema.httpstatus_i, httpstatus);
if (allAttr || contains(CollectionSchema.collection_sxt) && collections != null && collections.size() > 0) {
List<String> cs = new ArrayList<String>();
for (Map.Entry<String, Pattern> e: collections.entrySet()) {
if (e.getValue().matcher(url).matches()) cs.add(e.getKey());
}
add(doc, CollectionSchema.collection_sxt, cs);
}
return doc;
}
public static class FailDoc {
DigestURL digestURL;
final Map<String, Pattern> collections;
final String failReason;
final FailType failType;
final int httpstatus;
final Date failtime;
public FailDoc(final DigestURL digestURL, final Map<String, Pattern> collections, final String failReason, final FailType failType, final int httpstatus) {
this.digestURL = digestURL;
this.collections = collections;
this.failReason = failReason;
this.failType = failType;
this.httpstatus = httpstatus;
this.failtime = new Date();
}
public FailDoc(final SolrDocument doc) {
try {
this.digestURL = new DigestURL((String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()));
} catch (MalformedURLException e) {
this.digestURL = null;
}
this.collections = new HashMap<String, Pattern>();
Collection<Object> c = doc.getFieldValues(CollectionSchema.collection_sxt.getSolrFieldName());
if (c != null) for (Object cn: c) this.collections.put((String) cn, QueryParams.catchall_pattern);
this.failReason = (String) doc.getFieldValue(CollectionSchema.failreason_s.getSolrFieldName());
this.failType = FailType.valueOf((String) doc.getFieldValue(CollectionSchema.failtype_s.getSolrFieldName()));
Integer hs = (Integer) doc.getFieldValue(CollectionSchema.httpstatus_i.getSolrFieldName());
this.httpstatus = hs == null ? -1 : hs.intValue();
this.failtime = (Date) doc.getFieldValue(CollectionSchema.load_date_dt.getSolrFieldName());
}
public DigestURL getDigestURL() {
return digestURL;
}
public Map<String, Pattern> getCollections() {
return collections;
}
public String getFailReason() {
return failReason;
}
public FailType getFailType() {
return failType;
}
public int getHttpstatus() {
return httpstatus;
}
public SolrInputDocument toSolr(CollectionConfiguration configuration) {
boolean allAttr = configuration.isEmpty();
assert allAttr || configuration.contains(CollectionSchema.failreason_s);
final SolrInputDocument doc = new SolrInputDocument();
String url = configuration.addURIAttributes(doc, allAttr, this.getDigestURL(), Response.docType(this.getDigestURL()));
if (allAttr || configuration.contains(CollectionSchema.load_date_dt)) configuration.add(doc, CollectionSchema.load_date_dt, new Date());
// fail reason and status
if (allAttr || configuration.contains(CollectionSchema.failreason_s)) configuration.add(doc, CollectionSchema.failreason_s, this.getFailReason());
if (allAttr || configuration.contains(CollectionSchema.failtype_s)) configuration.add(doc, CollectionSchema.failtype_s, this.getFailType().name());
if (allAttr || configuration.contains(CollectionSchema.httpstatus_i)) configuration.add(doc, CollectionSchema.httpstatus_i, this.getHttpstatus());
if ((allAttr || configuration.contains(CollectionSchema.collection_sxt)) && this.getCollections() != null && this.getCollections().size() > 0) {
List<String> cs = new ArrayList<String>();
for (Map.Entry<String, Pattern> e: this.getCollections().entrySet()) {
if (e.getValue().matcher(url).matches()) cs.add(e.getKey());
}
configuration.add(doc, CollectionSchema.collection_sxt, cs);
}
return doc;
}
}
}
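
A short sketch, not part of the patch, of how the FailDoc container above is turned into a Solr error document; this mirrors the path ErrorCache.push() takes for stored fail categories. The class name FailDocSketch, the example URL and the CollectionConfiguration instance passed in are assumptions for illustration.

import java.net.MalformedURLException;

import org.apache.solr.common.SolrInputDocument;

import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.solr.FailCategory;
import net.yacy.search.schema.CollectionConfiguration;

public class FailDocSketch {
    public static SolrInputDocument demo(final CollectionConfiguration config) throws MalformedURLException {
        final FailCategory category = FailCategory.FINAL_LOAD_CONTEXT;
        final CollectionConfiguration.FailDoc failDoc = new CollectionConfiguration.FailDoc(
                new DigestURL("http://example.org/notfound"),
                null,                                              // no collection patterns attached
                category.name() + " load error (http return code = 404)",
                category.failType,                                 // fail type taken from the category, as in ErrorCache.push()
                404);
        // yields a Solr input document carrying failreason_s, failtype_s, httpstatus_i and load_date_dt
        return failDoc.toSolr(config);
    }
}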

View File

@ -40,6 +40,7 @@ import net.yacy.cora.document.analysis.Classification.ContentDomain;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.solr.FailCategory;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.protocol.ClientIdentification;
@ -48,8 +49,6 @@ import net.yacy.cora.util.ByteArray;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.NumberTools;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.data.ZURL.FailCategory;
import net.yacy.crawler.retrieval.Request;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.WordTokenizer;
@ -59,6 +58,7 @@ import net.yacy.repository.Blacklist.BlacklistType;
import net.yacy.search.Switchboard;
@SuppressWarnings("unused")
public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaSnippet> {
public ContentDomain type;
public DigestURL href, source;
@ -260,7 +260,7 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaS
// check if url is in blacklist
if (Switchboard.urlBlacklist.isListed(blacklistType, url.getHost().toLowerCase(), url.getFile())) {
Switchboard.getSwitchboard().crawlQueues.errorURL.push(new Request(url, null), null, ASCII.getBytes(Switchboard.getSwitchboard().peers.mySeed().hash), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
Switchboard.getSwitchboard().crawlQueues.errorURL.push(url, null, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
ConcurrentLog.fine("snippet fetch", "MEDIA-SNIPPET Rejecting URL '" + url.toString() + "'. URL is in blacklist.");
isBlacklisted = true;
}