Mirror of https://github.com/yacy/yacy_search_server.git, synced 2024-09-19 00:01:41 +02:00

Commit 2602be8d1e (parent 31920385f7):

- removed the ZURL data structure and the ZURL data file
- replaced load-failure logging with information that is stored in Solr
- fixed a bug in the crawling of feeds: the must-match pattern is now applied to feed URLs to filter out URLs that shall not be in a wanted domain
- delegated URLs, which also used ZURLs, are now temporary objects in memory
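The recurring change across the hunks below is the error-reporting call. Both signatures appear verbatim in this diff; a side-by-side sketch (sb is the Switchboard instance):

    // before: ZURL.push needed a full balancer Request plus executor hash,
    // work date and retry counter, and wrote to the on-disk urlError4.db
    sb.crawlQueues.errorURL.push(
            request,                                  // net.yacy.crawler.retrieval.Request
            profile,                                  // CrawlProfile of the crawl
            ASCII.getBytes(sb.peers.mySeed().hash),   // executor peer hash
            new Date(),                               // workdate
            1,                                        // workcount (retries)
            FailCategory.FINAL_LOAD_CONTEXT,
            "url in blacklist", -1);                  // reason, http status code

    // after: ErrorCache.push takes only URL, profile, category, reason and
    // http status code; the failure is stored as a Solr document instead
    sb.crawlQueues.errorURL.push(
            request.url(), profile,
            FailCategory.FINAL_LOAD_CONTEXT,
            "url in blacklist", -1);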
@@ -37,6 +37,7 @@ import java.util.regex.PatternSyntaxException;
 import net.yacy.cora.document.encoding.ASCII;
 import net.yacy.cora.document.id.AnchorURL;
 import net.yacy.cora.document.id.DigestURL;
+import net.yacy.cora.federate.solr.FailCategory;
 import net.yacy.cora.federate.yacy.CacheStrategy;
 import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.RequestHeader;
@@ -44,8 +45,6 @@ import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.cora.util.SpaceExceededException;
 import net.yacy.crawler.CrawlSwitchboard;
 import net.yacy.crawler.data.CrawlProfile;
-import net.yacy.crawler.data.ZURL.FailCategory;
-import net.yacy.crawler.retrieval.Request;
 import net.yacy.crawler.retrieval.SitemapImporter;
 import net.yacy.data.WorkTables;
 import net.yacy.document.Document;
@@ -392,7 +391,7 @@ public class Crawler_p {
             for (DigestURL u: rootURLs) {
                 hosthashes.add(ASCII.getBytes(u.hosthash()));
             }
-            sb.crawlQueues.errorURL.removeHosts(hosthashes, false);
+            sb.crawlQueues.errorURL.removeHosts(hosthashes);
             for (byte[] hosthash: hosthashes) {
                 try {
                     String deletequery = CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + ASCII.String(hosthash) + "\" AND " + CollectionSchema.failreason_s.getSolrFieldName() + ":[* TO *]";
@@ -440,24 +439,7 @@ public class Crawler_p {
             } else {
                 StringBuilder fr = new StringBuilder();
                 for (Map.Entry<DigestURL, String> failure: failurls.entrySet()) {
-                    sb.crawlQueues.errorURL.push(
-                            new Request(
-                                    sb.peers.mySeed().hash.getBytes(),
-                                    failure.getKey(),
-                                    null,
-                                    "",
-                                    new Date(),
-                                    profile.handle(),
-                                    0,
-                                    0,
-                                    0,
-                                    0),
-                            null,
-                            sb.peers.mySeed().hash.getBytes(),
-                            new Date(),
-                            1,
-                            FailCategory.FINAL_LOAD_CONTEXT,
-                            failure.getValue(), -1);
+                    sb.crawlQueues.errorURL.push(failure.getKey(), null, FailCategory.FINAL_LOAD_CONTEXT, failure.getValue(), -1);
                     fr.append(failure.getValue()).append('/');
                 }
@@ -439,7 +439,7 @@ public class HostBrowser {
                 FailType failType = errorDocs.get(entry.getKey());
                 if (failType == null) {
                     // maybe this is only in the errorURL
-                    prop.put("files_list_" + c + "_type_stored_error", process == HarvestProcess.ERRORS ? sb.crawlQueues.errorURL.get(uri.hash()).anycause() : "unknown error");
+                    prop.put("files_list_" + c + "_type_stored_error", process == HarvestProcess.ERRORS ? sb.crawlQueues.errorURL.get(ASCII.String(uri.hash())).getFailReason() : "unknown error");
                 } else {
                     prop.put("files_list_" + c + "_type_stored_error", failType == FailType.excl ? "excluded from indexing" : "load fail");
                 }
@@ -32,16 +32,12 @@
 </colgroup>
 <tr class="TableHeader">
   <th>Time</th>
-  <th>Initiator</th>
-  <th>Executor</th>
   <th>URL</th>
   <th>Fail-Reason</th>
 </tr>
 #{list}#
 <tr class="TableCell#(dark)#Light::Dark#(/dark)#">
   <td>#[time]#</td>
-  <td>#[initiator]#</td>
-  <td>#[executor]#</td>
   <td><a href="#[url]#">#[url]#</a></td>
   <td>#[failreason]#</td>
 </tr>
@@ -24,15 +24,14 @@
 
 import java.util.ArrayList;
 import java.util.Date;
 
 import net.yacy.cora.date.GenericFormatter;
 import net.yacy.cora.document.encoding.ASCII;
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.crawler.CrawlStacker;
-import net.yacy.crawler.data.ZURL;
 import net.yacy.peers.Seed;
 import net.yacy.search.Switchboard;
+import net.yacy.search.schema.CollectionConfiguration;
 import net.yacy.server.serverObjects;
 import net.yacy.server.serverSwitch;
@@ -73,27 +72,19 @@ public class IndexCreateParserErrors_p {
         }
         dark = true;
         DigestURL url;
-        byte[] initiatorHash, executorHash;
-        Seed initiatorSeed, executorSeed;
         int j=0;
-        ArrayList<ZURL.Entry> l = sb.crawlQueues.errorURL.list(showRejectedCount);
-        ZURL.Entry entry;
+        ArrayList<CollectionConfiguration.FailDoc> l = sb.crawlQueues.errorURL.list(showRejectedCount);
+        CollectionConfiguration.FailDoc entry;
         for (int i = l.size() - 1; i >= 0; i--) {
             entry = l.get(i);
             if (entry == null) continue;
-            url = entry.url();
+            url = entry.getDigestURL();
             if (url == null) continue;
 
-            initiatorHash = entry.initiator();
-            executorHash = entry.executor();
-            initiatorSeed = (initiatorHash == null) ? null : sb.peers.getConnected(ASCII.String(initiatorHash));
-            executorSeed = (executorHash == null) ? null : sb.peers.getConnected(ASCII.String(executorHash));
-            prop.putHTML("rejected_list_"+j+"_time", GenericFormatter.SIMPLE_FORMATTER.format(entry.workdate()));
-            prop.putHTML("rejected_list_"+j+"_initiator", ((initiatorSeed == null) ? "proxy" : initiatorSeed.getName()));
-            prop.putHTML("rejected_list_"+j+"_executor", ((executorSeed == null) ? "proxy" : executorSeed.getName()));
-
+            prop.putHTML("rejected_list_"+j+"_time", GenericFormatter.SIMPLE_FORMATTER.format(new Date()));
             prop.putHTML("rejected_list_"+j+"_url", url.toNormalform(false));
 
-            String cause = entry.anycause();
+            String cause = entry.getFailReason();
             if (cause.startsWith(CrawlStacker.ERROR_NO_MATCH_MUST_MATCH_FILTER)) {
                 prop.put("rejected_list_"+j+"_failreason", "(<a href=\"/RegexTest.html?text=" + url.toNormalform(false) +
                     "&regex=" + cause.substring(CrawlStacker.ERROR_NO_MATCH_MUST_MATCH_FILTER.length()) + "\">test</a>) " + cause);
@@ -32,6 +32,7 @@
 import java.net.MalformedURLException;
 import java.util.Date;
 
+import net.yacy.cora.document.encoding.ASCII;
 import net.yacy.cora.document.encoding.UTF8;
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.federate.yacy.CacheStrategy;
@@ -127,7 +128,7 @@ public class QuickCrawlLink_p {
             final byte[] urlhash = crawlingStartURL.hash();
             indexSegment.fulltext().remove(urlhash);
             sb.crawlQueues.noticeURL.removeByURLHash(urlhash);
-            sb.crawlQueues.errorURL.remove(urlhash);
+            sb.crawlQueues.errorURL.remove(ASCII.String(urlhash));
 
             // create crawling profile
             CrawlProfile pe = null;
@@ -30,11 +30,11 @@
 import java.io.IOException;
 
 import net.yacy.cora.document.encoding.ASCII;
+import net.yacy.cora.federate.solr.FailCategory;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.crawler.data.ResultURLs;
 import net.yacy.crawler.data.ResultURLs.EventOrigin;
-import net.yacy.crawler.data.ZURL.FailCategory;
 import net.yacy.kelondro.data.meta.URIMetadataRow;
 import net.yacy.peers.Protocol;
 import net.yacy.peers.Seed;
@@ -161,14 +161,7 @@ public final class crawlReceipt {
         }
 
         sb.crawlQueues.delegatedURL.remove(entry.hash()); // the delegated work is transformed into an error case
-        sb.crawlQueues.errorURL.push(
-                entry.toBalancerEntry(iam),
-                null,
-                youare.getBytes(),
-                null,
-                0,
-                FailCategory.FINAL_LOAD_CONTEXT,
-                result + ":" + reason, -1);
+        sb.crawlQueues.errorURL.push(entry.url(), null, FailCategory.FINAL_LOAD_CONTEXT, result + ":" + reason, -1);
         //switchboard.noticeURL.remove(receivedUrlhash);
         prop.put("delay", "3600");
         return prop;
@@ -246,7 +246,7 @@ public final class search {
                     false,
                     indexSegment,
                     rankingProfile,
-                    header.get(RequestHeader.USER_AGENT, ""),
+                    header.get(HeaderFramework.USER_AGENT, ""),
                     false,
                     false,
                     0.0d,
@@ -310,7 +310,7 @@ public final class search {
                     false,
                     sb.index,
                     rankingProfile,
-                    header.get(RequestHeader.USER_AGENT, ""),
+                    header.get(HeaderFramework.USER_AGENT, ""),
                     false,
                     false,
                     0.0d,
@@ -25,14 +25,11 @@
 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 
 import java.io.IOException;
-import java.util.Date;
 
 import net.yacy.cora.date.GenericFormatter;
+import net.yacy.cora.document.encoding.ASCII;
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.crawler.data.NoticedURL;
-import net.yacy.crawler.data.ZURL.FailCategory;
 import net.yacy.crawler.retrieval.Request;
 import net.yacy.kelondro.data.meta.URIMetadataNode;
 import net.yacy.peers.Protocol;
@@ -80,15 +77,7 @@ public class urls {
                     referrer = sb.getURL(entry.referrerhash());
 
                     // place url to notice-url db
-                    sb.crawlQueues.delegatedURL.push(
-                            entry,
-                            null,
-                            sb.peers.mySeed().hash.getBytes(),
-                            new Date(),
-                            0,
-                            FailCategory.FINAL_PROCESS_CONTEXT,
-                            "client=____________",
-                            -1);
+                    sb.crawlQueues.delegatedURL.put(ASCII.String(entry.url().hash()), entry.url());
 
                     // create RSS entry
                     prop.put("item_" + c + "_title", "");
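Delegated URLs lose their persistent urlDelegated4.db backing in this hunk: the remote-crawl servlet now only records the handed-out URL in an in-memory map (the Map<String, DigestURL> field introduced in CrawlQueues below). A minimal sketch of the new bookkeeping, using only calls that appear in this diff:

    // record a URL that was handed out to a remote crawler; the key is the
    // String form of the 12-byte URL hash
    sb.crawlQueues.delegatedURL.put(ASCII.String(entry.url().hash()), entry.url());

    // later, CrawlQueues.exists(hash) answers HarvestProcess.DELEGATED via
    if (sb.crawlQueues.delegatedURL.containsKey(ASCII.String(hash))) { /* delegated */ }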
@@ -663,7 +663,7 @@ public class yacysearch {
                     authenticated,
                     indexSegment,
                     ranking,
-                    header.get(RequestHeader.USER_AGENT, ""),
+                    header.get(HeaderFramework.USER_AGENT, ""),
                     sb.getConfigBool(SwitchboardConstants.SEARCH_VERIFY_DELETE, false)
                         && sb.getConfigBool(SwitchboardConstants.NETWORK_SEARCHVERIFY, false)
                         && sb.peers.mySeed().getFlagAcceptRemoteIndex(),
source/net/yacy/cora/federate/solr/FailCategory.java (new file, 39 lines)

@@ -0,0 +1,39 @@
/**
 *  FailCategory
 *  Copyright 2013 by Michael Peter Christen
 *  First released 17.10.2013 at http://yacy.net
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program in the file lgpl21.txt
 *  If not, see <http://www.gnu.org/licenses/>.
 */

package net.yacy.cora.federate.solr;

public enum FailCategory {
    // TEMPORARY categories are failure cases that should be tried again
    // FINAL categories are failure cases that are final and should not be tried again
    TEMPORARY_NETWORK_FAILURE(true, FailType.fail), // an entity could not be loaded
    FINAL_PROCESS_CONTEXT(false, FailType.excl),    // because of a processing context we do not want that url again (i.e. remote crawling)
    FINAL_LOAD_CONTEXT(false, FailType.excl),       // the crawler configuration does not want to load the entity
    FINAL_ROBOTS_RULE(true, FailType.excl),         // a remote server denies indexing or loading
    FINAL_REDIRECT_RULE(true, FailType.excl);       // the remote server redirects this page, thus disallowing reading of content

    public final boolean store;
    public final FailType failType;

    private FailCategory(boolean store, FailType failType) {
        this.store = store;
        this.failType = failType;
    }
}
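The two enum fields drive the new Solr-based failure handling: store decides whether a failure document is written to Solr at all (the removed ZURL.push body further below already used this flag the same way), and failType marks the document as a load failure (fail) or an exclusion (excl). An illustrative sketch with a hypothetical reportFailure helper; only the enum and its fields are from the source:

    // hypothetical consumer of the enum, for illustration only
    static void reportFailure(final FailCategory category, final String reason) {
        if (category.store) {
            // would build and submit a Solr error document tagged with category.failType
            System.out.println("store to Solr as " + category.failType + ": " + reason);
        } else {
            // kept only in the transient in-memory error cache
            System.out.println("transient only: " + reason);
        }
    }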
@@ -41,6 +41,7 @@ import net.yacy.cora.document.encoding.UTF8;
 import net.yacy.cora.document.id.AnchorURL;
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.document.id.MultiProtocolURL;
+import net.yacy.cora.federate.solr.FailCategory;
 import net.yacy.cora.order.Base64Order;
 import net.yacy.cora.protocol.Domains;
 import net.yacy.cora.protocol.ftp.FTPClient;
@@ -49,9 +50,7 @@ import net.yacy.crawler.data.CrawlProfile;
 import net.yacy.crawler.data.CrawlQueues;
 import net.yacy.crawler.data.NoticedURL;
 import net.yacy.crawler.data.ResultURLs;
-import net.yacy.crawler.data.ZURL;
 import net.yacy.crawler.data.ResultURLs.EventOrigin;
-import net.yacy.crawler.data.ZURL.FailCategory;
 import net.yacy.crawler.retrieval.FTPLoader;
 import net.yacy.crawler.retrieval.HTTPLoader;
 import net.yacy.crawler.retrieval.Request;
@@ -65,6 +64,7 @@ import net.yacy.repository.Blacklist.BlacklistType;
 import net.yacy.repository.FilterEngine;
 import net.yacy.search.Switchboard;
 import net.yacy.search.index.Segment;
+import net.yacy.search.schema.CollectionConfiguration;
 
 public final class CrawlStacker {
 
@@ -75,7 +75,7 @@ public final class CrawlStacker {
     private final ConcurrentLog log = new ConcurrentLog("STACKCRAWL");
     private final RobotsTxt robots;
     private final WorkflowProcessor<Request> requestQueue;
-    private final CrawlQueues nextQueue;
+    public final CrawlQueues nextQueue;
     private final CrawlSwitchboard crawler;
     private final Segment indexSegment;
     private final SeedDB peers;
@@ -151,7 +151,7 @@ public final class CrawlStacker {
             // if the url was rejected we store it into the error URL db
             if (rejectReason != null && !rejectReason.startsWith("double in")) {
                 final CrawlProfile profile = this.crawler.getActive(UTF8.getBytes(entry.profileHandle()));
-                this.nextQueue.errorURL.push(entry, profile, ASCII.getBytes(this.peers.mySeed().hash), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, rejectReason, -1);
+                this.nextQueue.errorURL.push(entry.url(), profile, FailCategory.FINAL_LOAD_CONTEXT, rejectReason, -1);
             }
         } catch (final Exception e) {
             CrawlStacker.this.log.warn("Error while processing stackCrawl entry.\n" + "Entry: " + entry.toString() + "Error: " + e.toString(), e);
@@ -186,7 +186,7 @@ public final class CrawlStacker {
         this.indexSegment.fulltext().remove(urlhash);
         byte[] hosthash = new byte[6]; System.arraycopy(urlhash, 6, hosthash, 0, 6);
         List<byte[]> hosthashes = new ArrayList<byte[]>(); hosthashes.add(hosthash);
-        this.nextQueue.errorURL.removeHosts(hosthashes, false);
+        this.nextQueue.errorURL.removeHosts(hosthashes);
         this.nextQueue.removeURL(urlhash);
         String u = url.toNormalform(true);
         if (u.endsWith("/")) {
@@ -198,7 +198,7 @@ public final class CrawlStacker {
                 final byte[] uh = new DigestURL(u).hash();
                 this.indexSegment.fulltext().remove(uh);
                 this.nextQueue.noticeURL.removeByURLHash(uh);
-                this.nextQueue.errorURL.remove(uh);
+                this.nextQueue.errorURL.remove(ASCII.String(uh));
             } catch (final MalformedURLException e1) {}
         }
 
@@ -246,7 +246,7 @@ public final class CrawlStacker {
         if (replace) {
             CrawlStacker.this.indexSegment.fulltext().remove(urlhash);
             cq.noticeURL.removeByURLHash(urlhash);
-            cq.errorURL.remove(urlhash);
+            cq.errorURL.remove(ASCII.String(urlhash));
         }
 
         // put entry on crawl stack
@@ -425,8 +425,8 @@ public final class CrawlStacker {
         if (dbocc != null) {
             // do double-check
             if (dbocc == HarvestProcess.ERRORS) {
-                final ZURL.Entry errorEntry = this.nextQueue.errorURL.get(url.hash());
-                return "double in: errors (" + errorEntry.anycause() + ")";
+                final CollectionConfiguration.FailDoc errorEntry = this.nextQueue.errorURL.get(ASCII.String(url.hash()));
+                return "double in: errors (" + errorEntry.getFailReason() + ")";
             }
             return "double in: " + dbocc.toString();
         }
@@ -441,9 +441,9 @@ public final class CrawlStacker {
             return "double in: LURL-DB, oldDate = " + oldDate.toString();
         }
         if (dbocc == HarvestProcess.ERRORS) {
-            final ZURL.Entry errorEntry = this.nextQueue.errorURL.get(url.hash());
-            if (this.log.isInfo()) this.log.info("URL '" + urlstring + "' is double registered in '" + dbocc.toString() + "', previous cause: " + errorEntry.anycause());
-            return "double in: errors (" + errorEntry.anycause() + "), oldDate = " + oldDate.toString();
+            final CollectionConfiguration.FailDoc errorEntry = this.nextQueue.errorURL.get(ASCII.String(url.hash()));
+            if (this.log.isInfo()) this.log.info("URL '" + urlstring + "' is double registered in '" + dbocc.toString() + "', previous cause: " + errorEntry.getFailReason());
+            return "double in: errors (" + errorEntry.getFailReason() + "), oldDate = " + oldDate.toString();
         }
         if (this.log.isInfo()) this.log.info("URL '" + urlstring + "' is double registered in '" + dbocc.toString() + "'. ");
         return "double in: " + dbocc.toString() + ", oldDate = " + oldDate.toString();
@@ -40,17 +40,16 @@ import net.yacy.cora.document.encoding.UTF8;
 import net.yacy.cora.document.feed.Hit;
 import net.yacy.cora.document.feed.RSSFeed;
 import net.yacy.cora.document.id.DigestURL;
+import net.yacy.cora.federate.solr.FailCategory;
 import net.yacy.cora.federate.yacy.CacheStrategy;
 import net.yacy.cora.order.Base64Order;
 import net.yacy.cora.protocol.ConnectionInfo;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.crawler.HarvestProcess;
 import net.yacy.crawler.data.NoticedURL.StackType;
-import net.yacy.crawler.data.ZURL.FailCategory;
 import net.yacy.crawler.retrieval.Request;
 import net.yacy.crawler.retrieval.Response;
 import net.yacy.crawler.robots.RobotsTxtEntry;
-import net.yacy.kelondro.util.FileUtils;
 import net.yacy.kelondro.workflow.WorkflowJob;
 import net.yacy.peers.DHTSelection;
 import net.yacy.peers.Protocol;
@@ -59,19 +58,19 @@ import net.yacy.repository.Blacklist.BlacklistType;
 import net.yacy.search.IndexingQueueEntry;
 import net.yacy.search.Switchboard;
 import net.yacy.search.SwitchboardConstants;
+import net.yacy.search.index.ErrorCache;
+import net.yacy.search.schema.CollectionConfiguration;
 
 public class CrawlQueues {
 
-    private static final String ERROR_DB_FILENAME = "urlError4.db";
-    private static final String DELEGATED_DB_FILENAME = "urlDelegated4.db";
-
     private Switchboard sb;
     private ConcurrentLog log;
     private Map<Integer, Loader> workers; // mapping from url hash to Worker thread object
     private final ArrayList<String> remoteCrawlProviderHashes;
 
     public NoticedURL noticeURL;
-    public ZURL errorURL, delegatedURL;
+    public ErrorCache errorURL;
+    public Map<String, DigestURL> delegatedURL;
 
     public CrawlQueues(final Switchboard sb, final File queuePath) {
         this.sb = sb;
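The field swap above is the heart of the commit: the two kelondro-backed ZURL tables become an ErrorCache (errors live in Solr plus a small memory cache) and a plain map (delegated URLs are purely transient). The constructor wiring as it appears in the next hunks; note that a java.util.concurrent.ConcurrentHashMap import is implied even though it is not shown in this excerpt:

    this.errorURL = new ErrorCache(sb.index.fulltext());             // Solr-backed
    this.delegatedURL = new ConcurrentHashMap<String, DigestURL>();  // in-memory only

One consequence: delegated and error entries no longer survive a restart, which is exactly the "temporary objects in memory" behaviour announced in the commit message.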
@@ -82,10 +81,8 @@ public class CrawlQueues {
         // start crawling management
         this.log.config("Starting Crawling Management");
         this.noticeURL = new NoticedURL(queuePath, sb.useTailCache, sb.exceed134217727);
-        FileUtils.deletedelete(new File(queuePath, ERROR_DB_FILENAME));
-        this.errorURL = new ZURL(sb.index.fulltext(), queuePath, ERROR_DB_FILENAME, false, sb.useTailCache, sb.exceed134217727);
-        this.delegatedURL = new ZURL(sb.index.fulltext(), queuePath, DELEGATED_DB_FILENAME, true, sb.useTailCache, sb.exceed134217727);
-        try {this.errorURL.clear();} catch (IOException e) {} // start with empty errors each time
+        this.errorURL = new ErrorCache(sb.index.fulltext());
+        this.delegatedURL = new ConcurrentHashMap<String, DigestURL>();
     }
 
     public void relocate(final File newQueuePath) {
@@ -95,10 +92,8 @@ public class CrawlQueues {
         this.remoteCrawlProviderHashes.clear();
 
         this.noticeURL = new NoticedURL(newQueuePath, this.sb.useTailCache, this.sb.exceed134217727);
-        FileUtils.deletedelete(new File(newQueuePath, ERROR_DB_FILENAME));
-        this.errorURL = new ZURL(this.sb.index.fulltext(), newQueuePath, ERROR_DB_FILENAME, false, this.sb.useTailCache, this.sb.exceed134217727);
-        this.delegatedURL = new ZURL(this.sb.index.fulltext(), newQueuePath, DELEGATED_DB_FILENAME, true, this.sb.useTailCache, this.sb.exceed134217727);
-        try {this.errorURL.clear();} catch (IOException e) {} // start with empty errors each time
+        this.errorURL = new ErrorCache(this.sb.index.fulltext());
+        this.delegatedURL = new ConcurrentHashMap<String, DigestURL>();
     }
 
     public synchronized void close() {
@@ -114,8 +109,7 @@ public class CrawlQueues {
             }
         }
         this.noticeURL.close();
-        this.errorURL.close();
-        this.delegatedURL.close();
+        this.delegatedURL.clear();
     }
 
     public void clear() {
@@ -130,11 +124,7 @@ public class CrawlQueues {
         } catch (final IOException e) {
             ConcurrentLog.logException(e);
         }
-        try {
-            this.delegatedURL.clear();
-        } catch (final IOException e) {
-            ConcurrentLog.logException(e);
-        }
+        this.delegatedURL.clear();
     }
 
     /**
@@ -143,7 +133,7 @@ public class CrawlQueues {
      * @return if the hash exists, the name of the database is returned, otherwise null is returned
      */
     public HarvestProcess exists(final byte[] hash) {
-        if (this.delegatedURL.exists(hash)) {
+        if (this.delegatedURL.containsKey(ASCII.String(hash))) {
             return HarvestProcess.DELEGATED;
         }
         if (this.errorURL.exists(hash)) {
@@ -164,7 +154,7 @@ public class CrawlQueues {
         assert hash != null && hash.length == 12;
         this.noticeURL.removeByURLHash(hash);
         this.delegatedURL.remove(hash);
-        this.errorURL.remove(hash);
+        this.errorURL.remove(ASCII.String(hash));
     }
 
     public DigestURL getURL(final byte[] urlhash) {
@@ -172,13 +162,13 @@ public class CrawlQueues {
         if (urlhash == null || urlhash.length == 0) {
             return null;
         }
-        ZURL.Entry ee = this.delegatedURL.get(urlhash);
-        if (ee != null) {
-            return ee.url();
+        DigestURL u = this.delegatedURL.get(ASCII.String(urlhash));
+        if (u != null) {
+            return u;
         }
-        ee = this.errorURL.get(urlhash);
+        CollectionConfiguration.FailDoc ee = this.errorURL.get(ASCII.String(urlhash));
         if (ee != null) {
-            return ee.url();
+            return ee.getDigestURL();
         }
         for (final Loader w: this.workers.values()) {
             if (Base64Order.enhancedCoder.equal(w.request.url().hash(), urlhash)) {
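All error-cache lookups now key on the String form of the 12-byte URL hash, which matches the id of the Solr error documents, instead of the raw byte[] used by the old kelondro index. The conversion idiom repeated throughout the diff:

    final byte[] urlhash = url.hash();          // 12-byte YaCy URL hash
    final String key = ASCII.String(urlhash);   // String key for map/Solr lookups

    final CollectionConfiguration.FailDoc fail = sb.crawlQueues.errorURL.get(key);
    if (fail != null) System.out.println(fail.getFailReason());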
@@ -639,14 +629,7 @@ public class CrawlQueues {
                     (robotsEntry = CrawlQueues.this.sb.robots.getEntry(this.request.url(), this.profile.getAgent())) != null &&
                     robotsEntry.isDisallowed(this.request.url())) {
                     //if (log.isFine()) log.logFine("Crawling of URL '" + request.url().toString() + "' disallowed by robots.txt.");
-                    CrawlQueues.this.errorURL.push(
-                            this.request,
-                            profile,
-                            ASCII.getBytes(CrawlQueues.this.sb.peers.mySeed().hash),
-                            new Date(),
-                            1,
-                            FailCategory.FINAL_ROBOTS_RULE,
-                            "denied by robots.txt", -1);
+                    CrawlQueues.this.errorURL.push(this.request.url(), profile, FailCategory.FINAL_ROBOTS_RULE, "denied by robots.txt", -1);
                     this.request.setStatus("worker-disallowed", WorkflowJob.STATUS_FINISHED);
                 } else {
                     // starting a load from the internet
@@ -679,28 +662,14 @@ public class CrawlQueues {
                     }
 
                     if (result != null) {
-                        CrawlQueues.this.errorURL.push(
-                                this.request,
-                                profile,
-                                ASCII.getBytes(CrawlQueues.this.sb.peers.mySeed().hash),
-                                new Date(),
-                                1,
-                                FailCategory.TEMPORARY_NETWORK_FAILURE,
-                                "cannot load: " + result, -1);
+                        CrawlQueues.this.errorURL.push(this.request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "cannot load: " + result, -1);
                         this.request.setStatus("worker-error", WorkflowJob.STATUS_FINISHED);
                     } else {
                         this.request.setStatus("worker-processed", WorkflowJob.STATUS_FINISHED);
                     }
                 }
             } catch (final Exception e) {
-                CrawlQueues.this.errorURL.push(
-                        this.request,
-                        profile,
-                        ASCII.getBytes(CrawlQueues.this.sb.peers.mySeed().hash),
-                        new Date(),
-                        1,
-                        FailCategory.TEMPORARY_NETWORK_FAILURE,
-                        e.getMessage() + " - in worker", -1);
+                CrawlQueues.this.errorURL.push(this.request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, e.getMessage() + " - in worker", -1);
                 ConcurrentLog.logException(e);
                 this.request.setStatus("worker-exception", WorkflowJob.STATUS_FINISHED);
             } finally {
source/net/yacy/crawler/data/ZURL.java (deleted file, 365 lines)

@@ -1,365 +0,0 @@
// plasmaCrawlZURL.java
// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 15.03.2007 on http://www.anomic.de
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

package net.yacy.crawler.data;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.Queue;
import java.util.concurrent.LinkedBlockingQueue;

import org.apache.solr.common.SolrInputDocument;

import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.solr.FailType;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.order.NaturalOrder;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.retrieval.Request;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.index.Index;
import net.yacy.kelondro.index.Row;
import net.yacy.kelondro.table.SplitTable;
import net.yacy.kelondro.table.Table;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.search.index.Fulltext;

public class ZURL implements Iterable<ZURL.Entry> {

    private static ConcurrentLog log = new ConcurrentLog("REJECTED");

    private static final int EcoFSBufferSize = 2000;
    private static final int maxStackSize = 1000;

    public enum FailCategory {
        // TEMPORARY categories are such failure cases that should be tried again
        // FINAL categories are such failure cases that are final and should not be tried again
        TEMPORARY_NETWORK_FAILURE(true, FailType.fail), // an entity could not been loaded
        FINAL_PROCESS_CONTEXT(false, FailType.excl), // because of a processing context we do not want that url again (i.e. remote crawling)
        FINAL_LOAD_CONTEXT(false, FailType.excl), // the crawler configuration does not want to load the entity
        FINAL_ROBOTS_RULE(true, FailType.excl), // a remote server denies indexing or loading
        FINAL_REDIRECT_RULE(true, FailType.excl); // the remote server redirects this page, thus disallowing reading of content

        public final boolean store;
        public final FailType failType;

        private FailCategory(boolean store, FailType failType) {
            this.store = store;
            this.failType = failType;
        }
    }

    private final static Row rowdef = new Row(
            "String urlhash-" + Word.commonHashLength + ", " + // the url's hash
            "String executor-" + Word.commonHashLength + ", " + // the crawling executor
            "Cardinal workdate-8 {b256}, " + // the time when the url was last time tried to load
            "Cardinal workcount-4 {b256}, " + // number of load retries
            "String anycause-132, " + // string describing load failure
            "byte[] entry-" + Request.rowdef.objectsize, // extra space
            Base64Order.enhancedCoder
    );

    // the class object
    private Index urlIndex;
    private final Queue<byte[]> stack;
    private final Fulltext fulltext;

    protected ZURL(
            final Fulltext fulltext,
            final File cachePath,
            final String tablename,
            final boolean startWithEmptyFile,
            final boolean useTailCache,
            final boolean exceed134217727) {
        this.fulltext = fulltext;
        // creates a new ZURL in a file
        cachePath.mkdirs();
        final File f = new File(cachePath, tablename);
        if (startWithEmptyFile) {
            if (f.exists()) {
                if (f.isDirectory()) SplitTable.delete(cachePath, tablename); else FileUtils.deletedelete(f);
            }
        }
        try {
            this.urlIndex = new Table(f, rowdef, EcoFSBufferSize, 0, useTailCache, exceed134217727, true);
        } catch (final SpaceExceededException e) {
            try {
                this.urlIndex = new Table(f, rowdef, 0, 0, false, exceed134217727, true);
            } catch (final SpaceExceededException e1) {
                ConcurrentLog.logException(e1);
            }
        }
        //urlIndex = new kelondroFlexTable(cachePath, tablename, -1, rowdef, 0, true);
        this.stack = new LinkedBlockingQueue<byte[]>();
    }

    protected void clear() throws IOException {
        if (this.urlIndex != null) this.urlIndex.clear();
        if (this.stack != null) this.stack.clear();
    }

    protected void close() {
        try {clear();} catch (final IOException e) {}
        if (this.urlIndex != null) this.urlIndex.close();
    }

    public boolean remove(final byte[] hash) {
        if (hash == null) return false;
        //System.out.println("*** DEBUG ZURL " + this.urlIndex.filename() + " remove " + hash);
        try {
            Iterator<byte[]> i = ZURL.this.stack.iterator();
            while (i.hasNext()) {
                byte[] b = i.next();
                if (NaturalOrder.naturalOrder.equal(hash, b)) i.remove();
            }
            return this.urlIndex.delete(hash);
        } catch (final IOException e) {
            return false;
        }
    }

    public void removeHosts(final Iterable<byte[]> hosthashes, final boolean concurrent) {
        if (hosthashes == null) return;
        Thread t = new Thread() {
            public void run() {
                try {
                    Iterator<byte[]> i = ZURL.this.urlIndex.keys(true, null);
                    List<byte[]> r = new ArrayList<byte[]>();
                    while (i.hasNext()) {
                        byte[] b = i.next();
                        for (byte[] hosthash: hosthashes) {
                            if (NaturalOrder.naturalOrder.equal(hosthash, 0, b, 6, 6)) r.add(b);
                        }
                    }
                    for (byte[] b: r) ZURL.this.urlIndex.remove(b);
                    i = ZURL.this.stack.iterator();
                    while (i.hasNext()) {
                        byte[] b = i.next();
                        for (byte[] hosthash: hosthashes) {
                            if (NaturalOrder.naturalOrder.equal(hosthash, 0, b, 6, 6)) i.remove();
                        }
                    }
                } catch (final IOException e) {}
            }
        };
        if (concurrent) t.start(); else t.run();
    }

    public void push(
            final Request bentry,
            final CrawlProfile profile,
            final byte[] executor,
            final Date workdate,
            final int workcount,
            final FailCategory failCategory,
            String anycause,
            final int httpcode) {
        // assert executor != null; // null == proxy !
        assert failCategory.store || httpcode == -1 : "failCategory=" + failCategory.name();
        if (exists(bentry.url().hash())) return; // don't insert double causes
        if (anycause == null) anycause = "unknown";
        final String reason = anycause + ((httpcode >= 0) ? " (http return code = " + httpcode + ")" : "");
        final Entry entry = new Entry(bentry, executor, workdate, workcount, reason);
        put(entry);
        this.stack.add(entry.hash());
        if (!reason.startsWith("double")) log.info(bentry.url().toNormalform(true) + " - " + reason);
        if (this.fulltext.getDefaultConnector() != null && failCategory.store) {
            // send the error to solr
            try {
                SolrInputDocument errorDoc = this.fulltext.getDefaultConfiguration().err(bentry.url(), profile == null ? null : profile.collections(), failCategory.name() + " " + reason, failCategory.failType, httpcode);
                this.fulltext.getDefaultConnector().add(errorDoc);
            } catch (final IOException e) {
                ConcurrentLog.warn("SOLR", "failed to send error " + bentry.url().toNormalform(true) + " to solr: " + e.getMessage());
            }
        }
        while (this.stack.size() > maxStackSize) this.stack.poll();
    }

    @Override
    public Iterator<ZURL.Entry> iterator() {
        return new EntryIterator();
    }

    public ArrayList<ZURL.Entry> list(int max) {
        final ArrayList<ZURL.Entry> l = new ArrayList<ZURL.Entry>();
        DigestURL url;
        for (final ZURL.Entry entry: this) {
            if (entry == null) continue;
            url = entry.url();
            if (url == null) continue;
            l.add(entry);
            if (max-- <= 0) l.remove(0);
        }
        return l;
    }

    private class EntryIterator implements Iterator<ZURL.Entry> {
        private final Iterator<byte[]> hi;
        public EntryIterator() {
            this.hi = ZURL.this.stack.iterator();
        }

        @Override
        public boolean hasNext() {
            return this.hi.hasNext();
        }

        @Override
        public ZURL.Entry next() {
            return get(this.hi.next());
        }

        @Override
        public void remove() {
            this.hi.remove();
        }
    }

    public ZURL.Entry get(final byte[] urlhash) {
        try {
            if (this.urlIndex == null) return null;
            // System.out.println("*** DEBUG ZURL " + this.urlIndex.filename() + " get " + urlhash);
            final Row.Entry entry = this.urlIndex.get(urlhash, false);
            if (entry == null) return null;
            return new Entry(entry);
        } catch (final IOException e) {
            ConcurrentLog.logException(e);
            return null;
        }
    }

    /**
     * private put (use push instead)
     * @param entry
     */
    private void put(final Entry entry) {
        // stores the values from the object variables into the database
        if (entry.stored) return;
        if (entry.bentry == null) return;
        final Row.Entry newrow = rowdef.newEntry();
        newrow.setCol(0, entry.bentry.url().hash());
        newrow.setCol(1, entry.executor);
        newrow.setCol(2, entry.workdate.getTime());
        newrow.setCol(3, entry.workcount);
        newrow.setCol(4, UTF8.getBytes(entry.anycause));
        newrow.setCol(5, entry.bentry.toRow().bytes());
        try {
            if (this.urlIndex != null) this.urlIndex.put(newrow);
            entry.stored = true;
        } catch (final Exception e) {
            ConcurrentLog.logException(e);
        }
    }

    boolean exists(final byte[] urlHash) {
        return this.urlIndex.has(urlHash);
    }

    public void clearStack() {
        this.stack.clear();
    }

    public int stackSize() {
        return this.stack.size();
    }

    public class Entry {

        private Request bentry; // the balancer entry
        private final byte[] executor; // the crawling executor
        private final Date workdate; // the time when the url was last time tried to load
        private final int workcount; // number of tryings
        private final String anycause; // string describing reason for load fail
        private boolean stored;

        private Entry(
                final Request bentry,
                final byte[] executor,
                final Date workdate,
                final int workcount,
                final String anycause) {
            // create new entry
            assert bentry != null;
            // assert executor != null; // null == proxy !
            this.bentry = bentry;
            this.executor = executor;
            this.workdate = (workdate == null) ? new Date() : workdate;
            this.workcount = workcount;
            this.anycause = (anycause == null) ? "" : anycause;
            this.stored = false;
        }

        private Entry(final Row.Entry entry) throws IOException {
            assert (entry != null);
            this.executor = entry.getColBytes(1, true);
            this.workdate = new Date(entry.getColLong(2));
            this.workcount = (int) entry.getColLong(3);
            this.anycause = entry.getColUTF8(4);
            this.bentry = new Request(Request.rowdef.newEntry(entry.getColBytes(5, false)));
            assert (Base64Order.enhancedCoder.equal(entry.getPrimaryKeyBytes(), this.bentry.url().hash()));
            this.stored = true;
            return;
        }

        public DigestURL url() {
            return this.bentry.url();
        }

        public byte[] initiator() {
            return this.bentry.initiator();
        }

        private byte[] hash() {
            // return a url-hash, based on the md5 algorithm
            // the result is a String of 12 bytes within a 72-bit space
            // (each byte has an 6-bit range)
            // that should be enough for all web pages on the world
            return this.bentry.url().hash();
        }

        public Date workdate() {
            return this.workdate;
        }

        public byte[] executor() {
            // return the creator's hash
            return this.executor;
        }

        public String anycause() {
            return this.anycause;
        }

    }

}
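The deleted class is replaced by net.yacy.search.index.ErrorCache, whose implementation is not part of this excerpt. Its surface can be inferred from the call sites in this diff; a hypothetical skeleton for orientation (signatures reconstructed from usage, not copied from the source):

    // inferred from call sites: new ErrorCache(fulltext), push(url, profile,
    // category, reason, httpcode), get/remove by String hash, removeHosts,
    // exists(byte[]), list(max), stackSize()
    public class ErrorCache {
        private final Fulltext fulltext;  // gives access to the Solr connector

        public ErrorCache(final Fulltext fulltext) {
            this.fulltext = fulltext;
        }

        public void push(final DigestURL url, final CrawlProfile profile,
                         final FailCategory failCategory, final String anycause,
                         final int httpcode) {
            // would build a Solr error document (failreason_s etc.) when
            // failCategory.store is set, as the removed ZURL.push did above
        }

        public CollectionConfiguration.FailDoc get(final String urlhash) { return null; }
        public boolean exists(final byte[] urlhash) { return false; }
        public void remove(final String urlhash) {}
        public void removeHosts(final Iterable<byte[]> hosthashes) {}
    }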
@@ -36,6 +36,7 @@ import net.yacy.cora.document.encoding.ASCII;
 import net.yacy.cora.document.encoding.UTF8;
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.document.id.MultiProtocolURL;
+import net.yacy.cora.federate.solr.FailCategory;
 import net.yacy.cora.protocol.HeaderFramework;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.protocol.ResponseHeader;
@@ -43,7 +44,6 @@ import net.yacy.cora.protocol.ftp.FTPClient;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.crawler.data.CrawlProfile;
 import net.yacy.crawler.data.Latency;
-import net.yacy.crawler.data.ZURL.FailCategory;
 import net.yacy.document.TextParser;
 import net.yacy.search.Switchboard;
 
@@ -156,7 +156,7 @@ public class FTPLoader {
         if (berr.size() > 0 || response == null) {
             // some error logging
             final String detail = (berr.size() > 0) ? "Errorlog: " + berr.toString() : "";
-            this.sb.crawlQueues.errorURL.push(request, profile, ASCII.getBytes(this.sb.peers.mySeed().hash), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, " ftp server download, " + detail, -1);
+            this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, " ftp server download, " + detail, -1);
             throw new IOException("FTPLoader: Unable to download URL '" + request.url().toString() + "': " + detail);
         }
 
@@ -25,10 +25,9 @@
 package net.yacy.crawler.retrieval;
 
 import java.io.IOException;
-import java.util.Date;
 
-import net.yacy.cora.document.encoding.ASCII;
 import net.yacy.cora.document.id.DigestURL;
+import net.yacy.cora.federate.solr.FailCategory;
 import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.HeaderFramework;
 import net.yacy.cora.protocol.RequestHeader;
@@ -37,7 +36,6 @@ import net.yacy.cora.protocol.http.HTTPClient;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.crawler.data.CrawlProfile;
 import net.yacy.crawler.data.Latency;
-import net.yacy.crawler.data.ZURL.FailCategory;
 import net.yacy.kelondro.io.ByteCount;
 import net.yacy.repository.Blacklist.BlacklistType;
 import net.yacy.search.Switchboard;
@@ -79,10 +77,8 @@ public final class HTTPLoader {
 
     private Response load(final Request request, CrawlProfile profile, final int retryCount, final int maxFileSize, final BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
 
-        byte[] myHash = ASCII.getBytes(this.sb.peers.mySeed().hash);
-
         if (retryCount < 0) {
-            this.sb.crawlQueues.errorURL.push(request, profile, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "retry counter exceeded", -1);
+            this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "retry counter exceeded", -1);
             throw new IOException("retry counter exceeded for URL " + request.url().toString() + ". Processing aborted.");
         }
 
@@ -98,7 +94,7 @@ public final class HTTPLoader {
         // check if url is in blacklist
         final String hostlow = host.toLowerCase();
         if (blacklistType != null && Switchboard.urlBlacklist.isListed(blacklistType, hostlow, path)) {
-            this.sb.crawlQueues.errorURL.push(request, profile, myHash, new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
+            this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
             throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.");
         }
 
@@ -145,7 +141,7 @@ public final class HTTPLoader {
             redirectionUrlString = redirectionUrlString == null ? "" : redirectionUrlString.trim();
 
             if (redirectionUrlString.isEmpty()) {
-                this.sb.crawlQueues.errorURL.push(request, profile, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "no redirection url provided, field '" + HeaderFramework.LOCATION + "' is empty", statusCode);
+                this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "no redirection url provided, field '" + HeaderFramework.LOCATION + "' is empty", statusCode);
                 throw new IOException("REJECTED EMPTY REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString);
             }
 
@@ -159,13 +155,13 @@ public final class HTTPLoader {
                 this.sb.webStructure.generateCitationReference(url, redirectionUrl);
 
                 if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_RECORD_REDIRECTS, true)) {
-                    this.sb.crawlQueues.errorURL.push(request, profile, myHash, new Date(), 1, FailCategory.FINAL_REDIRECT_RULE, "redirect to " + redirectionUrlString, statusCode);
+                    this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.FINAL_REDIRECT_RULE, "redirect to " + redirectionUrlString, statusCode);
                 }
 
                 if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true)) {
                     // if we are already doing a shutdown we don't need to retry crawling
                     if (Thread.currentThread().isInterrupted()) {
-                        this.sb.crawlQueues.errorURL.push(request, profile, myHash, new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "server shutdown", statusCode);
+                        this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.FINAL_LOAD_CONTEXT, "server shutdown", statusCode);
                         throw new IOException("CRAWLER Retry of URL=" + requestURLString + " aborted because of server shutdown.");
                     }
 
@@ -174,11 +170,11 @@ public final class HTTPLoader {
                     return load(request, profile, retryCount - 1, maxFileSize, blacklistType, agent);
                 }
                 // we don't want to follow redirects
-                this.sb.crawlQueues.errorURL.push(request, profile, myHash, new Date(), 1, FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode);
+                this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode);
                 throw new IOException("REJECTED UNWANTED REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString);
             } else if (responseBody == null) {
                 // no response, reject file
-                this.sb.crawlQueues.errorURL.push(request, profile, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "no response body", statusCode);
+                this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "no response body", statusCode);
                 throw new IOException("REJECTED EMPTY RESPONSE BODY '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString);
             } else if (statusCode == 200 || statusCode == 203) {
                 // the transfer is ok
@@ -189,7 +185,7 @@ public final class HTTPLoader {
 
                 // check length again in case it was not possible to get the length before loading
                 if (maxFileSize >= 0 && contentLength > maxFileSize) {
-                    this.sb.crawlQueues.errorURL.push(request, profile, myHash, new Date(), 1, FailCategory.FINAL_PROCESS_CONTEXT, "file size limit exceeded", statusCode);
+                    this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "file size limit exceeded", statusCode);
                     throw new IOException("REJECTED URL " + request.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes. (GET)");
                 }
 
@@ -206,7 +202,7 @@ public final class HTTPLoader {
                 return response;
             } else {
                 // if the response does not have the right status code then reject the file
-                this.sb.crawlQueues.errorURL.push(request, profile, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", statusCode);
+                this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", statusCode);
                 throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString);
             }
         }
@@ -42,6 +42,7 @@ import net.yacy.cora.document.encoding.ASCII;
 import net.yacy.cora.document.encoding.UTF8;
 import net.yacy.cora.document.id.AnchorURL;
 import net.yacy.cora.document.id.DigestURL;
+import net.yacy.cora.federate.solr.FailCategory;
 import net.yacy.cora.federate.yacy.CacheStrategy;
 import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.HeaderFramework;
@@ -50,7 +51,6 @@ import net.yacy.cora.protocol.ResponseHeader;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.crawler.data.Cache;
 import net.yacy.crawler.data.CrawlProfile;
-import net.yacy.crawler.data.ZURL.FailCategory;
 import net.yacy.crawler.retrieval.FTPLoader;
 import net.yacy.crawler.retrieval.FileLoader;
 import net.yacy.crawler.retrieval.HTTPLoader;
@@ -191,7 +191,7 @@ public final class LoaderDispatcher {
 
         // check if url is in blacklist
         if (blacklistType != null && host != null && Switchboard.urlBlacklist.isListed(blacklistType, host.toLowerCase(), url.getFile())) {
-            this.sb.crawlQueues.errorURL.push(request, crawlProfile, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
+            this.sb.crawlQueues.errorURL.push(request.url(), crawlProfile, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
             throw new IOException("DISPATCHER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.");
         }
 
@@ -97,6 +97,7 @@ import net.yacy.cora.document.feed.RSSReader;
 import net.yacy.cora.document.id.AnchorURL;
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.document.id.MultiProtocolURL;
+import net.yacy.cora.federate.solr.FailCategory;
 import net.yacy.cora.federate.solr.Ranking;
 import net.yacy.cora.federate.solr.SchemaConfiguration;
 import net.yacy.cora.federate.solr.instance.RemoteInstance;
@@ -127,7 +128,6 @@ import net.yacy.crawler.data.ResultImages;
 import net.yacy.crawler.data.ResultURLs;
 import net.yacy.crawler.data.NoticedURL.StackType;
 import net.yacy.crawler.data.ResultURLs.EventOrigin;
-import net.yacy.crawler.data.ZURL.FailCategory;
 import net.yacy.crawler.retrieval.Request;
 import net.yacy.crawler.retrieval.Response;
 import net.yacy.crawler.robots.RobotsTxt;
@@ -1789,16 +1789,9 @@ public final class Switchboard extends serverSwitch {
         // if the noIndexReason is set, indexing is not allowed
         if ( noIndexReason != null ) {
             // log cause and close queue
-            final DigestURL referrerURL = response.referrerURL();
             //if (log.isFine()) log.logFine("deQueue: not indexed any word in URL " + response.url() + "; cause: " + noIndexReason);
-            addURLtoErrorDB(
-                response.url(),
-                response.profile(),
-                (referrerURL == null) ? null : referrerURL.hash(),
-                response.initiator(),
-                response.name(),
-                FailCategory.FINAL_PROCESS_CONTEXT,
-                noIndexReason);
+            // create a new errorURL DB entry
+            this.crawlQueues.errorURL.push(response.url(), response.profile(), FailCategory.FINAL_PROCESS_CONTEXT, noIndexReason, -1);
             // finish this entry
             return "not allowed: " + noIndexReason;
         }
@@ -1991,7 +1984,7 @@ public final class Switchboard extends serverSwitch {
 
     public int cleanupJobSize() {
         int c = 1; // "es gibt immer was zu tun" (there is always something to do)
-        if ( (this.crawlQueues.delegatedURL.stackSize() > 1000) ) {
+        if ( (this.crawlQueues.delegatedURL.size() > 1000) ) {
             c++;
         }
         if ( (this.crawlQueues.errorURL.stackSize() > 1000) ) {
@@ -2101,13 +2094,13 @@ public final class Switchboard extends serverSwitch {
 
         // clean up delegated stack
         checkInterruption();
-        if ( (this.crawlQueues.delegatedURL.stackSize() > 1000) ) {
+        if ( (this.crawlQueues.delegatedURL.size() > 1000) ) {
             if ( this.log.isFine() ) {
                 this.log.fine("Cleaning Delegated-URLs report stack, "
-                    + this.crawlQueues.delegatedURL.stackSize()
+                    + this.crawlQueues.delegatedURL.size()
                     + " entries on stack");
             }
-            this.crawlQueues.delegatedURL.clearStack();
+            this.crawlQueues.delegatedURL.clear();
         }
 
         // clean up error stack
@@ -2428,7 +2421,6 @@ public final class Switchboard extends serverSwitch {
 
     public IndexingQueueEntry parseDocument(final IndexingQueueEntry in) {
         in.queueEntry.updateStatus(Response.QUEUE_STATE_PARSING);
-
         Document[] documents = null;
         try {
             documents = parseDocument(in.queueEntry);
@@ -2439,7 +2431,7 @@ public final class Switchboard extends serverSwitch {
             }
             if ( documents == null ) {
                 return null;
             }
         }
         return new IndexingQueueEntry(in.queueEntry, documents, null);
     }
@@ -2465,14 +2457,8 @@ public final class Switchboard extends serverSwitch {
                 response.setContent(Cache.getContent(response.url().hash()));
                 if ( response.getContent() == null ) {
                     this.log.warn("the resource '" + response.url() + "' is missing in the cache.");
-                    addURLtoErrorDB(
-                        response.url(),
-                        response.profile(),
-                        response.referrerHash(),
-                        response.initiator(),
-                        response.name(),
-                        FailCategory.FINAL_LOAD_CONTEXT,
-                        "missing in cache");
+                    // create a new errorURL DB entry
+                    this.crawlQueues.errorURL.push(response.url(), response.profile(), FailCategory.FINAL_LOAD_CONTEXT, "missing in cache", -1);
                     return null;
                 }
             }
@@ -2490,20 +2476,37 @@ public final class Switchboard extends serverSwitch {
             }
         } catch (final Parser.Failure e ) {
             this.log.warn("Unable to parse the resource '" + response.url() + "'. " + e.getMessage());
-            addURLtoErrorDB(
-                response.url(),
-                response.profile(),
-                response.referrerHash(),
-                response.initiator(),
-                response.name(),
-                FailCategory.FINAL_PROCESS_CONTEXT,
-                e.getMessage());
+            // create a new errorURL DB entry
+            this.crawlQueues.errorURL.push(response.url(), response.profile(), FailCategory.FINAL_PROCESS_CONTEXT, e.getMessage(), -1);
             return null;
         }
 
         final long parsingEndTime = System.currentTimeMillis();
 
         // put anchors on crawl stack
         final long stackStartTime = System.currentTimeMillis();
+        // check if the documents have valid urls; this is not a bug patch; it is possible that
+        // i.e. the result of a feed parsing results in documents from domains which shall be filtered by the crawl profile
+        if (response.profile() != null) {
+            ArrayList<Document> newDocs = new ArrayList<Document>();
+            for (Document doc: documents) {
+                String rejectReason = this.crawlStacker.checkAcceptance(doc.dc_source(), response.profile(), 1 /*depth is irrelevant here, we just make clear it's not the start url*/);
+                if (rejectReason == null) {
+                    newDocs.add(doc);
+                } else {
+                    // we consider these as fail urls to have a tracking of the problem
+                    if (!rejectReason.startsWith("double in")) {
+                        final CrawlProfile profile = this.crawler.getActive(UTF8.getBytes(response.profile().handle()));
+                        this.crawlStacker.nextQueue.errorURL.push(response.url(), profile, FailCategory.FINAL_LOAD_CONTEXT, rejectReason, -1);
+                    }
+                }
+            }
+            if (newDocs.size() != documents.length) {
+                documents = newDocs.toArray(new Document[newDocs.size()]);
+            }
+        }
+
         // collect anchors within remaining documents
         if ((processCase == EventOrigin.PROXY_LOAD || processCase == EventOrigin.LOCAL_CRAWLING) &&
             (
                 response.profile() == null ||
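The inserted block above is the feed bugfix named in the commit message: every document produced by a parse (for example the items of an RSS feed) is re-checked against the crawl profile, so feed entries pointing outside the wanted domain are dropped and tracked as load failures instead of being crawled. An illustration with hypothetical values; checkAcceptance returns null for accepted URLs, and the URL and filter here are invented for the example:

    // crawl profile restricted to example.org, with an assumed mustmatch
    // filter of "https?://(www\\.)?example\\.org/.*"
    final String rejectReason = crawlStacker.checkAcceptance(
            new DigestURL("http://feeds.example.net/item/1"), profile, 1);
    if (rejectReason != null) {
        // e.g. prefixed with CrawlStacker.ERROR_NO_MATCH_MUST_MATCH_FILTER,
        // which IndexCreateParserErrors_p turns into a /RegexTest.html link above
        System.out.println("feed url filtered: " + rejectReason);
    }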
@@ -2592,14 +2595,8 @@ public final class Switchboard extends serverSwitch {
             if (!(profile.indexUrlMustMatchPattern() == CrawlProfile.MATCH_ALL_PATTERN || profile.indexUrlMustMatchPattern().matcher(urls).matches()) ||
                 (profile.indexUrlMustNotMatchPattern() != CrawlProfile.MATCH_NEVER_PATTERN && profile.indexUrlMustNotMatchPattern().matcher(urls).matches())) {
                 if (this.log.isInfo()) this.log.info("Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + profile.indexUrlMustMatchPattern().pattern() + ", indexUrlMustNotMatchPattern = " + profile.indexUrlMustNotMatchPattern().pattern());
-                addURLtoErrorDB(
-                    in.queueEntry.url(),
-                    profile,
-                    in.queueEntry.referrerHash(),
-                    in.queueEntry.initiator(),
-                    in.queueEntry.name(),
-                    FailCategory.FINAL_PROCESS_CONTEXT,
-                    "indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + profile.indexUrlMustMatchPattern().pattern() + ", indexUrlMustNotMatchPattern = " + profile.indexUrlMustNotMatchPattern().pattern());
+                // create a new errorURL DB entry
+                this.crawlQueues.errorURL.push(in.queueEntry.url(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + profile.indexUrlMustMatchPattern().pattern() + ", indexUrlMustNotMatchPattern = " + profile.indexUrlMustNotMatchPattern().pattern(), -1);
                 return new IndexingQueueEntry(in.queueEntry, in.documents, null);
             }
 
@ -2608,27 +2605,15 @@ public final class Switchboard extends serverSwitch {
docloop: for (final Document document : in.documents) {
if (document.indexingDenied() && profile.obeyHtmlRobotsNoindex()) {
if (this.log.isInfo()) this.log.info("Not Condensed Resource '" + urls + "': denied by document-attached noindexing rule");
addURLtoErrorDB(
in.queueEntry.url(),
profile,
in.queueEntry.referrerHash(),
in.queueEntry.initiator(),
in.queueEntry.name(),
FailCategory.FINAL_PROCESS_CONTEXT,
"denied by document-attached noindexing rule");
// create a new errorURL DB entry
this.crawlQueues.errorURL.push(in.queueEntry.url(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "denied by document-attached noindexing rule", -1);
continue docloop;
}
if (!(profile.indexContentMustMatchPattern() == CrawlProfile.MATCH_ALL_PATTERN || profile.indexContentMustMatchPattern().matcher(document.getTextString()).matches()) ||
(profile.indexContentMustNotMatchPattern() != CrawlProfile.MATCH_NEVER_PATTERN && profile.indexContentMustNotMatchPattern().matcher(document.getTextString()).matches())) {
if (this.log.isInfo()) this.log.info("Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustMatchPattern = " + profile.indexContentMustMatchPattern().pattern() + ", indexContentMustNotMatchPattern = " + profile.indexContentMustNotMatchPattern().pattern());
addURLtoErrorDB(
in.queueEntry.url(),
profile,
in.queueEntry.referrerHash(),
in.queueEntry.initiator(),
in.queueEntry.name(),
FailCategory.FINAL_PROCESS_CONTEXT,
"indexing prevented by regular expression on content; indexContentMustMatchPattern = " + profile.indexContentMustMatchPattern().pattern() + ", indexContentMustNotMatchPattern = " + profile.indexContentMustNotMatchPattern().pattern());
// create a new errorURL DB entry
this.crawlQueues.errorURL.push(in.queueEntry.url(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "indexing prevented by regular expression on content; indexContentMustMatchPattern = " + profile.indexContentMustMatchPattern().pattern() + ", indexContentMustNotMatchPattern = " + profile.indexContentMustNotMatchPattern().pattern(), -1);
continue docloop;
}
doclist.add(document);

@ -2705,30 +2690,18 @@ public final class Switchboard extends serverSwitch {

if (condenser == null || (document.indexingDenied() && profile.obeyHtmlRobotsNoindex())) {
//if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by rule in document, process case=" + processCase);
addURLtoErrorDB(
url,
profile,
(referrerURL == null) ? null : referrerURL.hash(),
queueEntry.initiator(),
dc_title,
FailCategory.FINAL_PROCESS_CONTEXT,
"denied by rule in document, process case=" + processCase);
// create a new errorURL DB entry
this.crawlQueues.errorURL.push(url, profile, FailCategory.FINAL_PROCESS_CONTEXT, "denied by rule in document, process case=" + processCase, -1);
return;
}

if ( profile != null && !profile.indexText() && !profile.indexMedia() ) {
//if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by profile rule, process case=" + processCase + ", profile name = " + queueEntry.profile().name());
addURLtoErrorDB(
url,
profile,
(referrerURL == null) ? null : referrerURL.hash(),
queueEntry.initiator(),
dc_title,
FailCategory.FINAL_LOAD_CONTEXT,
"denied by profile rule, process case="
+ processCase
+ ", profile name = "
+ profile.collectionName());
// create a new errorURL DB entry
this.crawlQueues.errorURL.push(url, profile, FailCategory.FINAL_LOAD_CONTEXT, "denied by profile rule, process case="
+ processCase
+ ", profile name = "
+ profile.collectionName(), -1);
return;
}

@ -2906,7 +2879,7 @@ public final class Switchboard extends serverSwitch {
// remove the document from the error-db
// (the host hash occupies bytes 6..11 of the 12-byte url hash; cf. the offset used in ErrorCache.removeHosts below)
byte[] hosthash = new byte[6]; System.arraycopy(urlhash, 6, hosthash, 0, 6);
List<byte[]> hosthashes = new ArrayList<byte[]>(); hosthashes.add(hosthash);
this.crawlQueues.errorURL.removeHosts(hosthashes, false);
this.crawlQueues.errorURL.removeHosts(hosthashes);
this.crawlQueues.removeURL(urlhash);

// get a scraper to get the title

@ -3373,31 +3346,6 @@ public final class Switchboard extends serverSwitch {
return hasDoneSomething;
}

private void addURLtoErrorDB(
final DigestURL url,
final CrawlProfile profile,
final byte[] referrerHash,
final byte[] initiator,
final String name,
final FailCategory failCategory,
final String failreason) {
// assert initiator != null; // null == proxy
// create a new errorURL DB entry
final Request bentry =
new Request(
initiator,
url,
referrerHash,
(name == null) ? "" : name,
new Date(),
null,
0,
0,
0,
0);
this.crawlQueues.errorURL.push(bentry, profile, initiator, new Date(), 0, failCategory, failreason, -1);
}

public final void heuristicSite(final SearchEvent searchEvent, final String host) {
new Thread() {
@Override


source/net/yacy/search/index/ErrorCache.java (new file, 173 lines)

@ -0,0 +1,173 @@
/**
 *  ErrorCache
 *  Copyright 2013 by Michael Peter Christen
 *  First released 17.10.2013 at http://yacy.net
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program in the file lgpl21.txt
 *  If not, see <http://www.gnu.org/licenses/>.
 */

package net.yacy.search.index;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedHashMap;

import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrQuery.SortClause;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrInputDocument;

import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.solr.FailCategory;
import net.yacy.cora.order.NaturalOrder;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.search.schema.CollectionConfiguration;
import net.yacy.search.schema.CollectionSchema;

public class ErrorCache {

    private static ConcurrentLog log = new ConcurrentLog("REJECTED");
    private static final int maxStackSize = 1000;

    // the class object
    private final LinkedHashMap<String, CollectionConfiguration.FailDoc> stack;
    private final Fulltext fulltext;

    public ErrorCache(final Fulltext fulltext) {
        this.fulltext = fulltext;
        this.stack = new LinkedHashMap<String, CollectionConfiguration.FailDoc>();
        try {
            // fill the stack with the latest fail documents from Solr
            final SolrQuery params = new SolrQuery();
            params.setParam("defType", "edismax");
            params.setStart(0);
            params.setRows(100);
            params.setFacet(false);
            params.setSort(new SortClause(CollectionSchema.last_modified.getSolrFieldName(), SolrQuery.ORDER.desc));
            params.setQuery(CollectionSchema.failreason_s.getSolrFieldName() + ":[* TO *]");
            QueryResponse rsp = fulltext.getDefaultConnector().getResponseByParams(params);
            SolrDocumentList docList = rsp == null ? null : rsp.getResults();
            if (docList != null) for (int i = docList.size() - 1; i >= 0; i--) {
                CollectionConfiguration.FailDoc failDoc = new CollectionConfiguration.FailDoc(docList.get(i));
                this.stack.put(ASCII.String(failDoc.getDigestURL().hash()), failDoc);
            }
        } catch (final Throwable e) {
            // pre-filling the stack is best-effort; start with an empty cache if Solr is not reachable
        }
    }

    public void clear() throws IOException {
        if (this.stack != null) this.stack.clear();
        this.fulltext.getDefaultConnector().deleteByQuery(CollectionSchema.failreason_s.getSolrFieldName() + ":[* TO *]");
    }

    public void remove(final String hash) {
        if (hash == null) return;
        this.stack.remove(hash);
        try {
            this.fulltext.getDefaultConnector().deleteByQuery(CollectionSchema.id.getSolrFieldName() + ":\"" + hash + "\" AND " + CollectionSchema.failreason_s.getSolrFieldName() + ":[* TO *]");
        } catch (final IOException e) {
            return;
        }
    }

    public void removeHosts(final Iterable<byte[]> hosthashes) {
        if (hosthashes == null) return;
        try {
            for (byte[] hosthash : hosthashes) {
                this.fulltext.getDefaultConnector().deleteByQuery(CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + ASCII.String(hosthash) + "\" AND " + CollectionSchema.failreason_s.getSolrFieldName() + ":[* TO *]");
            }
            Iterator<String> i = this.stack.keySet().iterator();
            while (i.hasNext()) {
                String b = i.next();
                // the host hash is stored in bytes 6..11 of the url hash
                for (byte[] hosthash : hosthashes) {
                    if (NaturalOrder.naturalOrder.equal(hosthash, 0, ASCII.getBytes(b), 6, 6)) i.remove();
                }
            }
        } catch (final IOException e) {
        }
    }

    public void push(final DigestURL url, final CrawlProfile profile, final FailCategory failCategory, String anycause, final int httpcode) {
        assert failCategory.store || httpcode == -1 : "failCategory=" + failCategory.name();
        if (exists(url.hash()))
            return; // don't insert double causes
        if (anycause == null) anycause = "unknown";
        final String reason = anycause + ((httpcode >= 0) ? " (http return code = " + httpcode + ")" : "");
        if (!reason.startsWith("double")) log.info(url.toNormalform(true) + " - " + reason);
        CollectionConfiguration.FailDoc failDoc = new CollectionConfiguration.FailDoc(
                url, profile == null ? null : profile.collections(),
                failCategory.name() + " " + reason, failCategory.failType,
                httpcode);
        this.stack.put(ASCII.String(url.hash()), failDoc);
        if (this.fulltext.getDefaultConnector() != null && failCategory.store) {
            // send the error to solr
            try {
                SolrInputDocument errorDoc = failDoc.toSolr(this.fulltext.getDefaultConfiguration());
                this.fulltext.getDefaultConnector().add(errorDoc);
            } catch (final IOException e) {
                ConcurrentLog.warn("SOLR", "failed to send error " + url.toNormalform(true) + " to solr: " + e.getMessage());
            }
        }
        // evict the oldest entries until the stack fits again
        while (this.stack.size() > maxStackSize)
            this.stack.remove(this.stack.keySet().iterator().next());
    }

    public ArrayList<CollectionConfiguration.FailDoc> list(int max) {
        final ArrayList<CollectionConfiguration.FailDoc> l = new ArrayList<CollectionConfiguration.FailDoc>();
        Iterator<CollectionConfiguration.FailDoc> fdi = this.stack.values().iterator();
        // skip older entries so that at most the latest max entries are returned
        for (int i = 0; i < this.stack.size() - max; i++) fdi.next();
        while (fdi.hasNext()) l.add(fdi.next());
        return l;
    }

    public CollectionConfiguration.FailDoc get(final String urlhash) {
        CollectionConfiguration.FailDoc fd = this.stack.get(urlhash);
        if (fd != null) return fd;
        try {
            SolrDocument doc = this.fulltext.getDefaultConnector().getDocumentById(urlhash);
            if (doc == null) return null;
            return new CollectionConfiguration.FailDoc(doc);
        } catch (final IOException e) {
            ConcurrentLog.logException(e);
            return null;
        }
    }

    public boolean exists(final byte[] urlHash) {
        try {
            return this.fulltext.getDefaultConnector().existsByQuery(CollectionSchema.id.getSolrFieldName() + ":\"" + ASCII.String(urlHash) + "\" AND " + CollectionSchema.failreason_s.getSolrFieldName() + ":[* TO *]");
        } catch (IOException e) {
            return false;
        }
    }

    public void clearStack() {
        this.stack.clear();
    }

    public int stackSize() {
        return this.stack.size();
    }

}

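A minimal usage sketch for the new ErrorCache (not part of the commit): the Fulltext must come from a running Segment and the URL is hypothetical, so this is illustrative rather than standalone.

import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.solr.FailCategory;
import net.yacy.search.index.ErrorCache;
import net.yacy.search.index.Fulltext;
import net.yacy.search.schema.CollectionConfiguration;

public class ErrorCacheUsage {
    public static void demo(final Fulltext fulltext) throws Exception {
        // the constructor pre-fills the in-memory stack from Solr
        final ErrorCache errorURL = new ErrorCache(fulltext);
        final DigestURL url = new DigestURL("http://example.org/missing"); // hypothetical URL
        // one call replaces the old ZURL/addURLtoErrorDB plumbing; -1 means no http status available
        errorURL.push(url, null, FailCategory.FINAL_LOAD_CONTEXT, "missing in cache", -1);
        // the fail reason is read back by url hash, as HostBrowser now does
        final CollectionConfiguration.FailDoc fd = errorURL.get(ASCII.String(url.hash()));
        if (fd != null) System.out.println(fd.getFailReason());
    }
}
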
@ -80,6 +80,7 @@ import net.yacy.kelondro.util.Bitfield;
import net.yacy.search.index.Segment;
import net.yacy.search.index.Segment.ReferenceReport;
import net.yacy.search.index.Segment.ReferenceReportCache;
import net.yacy.search.query.QueryParams;
import net.yacy.search.schema.WebgraphConfiguration.Subgraph;

import org.apache.solr.common.SolrDocument;

@ -1195,34 +1196,73 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
        return il;
    }
    */

    /**
     * register an entry as error document
     * @param digestURI
     * @param collections
     * @param failReason
     * @param failType
     * @param httpstatus
     * @throws IOException
     */
    public SolrInputDocument err(final DigestURL digestURI, final Map<String, Pattern> collections, final String failReason, final FailType failType, final int httpstatus) throws IOException {
        boolean allAttr = this.isEmpty();
        assert allAttr || contains(CollectionSchema.failreason_s);

        final SolrInputDocument doc = new SolrInputDocument();
        String url = addURIAttributes(doc, allAttr, digestURI, Response.docType(digestURI));
        if (allAttr || contains(CollectionSchema.load_date_dt)) add(doc, CollectionSchema.load_date_dt, new Date());

        // fail reason and status
        if (allAttr || contains(CollectionSchema.failreason_s)) add(doc, CollectionSchema.failreason_s, failReason);
        if (allAttr || contains(CollectionSchema.failtype_s)) add(doc, CollectionSchema.failtype_s, failType.name());
        if (allAttr || contains(CollectionSchema.httpstatus_i)) add(doc, CollectionSchema.httpstatus_i, httpstatus);
        if ((allAttr || contains(CollectionSchema.collection_sxt)) && collections != null && collections.size() > 0) {
            List<String> cs = new ArrayList<String>();
            for (Map.Entry<String, Pattern> e: collections.entrySet()) {
                if (e.getValue().matcher(url).matches()) cs.add(e.getKey());
            }
            add(doc, CollectionSchema.collection_sxt, cs);
        }
        return doc;
    }

    public static class FailDoc {
        DigestURL digestURL;
        final Map<String, Pattern> collections;
        final String failReason;
        final FailType failType;
        final int httpstatus;
        final Date failtime;
        public FailDoc(final DigestURL digestURL, final Map<String, Pattern> collections, final String failReason, final FailType failType, final int httpstatus) {
            this.digestURL = digestURL;
            this.collections = collections;
            this.failReason = failReason;
            this.failType = failType;
            this.httpstatus = httpstatus;
            this.failtime = new Date();
        }
        public FailDoc(final SolrDocument doc) {
            try {
                this.digestURL = new DigestURL((String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()));
            } catch (MalformedURLException e) {
                this.digestURL = null;
            }
            this.collections = new HashMap<String, Pattern>();
            Collection<Object> c = doc.getFieldValues(CollectionSchema.collection_sxt.getSolrFieldName());
            if (c != null) for (Object cn: c) this.collections.put((String) cn, QueryParams.catchall_pattern);
            this.failReason = (String) doc.getFieldValue(CollectionSchema.failreason_s.getSolrFieldName());
            this.failType = FailType.valueOf((String) doc.getFieldValue(CollectionSchema.failtype_s.getSolrFieldName()));
            final Integer status = (Integer) doc.getFieldValue(CollectionSchema.httpstatus_i.getSolrFieldName());
            this.httpstatus = status == null ? -1 : status.intValue();
            this.failtime = (Date) doc.getFieldValue(CollectionSchema.load_date_dt.getSolrFieldName());
        }
        public DigestURL getDigestURL() {
            return digestURL;
        }
        public Map<String, Pattern> getCollections() {
            return collections;
        }
        public String getFailReason() {
            return failReason;
        }
        public FailType getFailType() {
            return failType;
        }
        public int getHttpstatus() {
            return httpstatus;
        }
        public SolrInputDocument toSolr(CollectionConfiguration configuration) {
            boolean allAttr = configuration.isEmpty();
            assert allAttr || configuration.contains(CollectionSchema.failreason_s);

            final SolrInputDocument doc = new SolrInputDocument();
            String url = configuration.addURIAttributes(doc, allAttr, this.getDigestURL(), Response.docType(this.getDigestURL()));
            if (allAttr || configuration.contains(CollectionSchema.load_date_dt)) configuration.add(doc, CollectionSchema.load_date_dt, new Date());

            // fail reason and status
            if (allAttr || configuration.contains(CollectionSchema.failreason_s)) configuration.add(doc, CollectionSchema.failreason_s, this.getFailReason());
            if (allAttr || configuration.contains(CollectionSchema.failtype_s)) configuration.add(doc, CollectionSchema.failtype_s, this.getFailType().name());
            if (allAttr || configuration.contains(CollectionSchema.httpstatus_i)) configuration.add(doc, CollectionSchema.httpstatus_i, this.getHttpstatus());
            if ((allAttr || configuration.contains(CollectionSchema.collection_sxt)) && this.getCollections() != null && this.getCollections().size() > 0) {
                List<String> cs = new ArrayList<String>();
                for (Map.Entry<String, Pattern> e: this.getCollections().entrySet()) {
                    if (e.getValue().matcher(url).matches()) cs.add(e.getKey());
                }
                configuration.add(doc, CollectionSchema.collection_sxt, cs);
            }
            return doc;
        }
    }

}

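The fail documents written by err() and FailDoc.toSolr() make load failures queryable in Solr. A small sketch of the query pattern the new code relies on (the same failreason_s and host_id_s clauses appear in ErrorCache above); the host hash value would be hypothetical:

import org.apache.solr.client.solrj.SolrQuery;

public class FailDocQuery {
    // build a query for all fail documents of one host
    public static SolrQuery failDocsForHost(final String hosthash) {
        final SolrQuery q = new SolrQuery();
        // failreason_s:[* TO *] selects exactly the documents that carry a fail reason
        q.setQuery("host_id_s:\"" + hosthash + "\" AND failreason_s:[* TO *]");
        q.setRows(100);
        return q;
    }
}
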
@ -40,6 +40,7 @@ import net.yacy.cora.document.analysis.Classification.ContentDomain;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.solr.FailCategory;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.protocol.ClientIdentification;

@ -48,8 +49,6 @@ import net.yacy.cora.util.ByteArray;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.NumberTools;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.data.ZURL.FailCategory;
import net.yacy.crawler.retrieval.Request;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.WordTokenizer;

@ -59,6 +58,7 @@ import net.yacy.repository.Blacklist.BlacklistType;
import net.yacy.search.Switchboard;

@SuppressWarnings("unused")
public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaSnippet> {
public ContentDomain type;
public DigestURL href, source;

@ -260,7 +260,7 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaS

// check if url is in blacklist
if (Switchboard.urlBlacklist.isListed(blacklistType, url.getHost().toLowerCase(), url.getFile())) {
Switchboard.getSwitchboard().crawlQueues.errorURL.push(new Request(url, null), null, ASCII.getBytes(Switchboard.getSwitchboard().peers.mySeed().hash), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
Switchboard.getSwitchboard().crawlQueues.errorURL.push(url, null, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
ConcurrentLog.fine("snippet fetch", "MEDIA-SNIPPET Rejecting URL '" + url.toString() + "'. URL is in blacklist.");
isBlacklisted = true;
}