From 2602be8d1e756d6fac20182916706b155d79b34f Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Tue, 17 Sep 2013 15:27:02 +0200 Subject: [PATCH] - removed the ZURL data structure and the ZURL data file - replaced load-failure logging with failure information stored in Solr - fixed a bug in the crawling of feeds: the must-match pattern is now also applied to feed urls so that urls outside the wanted domains are filtered out - delegatedURLs, which also used ZURL, are now temporary objects in memory --- htroot/Crawler_p.java | 24 +- htroot/HostBrowser.java | 2 +- htroot/IndexCreateParserErrors_p.html | 4 - htroot/IndexCreateParserErrors_p.java | 25 +- htroot/QuickCrawlLink_p.java | 3 +- htroot/yacy/crawlReceipt.java | 11 +- htroot/yacy/search.java | 4 +- htroot/yacy/urls.java | 13 +- htroot/yacysearch.java | 2 +- .../yacy/cora/federate/solr/FailCategory.java | 39 ++ source/net/yacy/crawler/CrawlStacker.java | 24 +- source/net/yacy/crawler/data/CrawlQueues.java | 73 +--- source/net/yacy/crawler/data/ZURL.java | 365 ------------------ .../net/yacy/crawler/retrieval/FTPLoader.java | 4 +- .../yacy/crawler/retrieval/HTTPLoader.java | 24 +- .../net/yacy/repository/LoaderDispatcher.java | 4 +- source/net/yacy/search/Switchboard.java | 152 +++----- source/net/yacy/search/index/ErrorCache.java | 173 +++++++++ .../schema/CollectionConfiguration.java | 98 +++-- .../net/yacy/search/snippet/MediaSnippet.java | 6 +- 20 files changed, 401 insertions(+), 649 deletions(-) create mode 100644 source/net/yacy/cora/federate/solr/FailCategory.java delete mode 100644 source/net/yacy/crawler/data/ZURL.java create mode 100644 source/net/yacy/search/index/ErrorCache.java diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index e972b5fa0..e2d72fe1b 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -37,6 +37,7 @@ import java.util.regex.PatternSyntaxException; import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.federate.solr.FailCategory; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.RequestHeader; @@ -44,8 +45,6 @@ import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.SpaceExceededException; import net.yacy.crawler.CrawlSwitchboard; import net.yacy.crawler.data.CrawlProfile; -import net.yacy.crawler.data.ZURL.FailCategory; -import net.yacy.crawler.retrieval.Request; import net.yacy.crawler.retrieval.SitemapImporter; import net.yacy.data.WorkTables; import net.yacy.document.Document; @@ -392,7 +391,7 @@ public class Crawler_p { for (DigestURL u: rootURLs) { hosthashes.add(ASCII.getBytes(u.hosthash())); } - sb.crawlQueues.errorURL.removeHosts(hosthashes, false); + sb.crawlQueues.errorURL.removeHosts(hosthashes); for (byte[] hosthash: hosthashes) { try { String deletequery = CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + ASCII.String(hosthash) + "\" AND " + CollectionSchema.failreason_s.getSolrFieldName() + ":[* TO *]"; @@ -440,24 +439,7 @@ } else { StringBuilder fr = new StringBuilder(); for (Map.Entry<DigestURL, String> failure: failurls.entrySet()) { - sb.crawlQueues.errorURL.push( - new Request( - sb.peers.mySeed().hash.getBytes(), - failure.getKey(), - null, - "", - new Date(), - profile.handle(), - 0, - 0, - 0, - 0), - null, - sb.peers.mySeed().hash.getBytes(), - new Date(), - 1, - FailCategory.FINAL_LOAD_CONTEXT, - failure.getValue(), -1); +
sb.crawlQueues.errorURL.push(failure.getKey(), null, FailCategory.FINAL_LOAD_CONTEXT, failure.getValue(), -1); fr.append(failure.getValue()).append('/'); } diff --git a/htroot/HostBrowser.java b/htroot/HostBrowser.java index 4b4df0c22..c00352f3f 100644 --- a/htroot/HostBrowser.java +++ b/htroot/HostBrowser.java @@ -439,7 +439,7 @@ public class HostBrowser { FailType failType = errorDocs.get(entry.getKey()); if (failType == null) { // maybe this is only in the errorURL - prop.put("files_list_" + c + "_type_stored_error", process == HarvestProcess.ERRORS ? sb.crawlQueues.errorURL.get(uri.hash()).anycause() : "unknown error"); + prop.put("files_list_" + c + "_type_stored_error", process == HarvestProcess.ERRORS ? sb.crawlQueues.errorURL.get(ASCII.String(uri.hash())).getFailReason() : "unknown error"); } else { prop.put("files_list_" + c + "_type_stored_error", failType == FailType.excl ? "excluded from indexing" : "load fail"); } diff --git a/htroot/IndexCreateParserErrors_p.html b/htroot/IndexCreateParserErrors_p.html index 4ac85ed24..1e463e16a 100644 --- a/htroot/IndexCreateParserErrors_p.html +++ b/htroot/IndexCreateParserErrors_p.html @@ -32,16 +32,12 @@ Time - Initiator - Executor URL Fail-Reason #{list}# #[time]# - #[initiator]# - #[executor]# #[url]# #[failreason]# diff --git a/htroot/IndexCreateParserErrors_p.java b/htroot/IndexCreateParserErrors_p.java index 6a10f44de..acbb9bab3 100644 --- a/htroot/IndexCreateParserErrors_p.java +++ b/htroot/IndexCreateParserErrors_p.java @@ -24,15 +24,14 @@ import java.util.ArrayList; +import java.util.Date; import net.yacy.cora.date.GenericFormatter; -import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.protocol.RequestHeader; import net.yacy.crawler.CrawlStacker; -import net.yacy.crawler.data.ZURL; -import net.yacy.peers.Seed; import net.yacy.search.Switchboard; +import net.yacy.search.schema.CollectionConfiguration; import net.yacy.server.serverObjects; import net.yacy.server.serverSwitch; @@ -73,27 +72,19 @@ public class IndexCreateParserErrors_p { } dark = true; DigestURL url; - byte[] initiatorHash, executorHash; - Seed initiatorSeed, executorSeed; int j=0; - ArrayList l = sb.crawlQueues.errorURL.list(showRejectedCount); - ZURL.Entry entry; + ArrayList l = sb.crawlQueues.errorURL.list(showRejectedCount); + CollectionConfiguration.FailDoc entry; for (int i = l.size() - 1; i >= 0; i--) { entry = l.get(i); if (entry == null) continue; - url = entry.url(); + url = entry.getDigestURL(); if (url == null) continue; - - initiatorHash = entry.initiator(); - executorHash = entry.executor(); - initiatorSeed = (initiatorHash == null) ? null : sb.peers.getConnected(ASCII.String(initiatorHash)); - executorSeed = (executorHash == null) ? null : sb.peers.getConnected(ASCII.String(executorHash)); - prop.putHTML("rejected_list_"+j+"_time", GenericFormatter.SIMPLE_FORMATTER.format(entry.workdate())); - prop.putHTML("rejected_list_"+j+"_initiator", ((initiatorSeed == null) ? "proxy" : initiatorSeed.getName())); - prop.putHTML("rejected_list_"+j+"_executor", ((executorSeed == null) ? 
"proxy" : executorSeed.getName())); + + prop.putHTML("rejected_list_"+j+"_time", GenericFormatter.SIMPLE_FORMATTER.format(new Date())); prop.putHTML("rejected_list_"+j+"_url", url.toNormalform(false)); - String cause = entry.anycause(); + String cause = entry.getFailReason(); if (cause.startsWith(CrawlStacker.ERROR_NO_MATCH_MUST_MATCH_FILTER)) { prop.put("rejected_list_"+j+"_failreason", "(test) " + cause); diff --git a/htroot/QuickCrawlLink_p.java b/htroot/QuickCrawlLink_p.java index 159b23cdf..3684117b3 100644 --- a/htroot/QuickCrawlLink_p.java +++ b/htroot/QuickCrawlLink_p.java @@ -32,6 +32,7 @@ import java.net.MalformedURLException; import java.util.Date; +import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.federate.yacy.CacheStrategy; @@ -127,7 +128,7 @@ public class QuickCrawlLink_p { final byte[] urlhash = crawlingStartURL.hash(); indexSegment.fulltext().remove(urlhash); sb.crawlQueues.noticeURL.removeByURLHash(urlhash); - sb.crawlQueues.errorURL.remove(urlhash); + sb.crawlQueues.errorURL.remove(ASCII.String(urlhash)); // create crawling profile CrawlProfile pe = null; diff --git a/htroot/yacy/crawlReceipt.java b/htroot/yacy/crawlReceipt.java index a3a3318e6..ae8cd6050 100644 --- a/htroot/yacy/crawlReceipt.java +++ b/htroot/yacy/crawlReceipt.java @@ -30,11 +30,11 @@ import java.io.IOException; import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.federate.solr.FailCategory; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.util.ConcurrentLog; import net.yacy.crawler.data.ResultURLs; import net.yacy.crawler.data.ResultURLs.EventOrigin; -import net.yacy.crawler.data.ZURL.FailCategory; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.peers.Protocol; import net.yacy.peers.Seed; @@ -161,14 +161,7 @@ public final class crawlReceipt { } sb.crawlQueues.delegatedURL.remove(entry.hash()); // the delegated work is transformed into an error case - sb.crawlQueues.errorURL.push( - entry.toBalancerEntry(iam), - null, - youare.getBytes(), - null, - 0, - FailCategory.FINAL_LOAD_CONTEXT, - result + ":" + reason, -1); + sb.crawlQueues.errorURL.push(entry.url(), null, FailCategory.FINAL_LOAD_CONTEXT, result + ":" + reason, -1); //switchboard.noticeURL.remove(receivedUrlhash); prop.put("delay", "3600"); return prop; diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index 77316289d..13f40f22b 100644 --- a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -246,7 +246,7 @@ public final class search { false, indexSegment, rankingProfile, - header.get(RequestHeader.USER_AGENT, ""), + header.get(HeaderFramework.USER_AGENT, ""), false, false, 0.0d, @@ -310,7 +310,7 @@ public final class search { false, sb.index, rankingProfile, - header.get(RequestHeader.USER_AGENT, ""), + header.get(HeaderFramework.USER_AGENT, ""), false, false, 0.0d, diff --git a/htroot/yacy/urls.java b/htroot/yacy/urls.java index f14e71225..9bb930116 100644 --- a/htroot/yacy/urls.java +++ b/htroot/yacy/urls.java @@ -25,14 +25,11 @@ // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA import java.io.IOException; -import java.util.Date; - import net.yacy.cora.date.GenericFormatter; import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.protocol.RequestHeader; import net.yacy.crawler.data.NoticedURL; -import net.yacy.crawler.data.ZURL.FailCategory; import net.yacy.crawler.retrieval.Request; import 
net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.peers.Protocol; @@ -80,15 +77,7 @@ public class urls { referrer = sb.getURL(entry.referrerhash()); // place url to notice-url db - sb.crawlQueues.delegatedURL.push( - entry, - null, - sb.peers.mySeed().hash.getBytes(), - new Date(), - 0, - FailCategory.FINAL_PROCESS_CONTEXT, - "client=____________", - -1); + sb.crawlQueues.delegatedURL.put(ASCII.String(entry.url().hash()), entry.url()); // create RSS entry prop.put("item_" + c + "_title", ""); diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index 6e028f53e..cd9e38ff3 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -663,7 +663,7 @@ public class yacysearch { authenticated, indexSegment, ranking, - header.get(RequestHeader.USER_AGENT, ""), + header.get(HeaderFramework.USER_AGENT, ""), sb.getConfigBool(SwitchboardConstants.SEARCH_VERIFY_DELETE, false) && sb.getConfigBool(SwitchboardConstants.NETWORK_SEARCHVERIFY, false) && sb.peers.mySeed().getFlagAcceptRemoteIndex(), diff --git a/source/net/yacy/cora/federate/solr/FailCategory.java b/source/net/yacy/cora/federate/solr/FailCategory.java new file mode 100644 index 000000000..cad47f461 --- /dev/null +++ b/source/net/yacy/cora/federate/solr/FailCategory.java @@ -0,0 +1,39 @@ +/** + * FailCategory + * Copyright 2013 by Michael Peter Christen + * First released 17.10.2013 at http://yacy.net + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see <http://www.gnu.org/licenses/>. + */ + +package net.yacy.cora.federate.solr; + +public enum FailCategory { + // TEMPORARY categories are failure cases that should be tried again + // FINAL categories are failure cases that are final and should not be tried again + TEMPORARY_NETWORK_FAILURE(true, FailType.fail), // an entity could not be loaded + FINAL_PROCESS_CONTEXT(false, FailType.excl), // because of a processing context we do not want that url again (i.e.
remote crawling) + FINAL_LOAD_CONTEXT(false, FailType.excl), // the crawler configuration does not want to load the entity + FINAL_ROBOTS_RULE(true, FailType.excl), // a remote server denies indexing or loading + FINAL_REDIRECT_RULE(true, FailType.excl); // the remote server redirects this page, thus disallowing reading of content + + public final boolean store; + public final FailType failType; + + private FailCategory(boolean store, FailType failType) { + this.store = store; + this.failType = failType; + } +} diff --git a/source/net/yacy/crawler/CrawlStacker.java b/source/net/yacy/crawler/CrawlStacker.java index 953619fda..c6c903f0c 100644 --- a/source/net/yacy/crawler/CrawlStacker.java +++ b/source/net/yacy/crawler/CrawlStacker.java @@ -41,6 +41,7 @@ import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.MultiProtocolURL; +import net.yacy.cora.federate.solr.FailCategory; import net.yacy.cora.order.Base64Order; import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.ftp.FTPClient; @@ -49,9 +50,7 @@ import net.yacy.crawler.data.CrawlProfile; import net.yacy.crawler.data.CrawlQueues; import net.yacy.crawler.data.NoticedURL; import net.yacy.crawler.data.ResultURLs; -import net.yacy.crawler.data.ZURL; import net.yacy.crawler.data.ResultURLs.EventOrigin; -import net.yacy.crawler.data.ZURL.FailCategory; import net.yacy.crawler.retrieval.FTPLoader; import net.yacy.crawler.retrieval.HTTPLoader; import net.yacy.crawler.retrieval.Request; @@ -65,6 +64,7 @@ import net.yacy.repository.Blacklist.BlacklistType; import net.yacy.repository.FilterEngine; import net.yacy.search.Switchboard; import net.yacy.search.index.Segment; +import net.yacy.search.schema.CollectionConfiguration; public final class CrawlStacker { @@ -75,7 +75,7 @@ public final class CrawlStacker { private final ConcurrentLog log = new ConcurrentLog("STACKCRAWL"); private final RobotsTxt robots; private final WorkflowProcessor requestQueue; - private final CrawlQueues nextQueue; + public final CrawlQueues nextQueue; private final CrawlSwitchboard crawler; private final Segment indexSegment; private final SeedDB peers; @@ -151,7 +151,7 @@ public final class CrawlStacker { // if the url was rejected we store it into the error URL db if (rejectReason != null && !rejectReason.startsWith("double in")) { final CrawlProfile profile = this.crawler.getActive(UTF8.getBytes(entry.profileHandle())); - this.nextQueue.errorURL.push(entry, profile, ASCII.getBytes(this.peers.mySeed().hash), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, rejectReason, -1); + this.nextQueue.errorURL.push(entry.url(), profile, FailCategory.FINAL_LOAD_CONTEXT, rejectReason, -1); } } catch (final Exception e) { CrawlStacker.this.log.warn("Error while processing stackCrawl entry.\n" + "Entry: " + entry.toString() + "Error: " + e.toString(), e); @@ -186,7 +186,7 @@ public final class CrawlStacker { this.indexSegment.fulltext().remove(urlhash); byte[] hosthash = new byte[6]; System.arraycopy(urlhash, 6, hosthash, 0, 6); List hosthashes = new ArrayList(); hosthashes.add(hosthash); - this.nextQueue.errorURL.removeHosts(hosthashes, false); + this.nextQueue.errorURL.removeHosts(hosthashes); this.nextQueue.removeURL(urlhash); String u = url.toNormalform(true); if (u.endsWith("/")) { @@ -198,7 +198,7 @@ public final class CrawlStacker { final byte[] uh = new DigestURL(u).hash(); this.indexSegment.fulltext().remove(uh); 
this.nextQueue.noticeURL.removeByURLHash(uh); - this.nextQueue.errorURL.remove(uh); + this.nextQueue.errorURL.remove(ASCII.String(uh)); } catch (final MalformedURLException e1) {} } @@ -246,7 +246,7 @@ public final class CrawlStacker { if (replace) { CrawlStacker.this.indexSegment.fulltext().remove(urlhash); cq.noticeURL.removeByURLHash(urlhash); - cq.errorURL.remove(urlhash); + cq.errorURL.remove(ASCII.String(urlhash)); } // put entry on crawl stack @@ -425,8 +425,8 @@ public final class CrawlStacker { if (dbocc != null) { // do double-check if (dbocc == HarvestProcess.ERRORS) { - final ZURL.Entry errorEntry = this.nextQueue.errorURL.get(url.hash()); - return "double in: errors (" + errorEntry.anycause() + ")"; + final CollectionConfiguration.FailDoc errorEntry = this.nextQueue.errorURL.get(ASCII.String(url.hash())); + return "double in: errors (" + errorEntry.getFailReason() + ")"; } return "double in: " + dbocc.toString(); } @@ -441,9 +441,9 @@ public final class CrawlStacker { return "double in: LURL-DB, oldDate = " + oldDate.toString(); } if (dbocc == HarvestProcess.ERRORS) { - final ZURL.Entry errorEntry = this.nextQueue.errorURL.get(url.hash()); - if (this.log.isInfo()) this.log.info("URL '" + urlstring + "' is double registered in '" + dbocc.toString() + "', previous cause: " + errorEntry.anycause()); - return "double in: errors (" + errorEntry.anycause() + "), oldDate = " + oldDate.toString(); + final CollectionConfiguration.FailDoc errorEntry = this.nextQueue.errorURL.get(ASCII.String(url.hash())); + if (this.log.isInfo()) this.log.info("URL '" + urlstring + "' is double registered in '" + dbocc.toString() + "', previous cause: " + errorEntry.getFailReason()); + return "double in: errors (" + errorEntry.getFailReason() + "), oldDate = " + oldDate.toString(); } if (this.log.isInfo()) this.log.info("URL '" + urlstring + "' is double registered in '" + dbocc.toString() + "'. 
"); return "double in: " + dbocc.toString() + ", oldDate = " + oldDate.toString(); diff --git a/source/net/yacy/crawler/data/CrawlQueues.java b/source/net/yacy/crawler/data/CrawlQueues.java index 04ecfb924..fc355d1ba 100644 --- a/source/net/yacy/crawler/data/CrawlQueues.java +++ b/source/net/yacy/crawler/data/CrawlQueues.java @@ -40,17 +40,16 @@ import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.document.feed.Hit; import net.yacy.cora.document.feed.RSSFeed; import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.federate.solr.FailCategory; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.order.Base64Order; import net.yacy.cora.protocol.ConnectionInfo; import net.yacy.cora.util.ConcurrentLog; import net.yacy.crawler.HarvestProcess; import net.yacy.crawler.data.NoticedURL.StackType; -import net.yacy.crawler.data.ZURL.FailCategory; import net.yacy.crawler.retrieval.Request; import net.yacy.crawler.retrieval.Response; import net.yacy.crawler.robots.RobotsTxtEntry; -import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.workflow.WorkflowJob; import net.yacy.peers.DHTSelection; import net.yacy.peers.Protocol; @@ -59,19 +58,19 @@ import net.yacy.repository.Blacklist.BlacklistType; import net.yacy.search.IndexingQueueEntry; import net.yacy.search.Switchboard; import net.yacy.search.SwitchboardConstants; +import net.yacy.search.index.ErrorCache; +import net.yacy.search.schema.CollectionConfiguration; public class CrawlQueues { - private static final String ERROR_DB_FILENAME = "urlError4.db"; - private static final String DELEGATED_DB_FILENAME = "urlDelegated4.db"; - private Switchboard sb; private ConcurrentLog log; private Map workers; // mapping from url hash to Worker thread object private final ArrayList remoteCrawlProviderHashes; public NoticedURL noticeURL; - public ZURL errorURL, delegatedURL; + public ErrorCache errorURL; + public Map delegatedURL; public CrawlQueues(final Switchboard sb, final File queuePath) { this.sb = sb; @@ -82,10 +81,8 @@ public class CrawlQueues { // start crawling management this.log.config("Starting Crawling Management"); this.noticeURL = new NoticedURL(queuePath, sb.useTailCache, sb.exceed134217727); - FileUtils.deletedelete(new File(queuePath, ERROR_DB_FILENAME)); - this.errorURL = new ZURL(sb.index.fulltext(), queuePath, ERROR_DB_FILENAME, false, sb.useTailCache, sb.exceed134217727); - this.delegatedURL = new ZURL(sb.index.fulltext(), queuePath, DELEGATED_DB_FILENAME, true, sb.useTailCache, sb.exceed134217727); - try {this.errorURL.clear();} catch (IOException e) {} // start with empty errors each time + this.errorURL = new ErrorCache(sb.index.fulltext()); + this.delegatedURL = new ConcurrentHashMap(); } public void relocate(final File newQueuePath) { @@ -95,10 +92,8 @@ public class CrawlQueues { this.remoteCrawlProviderHashes.clear(); this.noticeURL = new NoticedURL(newQueuePath, this.sb.useTailCache, this.sb.exceed134217727); - FileUtils.deletedelete(new File(newQueuePath, ERROR_DB_FILENAME)); - this.errorURL = new ZURL(this.sb.index.fulltext(), newQueuePath, ERROR_DB_FILENAME, false, this.sb.useTailCache, this.sb.exceed134217727); - this.delegatedURL = new ZURL(this.sb.index.fulltext(), newQueuePath, DELEGATED_DB_FILENAME, true, this.sb.useTailCache, this.sb.exceed134217727); - try {this.errorURL.clear();} catch (IOException e) {} // start with empty errors each time + this.errorURL = new ErrorCache(this.sb.index.fulltext()); + this.delegatedURL = new ConcurrentHashMap(); } public synchronized void 
close() { @@ -114,8 +109,7 @@ public class CrawlQueues { } } this.noticeURL.close(); - this.errorURL.close(); - this.delegatedURL.close(); + this.delegatedURL.clear(); } public void clear() { @@ -130,11 +124,7 @@ public class CrawlQueues { } catch (final IOException e) { ConcurrentLog.logException(e); } - try { - this.delegatedURL.clear(); - } catch (final IOException e) { - ConcurrentLog.logException(e); - } + this.delegatedURL.clear(); } /** @@ -143,7 +133,7 @@ public class CrawlQueues { * @return if the hash exists, the name of the database is returned, otherwise null is returned */ public HarvestProcess exists(final byte[] hash) { - if (this.delegatedURL.exists(hash)) { + if (this.delegatedURL.containsKey(ASCII.String(hash))) { return HarvestProcess.DELEGATED; } if (this.errorURL.exists(hash)) { @@ -164,7 +154,7 @@ public class CrawlQueues { assert hash != null && hash.length == 12; this.noticeURL.removeByURLHash(hash); this.delegatedURL.remove(hash); - this.errorURL.remove(hash); + this.errorURL.remove(ASCII.String(hash)); } public DigestURL getURL(final byte[] urlhash) { @@ -172,13 +162,13 @@ public class CrawlQueues { if (urlhash == null || urlhash.length == 0) { return null; } - ZURL.Entry ee = this.delegatedURL.get(urlhash); - if (ee != null) { - return ee.url(); + DigestURL u = this.delegatedURL.get(ASCII.String(urlhash)); + if (u != null) { + return u; } - ee = this.errorURL.get(urlhash); + CollectionConfiguration.FailDoc ee = this.errorURL.get(ASCII.String(urlhash)); if (ee != null) { - return ee.url(); + return ee.getDigestURL(); } for (final Loader w: this.workers.values()) { if (Base64Order.enhancedCoder.equal(w.request.url().hash(), urlhash)) { @@ -639,14 +629,7 @@ public class CrawlQueues { (robotsEntry = CrawlQueues.this.sb.robots.getEntry(this.request.url(), this.profile.getAgent())) != null && robotsEntry.isDisallowed(this.request.url())) { //if (log.isFine()) log.logFine("Crawling of URL '" + request.url().toString() + "' disallowed by robots.txt."); - CrawlQueues.this.errorURL.push( - this.request, - profile, - ASCII.getBytes(CrawlQueues.this.sb.peers.mySeed().hash), - new Date(), - 1, - FailCategory.FINAL_ROBOTS_RULE, - "denied by robots.txt", -1); + CrawlQueues.this.errorURL.push(this.request.url(), profile, FailCategory.FINAL_ROBOTS_RULE, "denied by robots.txt", -1); this.request.setStatus("worker-disallowed", WorkflowJob.STATUS_FINISHED); } else { // starting a load from the internet @@ -679,28 +662,14 @@ public class CrawlQueues { } if (result != null) { - CrawlQueues.this.errorURL.push( - this.request, - profile, - ASCII.getBytes(CrawlQueues.this.sb.peers.mySeed().hash), - new Date(), - 1, - FailCategory.TEMPORARY_NETWORK_FAILURE, - "cannot load: " + result, -1); + CrawlQueues.this.errorURL.push(this.request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "cannot load: " + result, -1); this.request.setStatus("worker-error", WorkflowJob.STATUS_FINISHED); } else { this.request.setStatus("worker-processed", WorkflowJob.STATUS_FINISHED); } } } catch (final Exception e) { - CrawlQueues.this.errorURL.push( - this.request, - profile, - ASCII.getBytes(CrawlQueues.this.sb.peers.mySeed().hash), - new Date(), - 1, - FailCategory.TEMPORARY_NETWORK_FAILURE, - e.getMessage() + " - in worker", -1); + CrawlQueues.this.errorURL.push(this.request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, e.getMessage() + " - in worker", -1); ConcurrentLog.logException(e); this.request.setStatus("worker-exception", WorkflowJob.STATUS_FINISHED); } finally { diff --git 
a/source/net/yacy/crawler/data/ZURL.java b/source/net/yacy/crawler/data/ZURL.java deleted file mode 100644 index af0c47a4c..000000000 --- a/source/net/yacy/crawler/data/ZURL.java +++ /dev/null @@ -1,365 +0,0 @@ -// plasmaCrawlZURL.java -// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany -// first published 15.03.2007 on http://www.anomic.de -// -// This is a part of YaCy, a peer-to-peer based web search engine -// -// $LastChangedDate$ -// $LastChangedRevision$ -// $LastChangedBy$ -// -// LICENSE -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - -package net.yacy.crawler.data; - -import java.io.File; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Date; -import java.util.Iterator; -import java.util.List; -import java.util.Queue; -import java.util.concurrent.LinkedBlockingQueue; - -import org.apache.solr.common.SolrInputDocument; - -import net.yacy.cora.document.encoding.UTF8; -import net.yacy.cora.document.id.DigestURL; -import net.yacy.cora.federate.solr.FailType; -import net.yacy.cora.order.Base64Order; -import net.yacy.cora.order.NaturalOrder; -import net.yacy.cora.util.ConcurrentLog; -import net.yacy.cora.util.SpaceExceededException; -import net.yacy.crawler.retrieval.Request; -import net.yacy.kelondro.data.word.Word; -import net.yacy.kelondro.index.Index; -import net.yacy.kelondro.index.Row; -import net.yacy.kelondro.table.SplitTable; -import net.yacy.kelondro.table.Table; -import net.yacy.kelondro.util.FileUtils; -import net.yacy.search.index.Fulltext; - -public class ZURL implements Iterable { - - private static ConcurrentLog log = new ConcurrentLog("REJECTED"); - - private static final int EcoFSBufferSize = 2000; - private static final int maxStackSize = 1000; - - public enum FailCategory { - // TEMPORARY categories are such failure cases that should be tried again - // FINAL categories are such failure cases that are final and should not be tried again - TEMPORARY_NETWORK_FAILURE(true, FailType.fail), // an entity could not been loaded - FINAL_PROCESS_CONTEXT(false, FailType.excl), // because of a processing context we do not want that url again (i.e. 
remote crawling) - FINAL_LOAD_CONTEXT(false, FailType.excl), // the crawler configuration does not want to load the entity - FINAL_ROBOTS_RULE(true, FailType.excl), // a remote server denies indexing or loading - FINAL_REDIRECT_RULE(true, FailType.excl); // the remote server redirects this page, thus disallowing reading of content - - public final boolean store; - public final FailType failType; - - private FailCategory(boolean store, FailType failType) { - this.store = store; - this.failType = failType; - } - } - - private final static Row rowdef = new Row( - "String urlhash-" + Word.commonHashLength + ", " + // the url's hash - "String executor-" + Word.commonHashLength + ", " + // the crawling executor - "Cardinal workdate-8 {b256}, " + // the time when the url was last time tried to load - "Cardinal workcount-4 {b256}, " + // number of load retries - "String anycause-132, " + // string describing load failure - "byte[] entry-" + Request.rowdef.objectsize, // extra space - Base64Order.enhancedCoder - ); - - // the class object - private Index urlIndex; - private final Queue stack; - private final Fulltext fulltext; - - protected ZURL( - final Fulltext fulltext, - final File cachePath, - final String tablename, - final boolean startWithEmptyFile, - final boolean useTailCache, - final boolean exceed134217727) { - this.fulltext = fulltext; - // creates a new ZURL in a file - cachePath.mkdirs(); - final File f = new File(cachePath, tablename); - if (startWithEmptyFile) { - if (f.exists()) { - if (f.isDirectory()) SplitTable.delete(cachePath, tablename); else FileUtils.deletedelete(f); - } - } - try { - this.urlIndex = new Table(f, rowdef, EcoFSBufferSize, 0, useTailCache, exceed134217727, true); - } catch (final SpaceExceededException e) { - try { - this.urlIndex = new Table(f, rowdef, 0, 0, false, exceed134217727, true); - } catch (final SpaceExceededException e1) { - ConcurrentLog.logException(e1); - } - } - //urlIndex = new kelondroFlexTable(cachePath, tablename, -1, rowdef, 0, true); - this.stack = new LinkedBlockingQueue(); - } - - protected void clear() throws IOException { - if (this.urlIndex != null) this.urlIndex.clear(); - if (this.stack != null) this.stack.clear(); - } - - protected void close() { - try {clear();} catch (final IOException e) {} - if (this.urlIndex != null) this.urlIndex.close(); - } - - public boolean remove(final byte[] hash) { - if (hash == null) return false; - //System.out.println("*** DEBUG ZURL " + this.urlIndex.filename() + " remove " + hash); - try { - Iterator i = ZURL.this.stack.iterator(); - while (i.hasNext()) { - byte[] b = i.next(); - if (NaturalOrder.naturalOrder.equal(hash, b)) i.remove(); - } - return this.urlIndex.delete(hash); - } catch (final IOException e) { - return false; - } - } - - public void removeHosts(final Iterable hosthashes, final boolean concurrent) { - if (hosthashes == null) return; - Thread t = new Thread() { - public void run() { - try { - Iterator i = ZURL.this.urlIndex.keys(true, null); - List r = new ArrayList(); - while (i.hasNext()) { - byte[] b = i.next(); - for (byte[] hosthash: hosthashes) { - if (NaturalOrder.naturalOrder.equal(hosthash, 0, b, 6, 6)) r.add(b); - } - } - for (byte[] b: r) ZURL.this.urlIndex.remove(b); - i = ZURL.this.stack.iterator(); - while (i.hasNext()) { - byte[] b = i.next(); - for (byte[] hosthash: hosthashes) { - if (NaturalOrder.naturalOrder.equal(hosthash, 0, b, 6, 6)) i.remove(); - } - } - } catch (final IOException e) {} - } - }; - if (concurrent) t.start(); else t.run(); - } - - public void 
push( - final Request bentry, - final CrawlProfile profile, - final byte[] executor, - final Date workdate, - final int workcount, - final FailCategory failCategory, - String anycause, - final int httpcode) { - // assert executor != null; // null == proxy ! - assert failCategory.store || httpcode == -1 : "failCategory=" + failCategory.name(); - if (exists(bentry.url().hash())) return; // don't insert double causes - if (anycause == null) anycause = "unknown"; - final String reason = anycause + ((httpcode >= 0) ? " (http return code = " + httpcode + ")" : ""); - final Entry entry = new Entry(bentry, executor, workdate, workcount, reason); - put(entry); - this.stack.add(entry.hash()); - if (!reason.startsWith("double")) log.info(bentry.url().toNormalform(true) + " - " + reason); - if (this.fulltext.getDefaultConnector() != null && failCategory.store) { - // send the error to solr - try { - SolrInputDocument errorDoc = this.fulltext.getDefaultConfiguration().err(bentry.url(), profile == null ? null : profile.collections(), failCategory.name() + " " + reason, failCategory.failType, httpcode); - this.fulltext.getDefaultConnector().add(errorDoc); - } catch (final IOException e) { - ConcurrentLog.warn("SOLR", "failed to send error " + bentry.url().toNormalform(true) + " to solr: " + e.getMessage()); - } - } - while (this.stack.size() > maxStackSize) this.stack.poll(); - } - - @Override - public Iterator iterator() { - return new EntryIterator(); - } - - public ArrayList list(int max) { - final ArrayList l = new ArrayList(); - DigestURL url; - for (final ZURL.Entry entry: this) { - if (entry == null) continue; - url = entry.url(); - if (url == null) continue; - l.add(entry); - if (max-- <= 0) l.remove(0); - } - return l; - } - - private class EntryIterator implements Iterator { - private final Iterator hi; - public EntryIterator() { - this.hi = ZURL.this.stack.iterator(); - } - @Override - public boolean hasNext() { - return this.hi.hasNext(); - } - - @Override - public ZURL.Entry next() { - return get(this.hi.next()); - } - - @Override - public void remove() { - this.hi.remove(); - } - - } - - public ZURL.Entry get(final byte[] urlhash) { - try { - if (this.urlIndex == null) return null; - // System.out.println("*** DEBUG ZURL " + this.urlIndex.filename() + " get " + urlhash); - final Row.Entry entry = this.urlIndex.get(urlhash, false); - if (entry == null) return null; - return new Entry(entry); - } catch (final IOException e) { - ConcurrentLog.logException(e); - return null; - } - } - - /** - * private put (use push instead) - * @param entry - */ - private void put(final Entry entry) { - // stores the values from the object variables into the database - if (entry.stored) return; - if (entry.bentry == null) return; - final Row.Entry newrow = rowdef.newEntry(); - newrow.setCol(0, entry.bentry.url().hash()); - newrow.setCol(1, entry.executor); - newrow.setCol(2, entry.workdate.getTime()); - newrow.setCol(3, entry.workcount); - newrow.setCol(4, UTF8.getBytes(entry.anycause)); - newrow.setCol(5, entry.bentry.toRow().bytes()); - try { - if (this.urlIndex != null) this.urlIndex.put(newrow); - entry.stored = true; - } catch (final Exception e) { - ConcurrentLog.logException(e); - } - } - - boolean exists(final byte[] urlHash) { - return this.urlIndex.has(urlHash); - } - - public void clearStack() { - this.stack.clear(); - } - - public int stackSize() { - return this.stack.size(); - } - - public class Entry { - - private Request bentry; // the balancer entry - private final byte[] executor; // the 
crawling executor - private final Date workdate; // the time when the url was last time tried to load - private final int workcount; // number of tryings - private final String anycause; // string describing reason for load fail - private boolean stored; - - private Entry( - final Request bentry, - final byte[] executor, - final Date workdate, - final int workcount, - final String anycause) { - // create new entry - assert bentry != null; - // assert executor != null; // null == proxy ! - this.bentry = bentry; - this.executor = executor; - this.workdate = (workdate == null) ? new Date() : workdate; - this.workcount = workcount; - this.anycause = (anycause == null) ? "" : anycause; - this.stored = false; - } - - private Entry(final Row.Entry entry) throws IOException { - assert (entry != null); - this.executor = entry.getColBytes(1, true); - this.workdate = new Date(entry.getColLong(2)); - this.workcount = (int) entry.getColLong(3); - this.anycause = entry.getColUTF8(4); - this.bentry = new Request(Request.rowdef.newEntry(entry.getColBytes(5, false))); - assert (Base64Order.enhancedCoder.equal(entry.getPrimaryKeyBytes(), this.bentry.url().hash())); - this.stored = true; - return; - } - - public DigestURL url() { - return this.bentry.url(); - } - - public byte[] initiator() { - return this.bentry.initiator(); - } - - private byte[] hash() { - // return a url-hash, based on the md5 algorithm - // the result is a String of 12 bytes within a 72-bit space - // (each byte has an 6-bit range) - // that should be enough for all web pages on the world - return this.bentry.url().hash(); - } - - public Date workdate() { - return this.workdate; - } - - public byte[] executor() { - // return the creator's hash - return this.executor; - } - - public String anycause() { - return this.anycause; - } - - } - -} - diff --git a/source/net/yacy/crawler/retrieval/FTPLoader.java b/source/net/yacy/crawler/retrieval/FTPLoader.java index 148853636..81bc12e68 100644 --- a/source/net/yacy/crawler/retrieval/FTPLoader.java +++ b/source/net/yacy/crawler/retrieval/FTPLoader.java @@ -36,6 +36,7 @@ import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.MultiProtocolURL; +import net.yacy.cora.federate.solr.FailCategory; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.ResponseHeader; @@ -43,7 +44,6 @@ import net.yacy.cora.protocol.ftp.FTPClient; import net.yacy.cora.util.ConcurrentLog; import net.yacy.crawler.data.CrawlProfile; import net.yacy.crawler.data.Latency; -import net.yacy.crawler.data.ZURL.FailCategory; import net.yacy.document.TextParser; import net.yacy.search.Switchboard; @@ -156,7 +156,7 @@ public class FTPLoader { if (berr.size() > 0 || response == null) { // some error logging final String detail = (berr.size() > 0) ? 
"Errorlog: " + berr.toString() : ""; - this.sb.crawlQueues.errorURL.push(request, profile, ASCII.getBytes(this.sb.peers.mySeed().hash), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, " ftp server download, " + detail, -1); + this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, " ftp server download, " + detail, -1); throw new IOException("FTPLoader: Unable to download URL '" + request.url().toString() + "': " + detail); } diff --git a/source/net/yacy/crawler/retrieval/HTTPLoader.java b/source/net/yacy/crawler/retrieval/HTTPLoader.java index 2383cc128..5d9982be1 100644 --- a/source/net/yacy/crawler/retrieval/HTTPLoader.java +++ b/source/net/yacy/crawler/retrieval/HTTPLoader.java @@ -25,10 +25,9 @@ package net.yacy.crawler.retrieval; import java.io.IOException; -import java.util.Date; -import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.federate.solr.FailCategory; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; @@ -37,7 +36,6 @@ import net.yacy.cora.protocol.http.HTTPClient; import net.yacy.cora.util.ConcurrentLog; import net.yacy.crawler.data.CrawlProfile; import net.yacy.crawler.data.Latency; -import net.yacy.crawler.data.ZURL.FailCategory; import net.yacy.kelondro.io.ByteCount; import net.yacy.repository.Blacklist.BlacklistType; import net.yacy.search.Switchboard; @@ -79,10 +77,8 @@ public final class HTTPLoader { private Response load(final Request request, CrawlProfile profile, final int retryCount, final int maxFileSize, final BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException { - byte[] myHash = ASCII.getBytes(this.sb.peers.mySeed().hash); - if (retryCount < 0) { - this.sb.crawlQueues.errorURL.push(request, profile, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "retry counter exceeded", -1); + this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "retry counter exceeded", -1); throw new IOException("retry counter exceeded for URL " + request.url().toString() + ". Processing aborted."); } @@ -98,7 +94,7 @@ public final class HTTPLoader { // check if url is in blacklist final String hostlow = host.toLowerCase(); if (blacklistType != null && Switchboard.urlBlacklist.isListed(blacklistType, hostlow, path)) { - this.sb.crawlQueues.errorURL.push(request, profile, myHash, new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1); + this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1); throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist."); } @@ -145,7 +141,7 @@ public final class HTTPLoader { redirectionUrlString = redirectionUrlString == null ? 
"" : redirectionUrlString.trim(); if (redirectionUrlString.isEmpty()) { - this.sb.crawlQueues.errorURL.push(request, profile, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "no redirection url provided, field '" + HeaderFramework.LOCATION + "' is empty", statusCode); + this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "no redirection url provided, field '" + HeaderFramework.LOCATION + "' is empty", statusCode); throw new IOException("REJECTED EMTPY REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString); } @@ -159,13 +155,13 @@ public final class HTTPLoader { this.sb.webStructure.generateCitationReference(url, redirectionUrl); if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_RECORD_REDIRECTS, true)) { - this.sb.crawlQueues.errorURL.push(request, profile, myHash, new Date(), 1, FailCategory.FINAL_REDIRECT_RULE, "redirect to " + redirectionUrlString, statusCode); + this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.FINAL_REDIRECT_RULE, "redirect to " + redirectionUrlString, statusCode); } if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true)) { // if we are already doing a shutdown we don't need to retry crawling if (Thread.currentThread().isInterrupted()) { - this.sb.crawlQueues.errorURL.push(request, profile, myHash, new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "server shutdown", statusCode); + this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.FINAL_LOAD_CONTEXT, "server shutdown", statusCode); throw new IOException("CRAWLER Retry of URL=" + requestURLString + " aborted because of server shutdown."); } @@ -174,11 +170,11 @@ public final class HTTPLoader { return load(request, profile, retryCount - 1, maxFileSize, blacklistType, agent); } // we don't want to follow redirects - this.sb.crawlQueues.errorURL.push(request, profile, myHash, new Date(), 1, FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode); + this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode); throw new IOException("REJECTED UNWANTED REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString); } else if (responseBody == null) { // no response, reject file - this.sb.crawlQueues.errorURL.push(request, profile, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "no response body", statusCode); + this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "no response body", statusCode); throw new IOException("REJECTED EMPTY RESPONSE BODY '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString); } else if (statusCode == 200 || statusCode == 203) { // the transfer is ok @@ -189,7 +185,7 @@ public final class HTTPLoader { // check length again in case it was not possible to get the length before loading if (maxFileSize >= 0 && contentLength > maxFileSize) { - this.sb.crawlQueues.errorURL.push(request, profile, myHash, new Date(), 1, FailCategory.FINAL_PROCESS_CONTEXT, "file size limit exceeded", statusCode); + this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "file size limit exceeded", statusCode); throw new IOException("REJECTED URL " + request.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes. 
(GET)"); } @@ -206,7 +202,7 @@ public final class HTTPLoader { return response; } else { // if the response has not the right response type then reject file - this.sb.crawlQueues.errorURL.push(request, profile, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", statusCode); + this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", statusCode); throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString); } } diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java index 6ad3273ef..7f7d59836 100644 --- a/source/net/yacy/repository/LoaderDispatcher.java +++ b/source/net/yacy/repository/LoaderDispatcher.java @@ -42,6 +42,7 @@ import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.federate.solr.FailCategory; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.HeaderFramework; @@ -50,7 +51,6 @@ import net.yacy.cora.protocol.ResponseHeader; import net.yacy.cora.util.ConcurrentLog; import net.yacy.crawler.data.Cache; import net.yacy.crawler.data.CrawlProfile; -import net.yacy.crawler.data.ZURL.FailCategory; import net.yacy.crawler.retrieval.FTPLoader; import net.yacy.crawler.retrieval.FileLoader; import net.yacy.crawler.retrieval.HTTPLoader; @@ -191,7 +191,7 @@ public final class LoaderDispatcher { // check if url is in blacklist if (blacklistType != null && host != null && Switchboard.urlBlacklist.isListed(blacklistType, host.toLowerCase(), url.getFile())) { - this.sb.crawlQueues.errorURL.push(request, crawlProfile, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1); + this.sb.crawlQueues.errorURL.push(request.url(), crawlProfile, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1); throw new IOException("DISPATCHER Rejecting URL '" + request.url().toString() + "'. 
URL is in blacklist."); } diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index f2d7d5943..170d5ef30 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -97,6 +97,7 @@ import net.yacy.cora.document.feed.RSSReader; import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.MultiProtocolURL; +import net.yacy.cora.federate.solr.FailCategory; import net.yacy.cora.federate.solr.Ranking; import net.yacy.cora.federate.solr.SchemaConfiguration; import net.yacy.cora.federate.solr.instance.RemoteInstance; @@ -127,7 +128,6 @@ import net.yacy.crawler.data.ResultImages; import net.yacy.crawler.data.ResultURLs; import net.yacy.crawler.data.NoticedURL.StackType; import net.yacy.crawler.data.ResultURLs.EventOrigin; -import net.yacy.crawler.data.ZURL.FailCategory; import net.yacy.crawler.retrieval.Request; import net.yacy.crawler.retrieval.Response; import net.yacy.crawler.robots.RobotsTxt; @@ -1789,16 +1789,9 @@ public final class Switchboard extends serverSwitch { // in the noIndexReason is set, indexing is not allowed if ( noIndexReason != null ) { // log cause and close queue - final DigestURL referrerURL = response.referrerURL(); //if (log.isFine()) log.logFine("deQueue: not indexed any word in URL " + response.url() + "; cause: " + noIndexReason); - addURLtoErrorDB( - response.url(), - response.profile(), - (referrerURL == null) ? null : referrerURL.hash(), - response.initiator(), - response.name(), - FailCategory.FINAL_PROCESS_CONTEXT, - noIndexReason); + // create a new errorURL DB entry + this.crawlQueues.errorURL.push(response.url(), response.profile(), FailCategory.FINAL_PROCESS_CONTEXT, noIndexReason, -1); // finish this entry return "not allowed: " + noIndexReason; } @@ -1991,7 +1984,7 @@ public final class Switchboard extends serverSwitch { public int cleanupJobSize() { int c = 1; // "es gibt immer was zu tun" - if ( (this.crawlQueues.delegatedURL.stackSize() > 1000) ) { + if ( (this.crawlQueues.delegatedURL.size() > 1000) ) { c++; } if ( (this.crawlQueues.errorURL.stackSize() > 1000) ) { @@ -2101,13 +2094,13 @@ public final class Switchboard extends serverSwitch { // clean up delegated stack checkInterruption(); - if ( (this.crawlQueues.delegatedURL.stackSize() > 1000) ) { + if ( (this.crawlQueues.delegatedURL.size() > 1000) ) { if ( this.log.isFine() ) { this.log.fine("Cleaning Delegated-URLs report stack, " - + this.crawlQueues.delegatedURL.stackSize() + + this.crawlQueues.delegatedURL.size() + " entries on stack"); } - this.crawlQueues.delegatedURL.clearStack(); + this.crawlQueues.delegatedURL.clear(); } // clean up error stack @@ -2428,7 +2421,6 @@ public final class Switchboard extends serverSwitch { public IndexingQueueEntry parseDocument(final IndexingQueueEntry in) { in.queueEntry.updateStatus(Response.QUEUE_STATE_PARSING); - Document[] documents = null; try { documents = parseDocument(in.queueEntry); @@ -2439,7 +2431,7 @@ public final class Switchboard extends serverSwitch { } if ( documents == null ) { return null; - } + } return new IndexingQueueEntry(in.queueEntry, documents, null); } @@ -2465,14 +2457,8 @@ public final class Switchboard extends serverSwitch { response.setContent(Cache.getContent(response.url().hash())); if ( response.getContent() == null ) { this.log.warn("the resource '" + response.url() + "' is missing in the cache."); - addURLtoErrorDB( - response.url(), - response.profile(), - response.referrerHash(), 
- response.initiator(), - response.name(), - FailCategory.FINAL_LOAD_CONTEXT, - "missing in cache"); + // create a new errorURL DB entry + this.crawlQueues.errorURL.push(response.url(), response.profile(), FailCategory.FINAL_LOAD_CONTEXT, "missing in cache", -1); return null; } } @@ -2490,20 +2476,37 @@ } } catch (final Parser.Failure e ) { this.log.warn("Unable to parse the resource '" + response.url() + "'. " + e.getMessage()); - addURLtoErrorDB( - response.url(), - response.profile(), - response.referrerHash(), - response.initiator(), - response.name(), - FailCategory.FINAL_PROCESS_CONTEXT, - e.getMessage()); + // create a new errorURL DB entry + this.crawlQueues.errorURL.push(response.url(), response.profile(), FailCategory.FINAL_PROCESS_CONTEXT, e.getMessage(), -1); return null; } - final long parsingEndTime = System.currentTimeMillis(); + + // put anchors on crawl stack final long stackStartTime = System.currentTimeMillis(); + // check that the documents have wanted urls; this is not a workaround for a bug: it is possible that + // e.g. the result of a feed parsing produces documents from domains which shall be filtered out by the crawl profile + if (response.profile() != null) { + ArrayList<Document> newDocs = new ArrayList<Document>(); + for (Document doc: documents) { + String rejectReason = this.crawlStacker.checkAcceptance(doc.dc_source(), response.profile(), 1 /*depth is irrelevant here, we just make clear it is not the start url*/); + if (rejectReason == null) { + newDocs.add(doc); + } else { + // we record these as fail urls so that the problem can be tracked + if (!rejectReason.startsWith("double in")) { + final CrawlProfile profile = this.crawler.getActive(UTF8.getBytes(response.profile().handle())); + this.crawlStacker.nextQueue.errorURL.push(response.url(), profile, FailCategory.FINAL_LOAD_CONTEXT, rejectReason, -1); + } + } + } + if (newDocs.size() != documents.length) { + documents = newDocs.toArray(new Document[newDocs.size()]); + } + } + + // collect anchors within remaining documents if ((processCase == EventOrigin.PROXY_LOAD || processCase == EventOrigin.LOCAL_CRAWLING) && ( response.profile() == null || @@ -2592,14 +2595,8 @@ if (!(profile.indexUrlMustMatchPattern() == CrawlProfile.MATCH_ALL_PATTERN || profile.indexUrlMustMatchPattern().matcher(urls).matches()) || (profile.indexUrlMustNotMatchPattern() != CrawlProfile.MATCH_NEVER_PATTERN && profile.indexUrlMustNotMatchPattern().matcher(urls).matches())) { if (this.log.isInfo()) this.log.info("Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + profile.indexUrlMustMatchPattern().pattern() + ", indexUrlMustNotMatchPattern = " + profile.indexUrlMustNotMatchPattern().pattern()); - addURLtoErrorDB( - in.queueEntry.url(), - profile, - in.queueEntry.referrerHash(), - in.queueEntry.initiator(), - in.queueEntry.name(), - FailCategory.FINAL_PROCESS_CONTEXT, - "indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + profile.indexUrlMustMatchPattern().pattern() + ", indexUrlMustNotMatchPattern = " + profile.indexUrlMustNotMatchPattern().pattern()); + // create a new errorURL DB entry + this.crawlQueues.errorURL.push(in.queueEntry.url(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + profile.indexUrlMustMatchPattern().pattern() + ", indexUrlMustNotMatchPattern = " +
profile.indexUrlMustNotMatchPattern().pattern(), -1); return new IndexingQueueEntry(in.queueEntry, in.documents, null); } @@ -2608,27 +2605,15 @@ public final class Switchboard extends serverSwitch { docloop: for (final Document document : in.documents) { if (document.indexingDenied() && profile.obeyHtmlRobotsNoindex()) { if (this.log.isInfo()) this.log.info("Not Condensed Resource '" + urls + "': denied by document-attached noindexing rule"); - addURLtoErrorDB( - in.queueEntry.url(), - profile, - in.queueEntry.referrerHash(), - in.queueEntry.initiator(), - in.queueEntry.name(), - FailCategory.FINAL_PROCESS_CONTEXT, - "denied by document-attached noindexing rule"); + // create a new errorURL DB entry + this.crawlQueues.errorURL.push(in.queueEntry.url(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "denied by document-attached noindexing rule", -1); continue docloop; } if (!(profile.indexContentMustMatchPattern() == CrawlProfile.MATCH_ALL_PATTERN || profile.indexContentMustMatchPattern().matcher(document.getTextString()).matches()) || (profile.indexContentMustNotMatchPattern() != CrawlProfile.MATCH_NEVER_PATTERN && profile.indexContentMustNotMatchPattern().matcher(document.getTextString()).matches())) { if (this.log.isInfo()) this.log.info("Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustMatchPattern = " + profile.indexContentMustMatchPattern().pattern() + ", indexContentMustNotMatchPattern = " + profile.indexContentMustNotMatchPattern().pattern()); - addURLtoErrorDB( - in.queueEntry.url(), - profile, - in.queueEntry.referrerHash(), - in.queueEntry.initiator(), - in.queueEntry.name(), - FailCategory.FINAL_PROCESS_CONTEXT, - "indexing prevented by regular expression on content; indexContentMustMatchPattern = " + profile.indexContentMustMatchPattern().pattern() + ", indexContentMustNotMatchPattern = " + profile.indexContentMustNotMatchPattern().pattern()); + // create a new errorURL DB entry + this.crawlQueues.errorURL.push(in.queueEntry.url(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "indexing prevented by regular expression on content; indexContentMustMatchPattern = " + profile.indexContentMustMatchPattern().pattern() + ", indexContentMustNotMatchPattern = " + profile.indexContentMustNotMatchPattern().pattern(), -1); continue docloop; } doclist.add(document); @@ -2705,30 +2690,18 @@ public final class Switchboard extends serverSwitch { if (condenser == null || (document.indexingDenied() && profile.obeyHtmlRobotsNoindex())) { //if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by rule in document, process case=" + processCase); - addURLtoErrorDB( - url, - profile, - (referrerURL == null) ? null : referrerURL.hash(), - queueEntry.initiator(), - dc_title, - FailCategory.FINAL_PROCESS_CONTEXT, - "denied by rule in document, process case=" + processCase); + // create a new errorURL DB entry + this.crawlQueues.errorURL.push(url, profile, FailCategory.FINAL_PROCESS_CONTEXT, "denied by rule in document, process case=" + processCase, -1); return; } if ( profile != null && !profile.indexText() && !profile.indexMedia() ) { //if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by profile rule, process case=" + processCase + ", profile name = " + queueEntry.profile().name()); - addURLtoErrorDB( - url, - profile, - (referrerURL == null) ? 
@@ -2705,30 +2690,18 @@ public final class Switchboard extends serverSwitch {
         if (condenser == null || (document.indexingDenied() && profile.obeyHtmlRobotsNoindex())) {
             //if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by rule in document, process case=" + processCase);
-            addURLtoErrorDB(
-                url,
-                profile,
-                (referrerURL == null) ? null : referrerURL.hash(),
-                queueEntry.initiator(),
-                dc_title,
-                FailCategory.FINAL_PROCESS_CONTEXT,
-                "denied by rule in document, process case=" + processCase);
+            // create a new errorURL DB entry
+            this.crawlQueues.errorURL.push(url, profile, FailCategory.FINAL_PROCESS_CONTEXT, "denied by rule in document, process case=" + processCase, -1);
             return;
         }

         if ( profile != null && !profile.indexText() && !profile.indexMedia() ) {
             //if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by profile rule, process case=" + processCase + ", profile name = " + queueEntry.profile().name());
-            addURLtoErrorDB(
-                url,
-                profile,
-                (referrerURL == null) ? null : referrerURL.hash(),
-                queueEntry.initiator(),
-                dc_title,
-                FailCategory.FINAL_LOAD_CONTEXT,
-                "denied by profile rule, process case="
-                    + processCase
-                    + ", profile name = "
-                    + profile.collectionName());
+            // create a new errorURL DB entry
+            this.crawlQueues.errorURL.push(url, profile, FailCategory.FINAL_LOAD_CONTEXT, "denied by profile rule, process case="
+                + processCase
+                + ", profile name = "
+                + profile.collectionName(), -1);
             return;
         }
@@ -2906,7 +2879,7 @@ public final class Switchboard extends serverSwitch {
         // remove the document from the error-db
         byte[] hosthash = new byte[6]; System.arraycopy(urlhash, 6, hosthash, 0, 6);
         List<byte[]> hosthashes = new ArrayList<byte[]>(); hosthashes.add(hosthash);
-        this.crawlQueues.errorURL.removeHosts(hosthashes, false);
+        this.crawlQueues.errorURL.removeHosts(hosthashes);
         this.crawlQueues.removeURL(urlhash);

         // get a scraper to get the title
@@ -3373,31 +3346,6 @@ public final class Switchboard extends serverSwitch {
         return hasDoneSomething;
     }

-    private void addURLtoErrorDB(
-        final DigestURL url,
-        final CrawlProfile profile,
-        final byte[] referrerHash,
-        final byte[] initiator,
-        final String name,
-        final FailCategory failCategory,
-        final String failreason) {
-        // assert initiator != null; // null == proxy
-        // create a new errorURL DB entry
-        final Request bentry =
-            new Request(
-                initiator,
-                url,
-                referrerHash,
-                (name == null) ? "" : name,
-                new Date(),
-                null,
-                0,
-                0,
-                0,
-                0);
-        this.crawlQueues.errorURL.push(bentry, profile, initiator, new Date(), 0, failCategory, failreason, -1);
-    }
-
     public final void heuristicSite(final SearchEvent searchEvent, final String host) {
         new Thread() {
             @Override
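
With the hunks above, the private addURLtoErrorDB() wrapper is gone and every call site reports failures directly through errorURL.push(url, profile, failCategory, reason, httpcode). The removeHosts() call also exposes the hash layout the new ErrorCache relies on: a YaCy URL hash is 12 bytes and its last 6 bytes are the host hash. A small sketch of that extraction, with a made-up hash value:

    import java.util.ArrayList;
    import java.util.List;

    public class HostHashSketch {
        public static void main(String[] args) {
            byte[] urlhash = "AAAAAAbbbbbb".getBytes(); // illustrative 12-byte hash
            byte[] hosthash = new byte[6];
            System.arraycopy(urlhash, 6, hosthash, 0, 6); // same copy as in the patch
            List<byte[]> hosthashes = new ArrayList<byte[]>();
            hosthashes.add(hosthash);
            System.out.println(new String(hosthash)); // bbbbbb
        }
    }
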
diff --git a/source/net/yacy/search/index/ErrorCache.java b/source/net/yacy/search/index/ErrorCache.java
new file mode 100644
index 000000000..e0ac6c42d
--- /dev/null
+++ b/source/net/yacy/search/index/ErrorCache.java
@@ -0,0 +1,173 @@
+/**
+ *  ErrorCache
+ *  Copyright 2013 by Michael Peter Christen
+ *  First released 17.10.2013 at http://yacy.net
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ *
+ *  This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ *  Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public License
+ *  along with this program in the file lgpl21.txt
+ *  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+package net.yacy.search.index;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.LinkedHashMap;
+
+import org.apache.solr.client.solrj.SolrQuery;
+import org.apache.solr.client.solrj.SolrQuery.SortClause;
+import org.apache.solr.client.solrj.response.QueryResponse;
+import org.apache.solr.common.SolrDocument;
+import org.apache.solr.common.SolrDocumentList;
+import org.apache.solr.common.SolrInputDocument;
+
+import net.yacy.cora.document.encoding.ASCII;
+import net.yacy.cora.document.id.DigestURL;
+import net.yacy.cora.federate.solr.FailCategory;
+import net.yacy.cora.order.NaturalOrder;
+import net.yacy.cora.util.ConcurrentLog;
+import net.yacy.crawler.data.CrawlProfile;
+import net.yacy.search.schema.CollectionConfiguration;
+import net.yacy.search.schema.CollectionSchema;
+
+public class ErrorCache {
+
+    private static ConcurrentLog log = new ConcurrentLog("REJECTED");
+    private static final int maxStackSize = 1000;
+
+    // the class object
+    private final LinkedHashMap<String, CollectionConfiguration.FailDoc> stack;
+    private final Fulltext fulltext;
+
+    public ErrorCache(final Fulltext fulltext) {
+        this.fulltext = fulltext;
+        this.stack = new LinkedHashMap<String, CollectionConfiguration.FailDoc>();
+        try {
+            // fill stack with latest values
+            final SolrQuery params = new SolrQuery();
+            params.setParam("defType", "edismax");
+            params.setStart(0);
+            params.setRows(100);
+            params.setFacet(false);
+            params.setSort(new SortClause(CollectionSchema.last_modified.getSolrFieldName(), SolrQuery.ORDER.desc));
+            params.setQuery(CollectionSchema.failreason_s.getSolrFieldName() + ":[* TO *]");
+            QueryResponse rsp = fulltext.getDefaultConnector().getResponseByParams(params);
+            SolrDocumentList docList = rsp == null ? null : rsp.getResults();
+            if (docList != null) for (int i = docList.size() - 1; i >= 0; i--) {
+                CollectionConfiguration.FailDoc failDoc = new CollectionConfiguration.FailDoc(docList.get(i));
+                this.stack.put(ASCII.String(failDoc.getDigestURL().hash()), failDoc);
+            }
+        } catch (final Throwable e) {
+            // a failure to warm up the stack is not fatal; start with an empty stack
+        }
+    }
+
+    public void clear() throws IOException {
+        if (this.stack != null) this.stack.clear();
+        this.fulltext.getDefaultConnector().deleteByQuery(CollectionSchema.failreason_s.getSolrFieldName() + ":[* TO *]");
+    }
+
+    public void remove(final String hash) {
+        if (hash == null) return;
+        this.stack.remove(hash);
+        try {
+            this.fulltext.getDefaultConnector().deleteByQuery(CollectionSchema.id.getSolrFieldName() + ":\"" + hash + "\" AND " + CollectionSchema.failreason_s.getSolrFieldName() + ":[* TO *]");
+        } catch (final IOException e) {
+            return;
+        }
+    }
+
+    public void removeHosts(final Iterable<byte[]> hosthashes) {
+        if (hosthashes == null) return;
+        try {
+            for (byte[] hosthash : hosthashes) {
+                this.fulltext.getDefaultConnector().deleteByQuery(CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + ASCII.String(hosthash) + "\" AND " + CollectionSchema.failreason_s.getSolrFieldName() + ":[* TO *]");
+            }
+            Iterator<String> i = this.stack.keySet().iterator();
+            while (i.hasNext()) {
+                String b = i.next();
+                // the host hash is the 6-byte suffix of the 12-byte url hash key
+                for (byte[] hosthash : hosthashes) {
+                    if (NaturalOrder.naturalOrder.equal(hosthash, 0, ASCII.getBytes(b), 6, 6)) { i.remove(); break; }
+                }
+            }
+        } catch (final IOException e) {
+        }
+    }
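
Two details of the class body above deserve a note. The constructor warms the in-memory stack with the 100 most recently modified Solr documents that carry a failreason_s value, and removeHosts() purges both the Solr core (per-host delete-by-query) and the stack (suffix comparison on the url hash). A sketch of the warm-up query as plain SolrJ, with the field names hard-coded as they appear in the patch; building the query contacts no server:

    import org.apache.solr.client.solrj.SolrQuery;

    public class WarmupQuerySketch {
        public static void main(String[] args) {
            SolrQuery params = new SolrQuery();
            params.setQuery("failreason_s:[* TO *]"); // only error documents
            params.setStart(0);
            params.setRows(100);
            params.setSort(new SolrQuery.SortClause("last_modified", SolrQuery.ORDER.desc));
            System.out.println(params.toString());
        }
    }
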
+    public void push(final DigestURL url, final CrawlProfile profile, final FailCategory failCategory, String anycause, final int httpcode) {
+        // assert executor != null; // null == proxy !
+        assert failCategory.store || httpcode == -1 : "failCategory=" + failCategory.name();
+        if (exists(url.hash()))
+            return; // don't insert double causes
+        if (anycause == null) anycause = "unknown";
+        final String reason = anycause + ((httpcode >= 0) ? " (http return code = " + httpcode + ")" : "");
+        if (!reason.startsWith("double")) log.info(url.toNormalform(true) + " - " + reason);
+        CollectionConfiguration.FailDoc failDoc = new CollectionConfiguration.FailDoc(
+                url, profile == null ? null : profile.collections(),
+                failCategory.name() + " " + reason, failCategory.failType,
+                httpcode);
+        this.stack.put(ASCII.String(url.hash()), failDoc);
+        if (this.fulltext.getDefaultConnector() != null && failCategory.store) {
+            // send the error to solr
+            try {
+                SolrInputDocument errorDoc = failDoc.toSolr(this.fulltext.getDefaultConfiguration());
+                this.fulltext.getDefaultConnector().add(errorDoc);
+            } catch (final IOException e) {
+                ConcurrentLog.warn("SOLR", "failed to send error " + url.toNormalform(true) + " to solr: " + e.getMessage());
+            }
+        }
+        // trim the stack to its maximum size by evicting the oldest entries
+        while (this.stack.size() > maxStackSize)
+            this.stack.remove(this.stack.keySet().iterator().next());
+    }
+
+    public ArrayList<CollectionConfiguration.FailDoc> list(int max) {
+        final ArrayList<CollectionConfiguration.FailDoc> l = new ArrayList<CollectionConfiguration.FailDoc>();
+        Iterator<CollectionConfiguration.FailDoc> fdi = this.stack.values().iterator();
+        // skip the oldest entries so that at most max elements are returned
+        for (int i = 0; i < this.stack.size() - max; i++) fdi.next();
+        while (fdi.hasNext()) l.add(fdi.next());
+        return l;
+    }
+
+    public CollectionConfiguration.FailDoc get(final String urlhash) {
+        CollectionConfiguration.FailDoc fd = this.stack.get(urlhash);
+        if (fd != null) return fd;
+        try {
+            SolrDocument doc = this.fulltext.getDefaultConnector().getDocumentById(urlhash);
+            if (doc == null) return null;
+            return new CollectionConfiguration.FailDoc(doc);
+        } catch (final IOException e) {
+            ConcurrentLog.logException(e);
+            return null;
+        }
+    }
+
+    public boolean exists(final byte[] urlHash) {
+        try {
+            return this.fulltext.getDefaultConnector().existsByQuery(CollectionSchema.id.getSolrFieldName() + ":\"" + ASCII.String(urlHash) + "\" AND " + CollectionSchema.failreason_s.getSolrFieldName() + ":[* TO *]");
+        } catch (final IOException e) {
+            return false;
+        }
+    }
+
+    public void clearStack() {
+        this.stack.clear();
+    }
+
+    public int stackSize() {
+        return this.stack.size();
+    }
+
+}
+
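
The eviction at the end of push() is worth isolating: LinkedHashMap keeps insertion order, so removing keySet().iterator().next() always discards the oldest entry once the stack exceeds maxStackSize. A standalone demonstration of that bounded-stack behavior, with the cap shrunk to 3 for readability:

    import java.util.LinkedHashMap;

    public class BoundedStackSketch {
        public static void main(String[] args) {
            final int maxStackSize = 3; // 1000 in ErrorCache
            LinkedHashMap<String, String> stack = new LinkedHashMap<String, String>();
            for (int i = 0; i < 5; i++) {
                stack.put("urlhash" + i, "reason" + i);
                // evict in insertion order once the cap is exceeded, as push() does
                while (stack.size() > maxStackSize) {
                    stack.remove(stack.keySet().iterator().next());
                }
            }
            System.out.println(stack.keySet()); // [urlhash2, urlhash3, urlhash4]
        }
    }
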
diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java
index 9c2db013d..36273649b 100644
--- a/source/net/yacy/search/schema/CollectionConfiguration.java
+++ b/source/net/yacy/search/schema/CollectionConfiguration.java
@@ -80,6 +80,7 @@ import net.yacy.kelondro.util.Bitfield;
 import net.yacy.search.index.Segment;
 import net.yacy.search.index.Segment.ReferenceReport;
 import net.yacy.search.index.Segment.ReferenceReportCache;
+import net.yacy.search.query.QueryParams;
 import net.yacy.search.schema.WebgraphConfiguration.Subgraph;

 import org.apache.solr.common.SolrDocument;
@@ -1195,34 +1196,73 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
         return il;
     }
     */
-
-    /**
-     * register an entry as error document
-     * @param digestURI
-     * @param failReason
-     * @param httpstatus
-     * @throws IOException
-     */
-    public SolrInputDocument err(final DigestURL digestURI, final Map<String, Pattern> collections, final String failReason, final FailType failType, final int httpstatus) throws IOException {
-        boolean allAttr = this.isEmpty();
-        assert allAttr || contains(CollectionSchema.failreason_s);
-
-        final SolrInputDocument doc = new SolrInputDocument();
-        String url = addURIAttributes(doc, allAttr, digestURI, Response.docType(digestURI));
-        if (allAttr || contains(CollectionSchema.load_date_dt)) add(doc, CollectionSchema.load_date_dt, new Date());
-
-        // fail reason and status
-        if (allAttr || contains(CollectionSchema.failreason_s)) add(doc, CollectionSchema.failreason_s, failReason);
-        if (allAttr || contains(CollectionSchema.failtype_s)) add(doc, CollectionSchema.failtype_s, failType.name());
-        if (allAttr || contains(CollectionSchema.httpstatus_i)) add(doc, CollectionSchema.httpstatus_i, httpstatus);
-        if (allAttr || contains(CollectionSchema.collection_sxt) && collections != null && collections.size() > 0) {
-            List<String> cs = new ArrayList<String>();
-            for (Map.Entry<String, Pattern> e: collections.entrySet()) {
-                if (e.getValue().matcher(url).matches()) cs.add(e.getKey());
-            }
-            add(doc, CollectionSchema.collection_sxt, cs);
-        }
-        return doc;
-    }
+    public static class FailDoc {
+        DigestURL digestURL;
+        final Map<String, Pattern> collections;
+        final String failReason;
+        final FailType failType;
+        final int httpstatus;
+        final Date failtime;
+        public FailDoc(final DigestURL digestURL, final Map<String, Pattern> collections, final String failReason, final FailType failType, final int httpstatus) {
+            this.digestURL = digestURL;
+            this.collections = collections;
+            this.failReason = failReason;
+            this.failType = failType;
+            this.httpstatus = httpstatus;
+            this.failtime = new Date();
+        }
+        public FailDoc(final SolrDocument doc) {
+            try {
+                this.digestURL = new DigestURL((String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()));
+            } catch (final MalformedURLException e) {
+                this.digestURL = null;
+            }
+            this.collections = new HashMap<String, Pattern>();
+            Collection<Object> c = doc.getFieldValues(CollectionSchema.collection_sxt.getSolrFieldName());
+            if (c != null) for (Object cn: c) this.collections.put((String) cn, QueryParams.catchall_pattern);
+            this.failReason = (String) doc.getFieldValue(CollectionSchema.failreason_s.getSolrFieldName());
+            this.failType = FailType.valueOf((String) doc.getFieldValue(CollectionSchema.failtype_s.getSolrFieldName()));
+            this.httpstatus = (Integer) doc.getFieldValue(CollectionSchema.httpstatus_i.getSolrFieldName());
+            this.failtime = (Date) doc.getFieldValue(CollectionSchema.load_date_dt.getSolrFieldName());
+        }
+        public DigestURL getDigestURL() {
+            return this.digestURL;
+        }
+        public Map<String, Pattern> getCollections() {
+            return this.collections;
+        }
+        public String getFailReason() {
+            return this.failReason;
+        }
+        public FailType getFailType() {
+            return this.failType;
+        }
+        public int getHttpstatus() {
+            return this.httpstatus;
+        }
+        public SolrInputDocument toSolr(final CollectionConfiguration configuration) {
+            boolean allAttr = configuration.isEmpty();
+            assert allAttr || configuration.contains(CollectionSchema.failreason_s);
+
+            final SolrInputDocument doc = new SolrInputDocument();
+            String url = configuration.addURIAttributes(doc, allAttr, this.getDigestURL(), Response.docType(this.getDigestURL()));
+            if (allAttr || configuration.contains(CollectionSchema.load_date_dt)) configuration.add(doc, CollectionSchema.load_date_dt, new Date());
+
+            // fail reason and status
+            if (allAttr || configuration.contains(CollectionSchema.failreason_s)) configuration.add(doc, CollectionSchema.failreason_s, this.getFailReason());
+            if (allAttr || configuration.contains(CollectionSchema.failtype_s)) configuration.add(doc, CollectionSchema.failtype_s, this.getFailType().name());
+            if (allAttr || configuration.contains(CollectionSchema.httpstatus_i)) configuration.add(doc, CollectionSchema.httpstatus_i, this.getHttpstatus());
+            if (allAttr || configuration.contains(CollectionSchema.collection_sxt) && this.getCollections() != null && this.getCollections().size() > 0) {
+                List<String> cs = new ArrayList<String>();
+                for (Map.Entry<String, Pattern> e: this.getCollections().entrySet()) {
+                    if (e.getValue().matcher(url).matches()) cs.add(e.getKey());
+                }
+                configuration.add(doc, CollectionSchema.collection_sxt, cs);
+            }
+            return doc;
+        }
+
+    }
 }
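
toSolr() above attaches a collection name only when that collection's pattern matches the failed URL, mirroring the matching loop of the removed err() method. The loop in isolation, with collection names and patterns invented for the example:

    import java.util.ArrayList;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;
    import java.util.regex.Pattern;

    public class CollectionMatchSketch {
        public static void main(String[] args) {
            Map<String, Pattern> collections = new HashMap<String, Pattern>();
            collections.put("user", Pattern.compile(".*")); // catch-all, like QueryParams.catchall_pattern
            collections.put("intranet", Pattern.compile("http://internal\\..*"));
            String url = "http://www.example.org/page";
            List<String> cs = new ArrayList<String>();
            for (Map.Entry<String, Pattern> e : collections.entrySet()) {
                if (e.getValue().matcher(url).matches()) cs.add(e.getKey());
            }
            System.out.println(cs); // [user]
        }
    }
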
diff --git a/source/net/yacy/search/snippet/MediaSnippet.java b/source/net/yacy/search/snippet/MediaSnippet.java
index 0be262a08..cf4dd21a8 100644
--- a/source/net/yacy/search/snippet/MediaSnippet.java
+++ b/source/net/yacy/search/snippet/MediaSnippet.java
@@ -40,6 +40,7 @@ import net.yacy.cora.document.analysis.Classification.ContentDomain;
 import net.yacy.cora.document.encoding.ASCII;
 import net.yacy.cora.document.id.AnchorURL;
 import net.yacy.cora.document.id.DigestURL;
+import net.yacy.cora.federate.solr.FailCategory;
 import net.yacy.cora.federate.yacy.CacheStrategy;
 import net.yacy.cora.order.Base64Order;
 import net.yacy.cora.protocol.ClientIdentification;
@@ -48,8 +49,6 @@ import net.yacy.cora.util.ByteArray;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.cora.util.NumberTools;
 import net.yacy.cora.util.SpaceExceededException;
-import net.yacy.crawler.data.ZURL.FailCategory;
-import net.yacy.crawler.retrieval.Request;
 import net.yacy.document.Document;
 import net.yacy.document.Parser;
 import net.yacy.document.WordTokenizer;
@@ -59,6 +58,7 @@ import net.yacy.repository.Blacklist.BlacklistType;
 import net.yacy.search.Switchboard;

+@SuppressWarnings("unused")
 public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaSnippet> {
     public ContentDomain type;
     public DigestURL href, source;
@@ -260,7 +260,7 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator