From 22ce4fb4ddecc56b40e9c3985bdd0a7b22babde5 Mon Sep 17 00:00:00 2001 From: orbiter Date: Fri, 1 Aug 2014 11:00:10 +0200 Subject: [PATCH] better error handling for remote solr queries and exists-checks --- htroot/HostBrowser.java | 18 +++- htroot/IndexControlRWIs_p.java | 91 ++++++++++--------- htroot/IndexControlURLs_p.java | 21 +++-- htroot/Load_RSS_p.java | 28 +++--- htroot/api/citation.java | 11 ++- htroot/api/linkstructure.java | 8 +- htroot/api/webstructure.java | 24 +++-- htroot/api/ymarks/add_ymark.java | 8 +- htroot/yacy/transferRWI.java | 12 ++- htroot/yacy/transferURL.java | 9 +- htroot/yacy/urls.java | 32 ++++--- htroot/yacysearch.java | 28 +++--- .../solr/connector/AbstractSolrConnector.java | 3 - .../solr/connector/EmbeddedSolrConnector.java | 4 +- .../solr/connector/SolrConnector.java | 8 +- .../solr/connector/SolrServerConnector.java | 32 ++++--- source/net/yacy/crawler/CrawlStacker.java | 3 + .../net/yacy/crawler/retrieval/RSSLoader.java | 15 ++- .../crawler/retrieval/SitemapImporter.java | 22 +++-- source/net/yacy/data/ymark/YMarkMetadata.java | 8 +- source/net/yacy/peers/Transmission.java | 13 ++- source/net/yacy/search/Switchboard.java | 46 ++++++---- source/net/yacy/search/index/Fulltext.java | 25 ++--- source/net/yacy/search/index/Segment.java | 8 +- 24 files changed, 292 insertions(+), 185 deletions(-) diff --git a/htroot/HostBrowser.java b/htroot/HostBrowser.java index 75075c106..b3e56dbe2 100644 --- a/htroot/HostBrowser.java +++ b/htroot/HostBrowser.java @@ -138,8 +138,13 @@ public class HostBrowser { String load = post.get("load", ""); boolean wait = false; - if (loadRight && autoload && path.length() != 0 && pathURI != null && load.length() == 0 && sb.index.getLoadTime(ASCII.String(pathURI.hash())) < 0) { - // in case that the url does not exist and loading is wanted turn this request into a loading request + try { + if (loadRight && autoload && path.length() != 0 && pathURI != null && load.length() == 0 && sb.index.getLoadTime(ASCII.String(pathURI.hash())) < 0) { + // in case that the url does not exist and loading is wanted turn this request into a loading request + load = path; + wait = true; + } + } catch (IOException e1) { load = path; wait = true; } @@ -156,8 +161,13 @@ public class HostBrowser { 0, 0, 0 )); prop.putHTML("result", reasonString == null ? ("added url to indexer: " + load) : ("not indexed url '" + load + "': " + reasonString)); - if (wait) for (int i = 0; i < 30; i++) { - if (sb.index.getLoadTime(ASCII.String(url.hash())) >= 0) break; + if (wait) waitloop: for (int i = 0; i < 30; i++) { + try { + if (sb.index.getLoadTime(ASCII.String(url.hash())) >= 0) break; + } catch (IOException e1) { + e1.printStackTrace(); + break waitloop; + } try {Thread.sleep(100);} catch (final InterruptedException e) {} } } catch (final MalformedURLException e) { diff --git a/htroot/IndexControlRWIs_p.java b/htroot/IndexControlRWIs_p.java index 2086dc937..6c06e1e8f 100644 --- a/htroot/IndexControlRWIs_p.java +++ b/htroot/IndexControlRWIs_p.java @@ -369,8 +369,7 @@ public class IndexControlRWIs_p { Word.commonHashOrder, urlb.size()); if ( post.containsKey("blacklisturls") ) { - final String[] supportedBlacklistTypes = - env.getConfig("BlackLists.types", "").split(","); + final String[] supportedBlacklistTypes = env.getConfig("BlackLists.types", "").split(","); DigestURL url; for ( final byte[] b : urlb ) { try { @@ -378,28 +377,32 @@ public class IndexControlRWIs_p { } catch (final SpaceExceededException e ) { ConcurrentLog.logException(e); } - url = segment.fulltext().getURL(ASCII.String(b)); - segment.fulltext().remove(b); - if ( url != null ) { - for ( final String supportedBlacklistType : supportedBlacklistTypes ) { - if ( ListManager.listSetContains( - supportedBlacklistType + ".BlackLists", - blacklist) ) { - try { - Switchboard.urlBlacklist.add( - BlacklistType.valueOf(supportedBlacklistType), - blacklist, - url.getHost(), - url.getFile()); - } catch (PunycodeException e) { - ConcurrentLog.warn(APP_NAME, - "Unable to add blacklist entry to blacklist " - + supportedBlacklistType, e); + try { + url = segment.fulltext().getURL(ASCII.String(b)); + segment.fulltext().remove(b); + if ( url != null ) { + for ( final String supportedBlacklistType : supportedBlacklistTypes ) { + if ( ListManager.listSetContains( + supportedBlacklistType + ".BlackLists", + blacklist) ) { + try { + Switchboard.urlBlacklist.add( + BlacklistType.valueOf(supportedBlacklistType), + blacklist, + url.getHost(), + url.getFile()); + } catch (PunycodeException e) { + ConcurrentLog.warn(APP_NAME, + "Unable to add blacklist entry to blacklist " + + supportedBlacklistType, e); + } } - } - } - SearchEventCache.cleanupEvents(true); - } + } + SearchEventCache.cleanupEvents(true); + } + } catch (IOException e1) { + ConcurrentLog.logException(e1); + } } } @@ -411,27 +414,29 @@ public class IndexControlRWIs_p { } catch (final SpaceExceededException e ) { ConcurrentLog.logException(e); } - url = segment.fulltext().getURL(ASCII.String(b)); - segment.fulltext().remove(b); - if ( url != null ) { - for ( final BlacklistType supportedBlacklistType : BlacklistType.values() ) { - if ( ListManager.listSetContains( - supportedBlacklistType + ".BlackLists", - blacklist) ) { - try { - Switchboard.urlBlacklist.add( - supportedBlacklistType, - blacklist, - url.getHost(), - ".*"); - } catch (PunycodeException e) { - ConcurrentLog.warn(APP_NAME, - "Unable to add blacklist entry to blacklist " - + supportedBlacklistType, e); + try { + url = segment.fulltext().getURL(ASCII.String(b)); + segment.fulltext().remove(b); + if ( url != null ) { + for ( final BlacklistType supportedBlacklistType : BlacklistType.values() ) { + if ( ListManager.listSetContains(supportedBlacklistType + ".BlackLists", blacklist) ) { + try { + Switchboard.urlBlacklist.add( + supportedBlacklistType, + blacklist, + url.getHost(), + ".*"); + } catch (PunycodeException e) { + ConcurrentLog.warn(APP_NAME, + "Unable to add blacklist entry to blacklist " + + supportedBlacklistType, e); + } } - } - } - } + } + } + } catch (IOException e1) { + ConcurrentLog.logException(e1); + } } } try { diff --git a/htroot/IndexControlURLs_p.java b/htroot/IndexControlURLs_p.java index 897541296..dcae6e585 100644 --- a/htroot/IndexControlURLs_p.java +++ b/htroot/IndexControlURLs_p.java @@ -183,14 +183,19 @@ public class IndexControlURLs_p { } if (post.containsKey("urlhashdelete")) { - final DigestURL url = segment.fulltext().getURL(urlhash); - if (url == null) { - prop.putHTML("result", "No Entry for URL hash " + urlhash + "; nothing deleted."); - } else { - urlstring = url.toNormalform(true); - prop.put("urlstring", ""); - sb.urlRemove(segment, urlhash.getBytes()); - prop.putHTML("result", "Removed URL " + urlstring); + DigestURL url; + try { + url = segment.fulltext().getURL(urlhash); + if (url == null) { + prop.putHTML("result", "No Entry for URL hash " + urlhash + "; nothing deleted."); + } else { + urlstring = url.toNormalform(true); + prop.put("urlstring", ""); + sb.urlRemove(segment, urlhash.getBytes()); + prop.putHTML("result", "Removed URL " + urlstring); + } + } catch (IOException e) { + prop.putHTML("result", "Error when querying the url hash " + urlhash + ":" + e.getMessage()); } } diff --git a/htroot/Load_RSS_p.java b/htroot/Load_RSS_p.java index 5479a9d9c..c07dc5db0 100644 --- a/htroot/Load_RSS_p.java +++ b/htroot/Load_RSS_p.java @@ -352,17 +352,23 @@ public class Load_RSS_p { author = item.getAuthor(); if (author == null) author = item.getCopyright(); pubDate = item.getPubDate(); - HarvestProcess harvestProcess = sb.urlExists(ASCII.String(messageurl.hash())); - prop.put("showitems_item_" + i + "_state", harvestProcess != null ? 2 : RSSLoader.indexTriggered.containsKey(messageurl.hash()) ? 1 : 0); - prop.put("showitems_item_" + i + "_state_count", i); - prop.putHTML("showitems_item_" + i + "_state_guid", item.getGuid()); - prop.putHTML("showitems_item_" + i + "_author", author == null ? "" : author); - prop.putHTML("showitems_item_" + i + "_title", item.getTitle()); - prop.putHTML("showitems_item_" + i + "_link", messageurl.toNormalform(true)); - prop.putHTML("showitems_item_" + i + "_description", item.getDescriptions().toString()); - prop.putHTML("showitems_item_" + i + "_language", item.getLanguage()); - prop.putHTML("showitems_item_" + i + "_date", (pubDate == null) ? "" : DateFormat.getDateTimeInstance().format(pubDate)); - i++; + HarvestProcess harvestProcess; + try { + harvestProcess = sb.urlExists(ASCII.String(messageurl.hash())); + prop.put("showitems_item_" + i + "_state", harvestProcess != null ? 2 : RSSLoader.indexTriggered.containsKey(messageurl.hash()) ? 1 : 0); + prop.put("showitems_item_" + i + "_state_count", i); + prop.putHTML("showitems_item_" + i + "_state_guid", item.getGuid()); + prop.putHTML("showitems_item_" + i + "_author", author == null ? "" : author); + prop.putHTML("showitems_item_" + i + "_title", item.getTitle()); + prop.putHTML("showitems_item_" + i + "_link", messageurl.toNormalform(true)); + prop.putHTML("showitems_item_" + i + "_description", item.getDescriptions().toString()); + prop.putHTML("showitems_item_" + i + "_language", item.getLanguage()); + prop.putHTML("showitems_item_" + i + "_date", (pubDate == null) ? "" : DateFormat.getDateTimeInstance().format(pubDate)); + i++; + } catch (IOException e) { + ConcurrentLog.logException(e); + continue; + } } catch (final MalformedURLException e) { ConcurrentLog.logException(e); continue; diff --git a/htroot/api/citation.java b/htroot/api/citation.java index 079efab2d..0af3d5a26 100644 --- a/htroot/api/citation.java +++ b/htroot/api/citation.java @@ -35,6 +35,7 @@ import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.federate.solr.connector.SolrConnector; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.sorting.OrderedScoreMap; +import net.yacy.cora.util.ConcurrentLog; import net.yacy.document.SentenceReader; import net.yacy.search.Switchboard; import net.yacy.search.index.Segment; @@ -86,10 +87,14 @@ public class citation { } catch (final MalformedURLException e) {} } if (uri == null && hash.length() > 0) { - uri = sb.getURL(ASCII.getBytes(hash)); - if (uri == null) { - connector.commit(true); // try again, that url can be fresh + try { uri = sb.getURL(ASCII.getBytes(hash)); + if (uri == null) { + connector.commit(true); // try again, that url can be fresh + uri = sb.getURL(ASCII.getBytes(hash)); + } + } catch (IOException e) { + ConcurrentLog.logException(e); } } if (uri == null) return prop; // no proper url addressed diff --git a/htroot/api/linkstructure.java b/htroot/api/linkstructure.java index 8e06d1396..af1f4712e 100644 --- a/htroot/api/linkstructure.java +++ b/htroot/api/linkstructure.java @@ -17,6 +17,7 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +import java.io.IOException; import java.net.MalformedURLException; import net.yacy.cora.document.encoding.ASCII; @@ -25,6 +26,7 @@ import net.yacy.cora.order.Base64Order; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.ResponseHeader; +import net.yacy.cora.util.ConcurrentLog; import net.yacy.search.Switchboard; import net.yacy.search.index.Fulltext; import net.yacy.search.schema.HyperlinkEdge; @@ -59,7 +61,11 @@ public class linkstructure { String hostname = null; if (about.length() == 12 && Base64Order.enhancedCoder.wellformed(ASCII.getBytes(about))) { byte[] urlhash = ASCII.getBytes(about); - url = authenticated ? sb.getURL(urlhash) : null; + try { + url = authenticated ? sb.getURL(urlhash) : null; + } catch (IOException e) { + ConcurrentLog.logException(e); + } } else if (url == null && about.length() > 0) { // consider "about" as url or hostname url = new DigestURL(about.indexOf("://") >= 0 ? about : "http://" + about); // accept also domains hostname = url.getHost(); diff --git a/htroot/api/webstructure.java b/htroot/api/webstructure.java index 7e820f495..bc1b24aff 100644 --- a/htroot/api/webstructure.java +++ b/htroot/api/webstructure.java @@ -65,7 +65,12 @@ public class webstructure { } else if (about.length() == 12 && Base64Order.enhancedCoder.wellformed(ASCII.getBytes(about))) { urlhash = ASCII.getBytes(about); hosthash = about.substring(6); - url = authenticated ? sb.getURL(urlhash) : null; + try { + url = authenticated ? sb.getURL(urlhash) : null; + } catch (IOException e) { + url = null; + ConcurrentLog.logException(e); + } } else if (about.length() > 0) { // consider "about" as url or hostname try { @@ -156,12 +161,17 @@ public class webstructure { Iterator i = ids.iterator(); while (i.hasNext()) { byte[] refhash = i.next(); - DigestURL refurl = authenticated ? sb.getURL(refhash) : null; - prop.put("citations_documents_0_anchors_" + d + "_urle", refurl == null ? 0 : 1); - if (refurl != null) prop.putXML("citations_documents_0_anchors_" + d + "_urle_url", refurl.toNormalform(true)); - prop.put("citations_documents_0_anchors_" + d + "_urle_hash", refhash); - prop.put("citations_documents_0_anchors_" + d + "_urle_date", GenericFormatter.SHORT_DAY_FORMATTER.format(new Date())); // superfluous? - d++; + DigestURL refurl; + try { + refurl = authenticated ? sb.getURL(refhash) : null; + prop.put("citations_documents_0_anchors_" + d + "_urle", refurl == null ? 0 : 1); + if (refurl != null) prop.putXML("citations_documents_0_anchors_" + d + "_urle_url", refurl.toNormalform(true)); + prop.put("citations_documents_0_anchors_" + d + "_urle_hash", refhash); + prop.put("citations_documents_0_anchors_" + d + "_urle_date", GenericFormatter.SHORT_DAY_FORMATTER.format(new Date())); // superfluous? + d++; + } catch (IOException e) { + ConcurrentLog.logException(e); + } } prop.put("citations_documents_0_count", d); prop.put("citations_documents_0_anchors", d); diff --git a/htroot/api/ymarks/add_ymark.java b/htroot/api/ymarks/add_ymark.java index 2d3140276..c11755584 100644 --- a/htroot/api/ymarks/add_ymark.java +++ b/htroot/api/ymarks/add_ymark.java @@ -34,18 +34,16 @@ public class add_ymark { if (post.containsKey("urlHash")) { final String urlHash = post.get("urlHash",YMarkUtil.EMPTY_STRING); - final DigestURL url = sb.index.fulltext().getURL(urlHash); - final String folders = post.get(YMarkEntry.BOOKMARK.FOLDERS.key(),YMarkEntry.BOOKMARK.FOLDERS.deflt()); - final String tags = post.get(YMarkEntry.BOOKMARK.TAGS.key(),YMarkUtil.EMPTY_STRING); try { + final DigestURL url = sb.index.fulltext().getURL(urlHash); + final String folders = post.get(YMarkEntry.BOOKMARK.FOLDERS.key(),YMarkEntry.BOOKMARK.FOLDERS.deflt()); + final String tags = post.get(YMarkEntry.BOOKMARK.TAGS.key(),YMarkUtil.EMPTY_STRING); ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName)); sb.tables.bookmarks.createBookmark(sb.loader, url, agent, bmk_user, true, tags, folders); prop.put("status", "1"); } catch (final IOException e) { - // TODO Auto-generated catch block ConcurrentLog.logException(e); } catch (final Failure e) { - // TODO Auto-generated catch block ConcurrentLog.logException(e); } diff --git a/htroot/yacy/transferRWI.java b/htroot/yacy/transferRWI.java index 8489dbd1d..dae580bcf 100644 --- a/htroot/yacy/transferRWI.java +++ b/htroot/yacy/transferRWI.java @@ -27,6 +27,7 @@ // javac -classpath .:../classes transferRWI.java +import java.io.IOException; import java.util.ArrayList; import java.util.HashSet; import java.util.Iterator; @@ -234,9 +235,14 @@ public final class transferRWI { } for (String id: testids) { try { - if (sb.index.fulltext().getLoadTime(id) >= 0) { - knownURL.put(ASCII.getBytes(id)); - } else { + try { + if (sb.index.fulltext().getLoadTime(id) >= 0) { + knownURL.put(ASCII.getBytes(id)); + } else { + unknownURL.put(ASCII.getBytes(id)); + } + } catch (IOException e) { + ConcurrentLog.logException(e); unknownURL.put(ASCII.getBytes(id)); } } catch (final SpaceExceededException e) { diff --git a/htroot/yacy/transferURL.java b/htroot/yacy/transferURL.java index 54650c2e3..49c315835 100644 --- a/htroot/yacy/transferURL.java +++ b/htroot/yacy/transferURL.java @@ -144,7 +144,14 @@ public final class transferURL { doublecheck = 0; for (String id : lEm.keySet()) { - if (sb.index.getLoadTime(id) < 0) { + long lt = -1; + try { + lt = sb.index.getLoadTime(id); + } catch (IOException e1) { + lt = -1; + ConcurrentLog.logException(e1); + } + if (lt < 0) { lEntry = lEm.get(id); // write entry to database diff --git a/htroot/yacy/urls.java b/htroot/yacy/urls.java index 9bb930116..0054ea452 100644 --- a/htroot/yacy/urls.java +++ b/htroot/yacy/urls.java @@ -29,6 +29,7 @@ import net.yacy.cora.date.GenericFormatter; import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.protocol.RequestHeader; +import net.yacy.cora.util.ConcurrentLog; import net.yacy.crawler.data.NoticedURL; import net.yacy.crawler.retrieval.Request; import net.yacy.kelondro.data.meta.URIMetadataNode; @@ -74,7 +75,12 @@ public class urls { if (entry == null) break; // find referrer, if there is one - referrer = sb.getURL(entry.referrerhash()); + try { + referrer = sb.getURL(entry.referrerhash()); + } catch (IOException e) { + referrer = null; + ConcurrentLog.logException(e); + } // place url to notice-url db sb.crawlQueues.delegatedURL.put(ASCII.String(entry.url().hash()), entry.url()); @@ -106,16 +112,20 @@ public class urls { entry = sb.index.fulltext().getMetadata(ASCII.getBytes(urlhashes.substring(12 * i, 12 * (i + 1)))); if (entry == null) continue; // find referrer, if there is one - referrer = sb.getURL(entry.referrerHash()); - // create RSS entry - prop.put("item_" + c + "_title", entry.dc_title()); - prop.putXML("item_" + c + "_link", entry.url().toNormalform(true)); - prop.putXML("item_" + c + "_referrer", (referrer == null) ? "" : referrer.toNormalform(true)); - prop.putXML("item_" + c + "_description", entry.dc_title()); - prop.put("item_" + c + "_author", entry.dc_creator()); - prop.put("item_" + c + "_pubDate", GenericFormatter.SHORT_SECOND_FORMATTER.format(entry.moddate())); - prop.put("item_" + c + "_guid", ASCII.String(entry.hash())); - c++; + try { + referrer = sb.getURL(entry.referrerHash()); + // create RSS entry + prop.put("item_" + c + "_title", entry.dc_title()); + prop.putXML("item_" + c + "_link", entry.url().toNormalform(true)); + prop.putXML("item_" + c + "_referrer", (referrer == null) ? "" : referrer.toNormalform(true)); + prop.putXML("item_" + c + "_description", entry.dc_title()); + prop.put("item_" + c + "_author", entry.dc_creator()); + prop.put("item_" + c + "_pubDate", GenericFormatter.SHORT_SECOND_FORMATTER.format(entry.moddate())); + prop.put("item_" + c + "_guid", ASCII.String(entry.hash())); + c++; + } catch (IOException e) { + ConcurrentLog.logException(e); + } } prop.put("item", c); prop.putXML("response", "ok"); diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index 6d4d420e4..fe9a5c9bb 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -588,19 +588,23 @@ public class yacysearch { return prop; } final String bookmarkHash = post.get("bookmarkref", ""); // urlhash - final DigestURL url = indexSegment.fulltext().getURL(bookmarkHash); - if ( url != null ) { - try { - sb.tables.bookmarks.createBookmark( - sb.loader, - url, - ClientIdentification.yacyInternetCrawlerAgent, - YMarkTables.USER_ADMIN, - true, - "searchresult", - "/search"); - } catch (final Throwable e ) { + try { + final DigestURL url = indexSegment.fulltext().getURL(bookmarkHash); + if ( url != null ) { + try { + sb.tables.bookmarks.createBookmark( + sb.loader, + url, + ClientIdentification.yacyInternetCrawlerAgent, + YMarkTables.USER_ADMIN, + true, + "searchresult", + "/search"); + } catch (final Throwable e ) { + } } + } catch (IOException e) { + ConcurrentLog.logException(e); } } diff --git a/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java index efc5e7bdc..f5f04d92a 100644 --- a/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java @@ -313,9 +313,6 @@ public abstract class AbstractSolrConnector implements SolrConnector { //params.setQuery(CollectionSchema.id.getSolrFieldName() + ":\"" + id + "\""); String q = "{!raw f=" + CollectionSchema.id.getSolrFieldName() + "}" + id; params.setQuery(q); - //params.setQuery("*:*"); - //params.addFilterQuery(q); - //params.set("defType", "raw"); params.setRows(1); params.setStart(0); params.setFacet(false); diff --git a/source/net/yacy/cora/federate/solr/connector/EmbeddedSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/EmbeddedSolrConnector.java index 661d498f3..231f52d92 100644 --- a/source/net/yacy/cora/federate/solr/connector/EmbeddedSolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/EmbeddedSolrConnector.java @@ -405,7 +405,7 @@ public class EmbeddedSolrConnector extends SolrServerConnector implements SolrCo * @throws IOException */ @Override - public LoadTimeURL getLoadTimeURL(String id) { + public LoadTimeURL getLoadTimeURL(String id) throws IOException { int responseCount = 0; DocListSearcher docListSearcher = null; try { @@ -421,10 +421,10 @@ public class EmbeddedSolrConnector extends SolrServerConnector implements SolrCo //} } catch (Throwable e) { ConcurrentLog.logException(e); + throw new IOException(e.getMessage()); } finally { if (docListSearcher != null) docListSearcher.close(); } - return null; } @Override diff --git a/source/net/yacy/cora/federate/solr/connector/SolrConnector.java b/source/net/yacy/cora/federate/solr/connector/SolrConnector.java index 040ddf53d..061e36c38 100644 --- a/source/net/yacy/cora/federate/solr/connector/SolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/SolrConnector.java @@ -154,7 +154,7 @@ public interface SolrConnector extends Iterable /* Iterable of document * @param query * @throws IOException */ - public QueryResponse getResponseByParams(final ModifiableSolrParams query) throws IOException, SolrException; + public QueryResponse getResponseByParams(final ModifiableSolrParams query) throws IOException; /** * get the solr document list from a query response @@ -165,7 +165,7 @@ public interface SolrConnector extends Iterable /* Iterable of document * @throws IOException * @throws SolrException */ - public SolrDocumentList getDocumentListByParams(ModifiableSolrParams params) throws IOException, SolrException; + public SolrDocumentList getDocumentListByParams(ModifiableSolrParams params) throws IOException; /** * get the number of results for a query response @@ -174,7 +174,7 @@ public interface SolrConnector extends Iterable /* Iterable of document * @throws IOException * @throws SolrException */ - public long getDocumentCountByParams(ModifiableSolrParams params) throws IOException, SolrException; + public long getDocumentCountByParams(ModifiableSolrParams params) throws IOException; /** * get a query result from solr @@ -191,7 +191,7 @@ public interface SolrConnector extends Iterable /* Iterable of document final String sort, final int offset, final int count, - final String ... fields) throws IOException, SolrException; + final String ... fields) throws IOException; /** * get the number of results when this query is done. diff --git a/source/net/yacy/cora/federate/solr/connector/SolrServerConnector.java b/source/net/yacy/cora/federate/solr/connector/SolrServerConnector.java index 1610c8e3c..4b5092ebc 100644 --- a/source/net/yacy/cora/federate/solr/connector/SolrServerConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/SolrServerConnector.java @@ -33,7 +33,6 @@ import net.yacy.search.schema.CollectionSchema; import org.apache.lucene.analysis.NumericTokenStream; import org.apache.solr.common.SolrDocumentList; import org.apache.solr.common.SolrException; -import org.apache.solr.common.SolrException.ErrorCode; import org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.params.ModifiableSolrParams; import org.apache.solr.common.util.NamedList; @@ -289,7 +288,7 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen * @throws SolrException */ @Override - public SolrDocumentList getDocumentListByParams(ModifiableSolrParams params) throws IOException, SolrException { + public SolrDocumentList getDocumentListByParams(ModifiableSolrParams params) throws IOException { if (this.server == null) throw new IOException("server disconnected"); // during the solr query we set the thread name to the query string to get more debugging info in thread dumps String q = params.get("q"); @@ -297,18 +296,25 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen String threadname = Thread.currentThread().getName(); if (q != null) Thread.currentThread().setName("solr query: q = " + q + (fq == null ? "" : ", fq = " + fq)); QueryResponse rsp; - try { - rsp = this.server.query(params); - if (q != null) Thread.currentThread().setName(threadname); - if (rsp != null) if (log.isFine()) log.fine(rsp.getResults().getNumFound() + " results for q=" + q); - return rsp.getResults(); - } catch (final SolrServerException e) { - clearCaches(); // prevent further OOM if this was caused by OOM - throw new SolrException(ErrorCode.UNKNOWN, e); - } catch (final Throwable e) { - clearCaches(); // prevent further OOM if this was caused by OOM - throw new IOException("Error executing query", e); + int retry = 10; + Throwable error = null; + while (retry-- > 0) { + try { + rsp = this.server.query(params); + if (q != null) Thread.currentThread().setName(threadname); + if (rsp != null) if (log.isFine()) log.fine(rsp.getResults().getNumFound() + " results for q=" + q); + return rsp.getResults(); + } catch (final SolrServerException e) { + error = e; + clearCaches(); // prevent further OOM if this was caused by OOM + } catch (final Throwable e) { + error = e; + clearCaches(); // prevent further OOM if this was caused by OOM + } + ConcurrentLog.severe("SolrServerConnector", "Failed to query remote Solr: " + error.getMessage() + ", query:" + q + (fq == null ? "" : ", fq = " + fq)); + try {Thread.sleep(1000);} catch (InterruptedException e) {} } + throw new IOException("Error executing query", error); } // luke requests: these do not work for attached SolrCloud Server diff --git a/source/net/yacy/crawler/CrawlStacker.java b/source/net/yacy/crawler/CrawlStacker.java index 1b71f1600..18c005ec1 100644 --- a/source/net/yacy/crawler/CrawlStacker.java +++ b/source/net/yacy/crawler/CrawlStacker.java @@ -388,7 +388,10 @@ public final class CrawlStacker { try { oldEntry = this.indexSegment.fulltext().getDefaultConnector().getLoadTimeURL(urlhash); } catch (IOException e) { + // if an exception here occurs then there is the danger that urls which had been in the crawler are overwritten a second time + // to prevent that, we reject urls in these events ConcurrentLog.logException(e); + return "exception during double-test: " + e.getMessage(); } final Long oldDate = oldEntry == null ? null : oldEntry.date; if (oldDate == null) { diff --git a/source/net/yacy/crawler/retrieval/RSSLoader.java b/source/net/yacy/crawler/retrieval/RSSLoader.java index b7719cfef..508748e0f 100644 --- a/source/net/yacy/crawler/retrieval/RSSLoader.java +++ b/source/net/yacy/crawler/retrieval/RSSLoader.java @@ -108,11 +108,16 @@ public class RSSLoader extends Thread { } } for (final Map.Entry e: urlmap.entrySet()) { - HarvestProcess harvestProcess = sb.urlExists(e.getKey()); - if (harvestProcess != null) continue; - list.add(e.getValue()); - indexTriggered.insertIfAbsent(ASCII.getBytes(e.getKey()), new Date()); - loadCount++; + HarvestProcess harvestProcess; + try { + harvestProcess = sb.urlExists(e.getKey()); + if (harvestProcess != null) continue; + list.add(e.getValue()); + indexTriggered.insertIfAbsent(ASCII.getBytes(e.getKey()), new Date()); + loadCount++; + } catch (IOException e1) { + ConcurrentLog.logException(e1); + } } sb.addToIndex(list, null, null, collections, true); // update info for loading diff --git a/source/net/yacy/crawler/retrieval/SitemapImporter.java b/source/net/yacy/crawler/retrieval/SitemapImporter.java index b3594b109..69ae673e8 100644 --- a/source/net/yacy/crawler/retrieval/SitemapImporter.java +++ b/source/net/yacy/crawler/retrieval/SitemapImporter.java @@ -25,6 +25,7 @@ package net.yacy.crawler.retrieval; +import java.io.IOException; import java.net.MalformedURLException; import java.util.Date; @@ -82,15 +83,20 @@ public class SitemapImporter extends Thread { // check if the url is known and needs to be recrawled Date lastMod = entry.lastmod(null); if (lastMod != null) { - final HarvestProcess dbocc = this.sb.urlExists(ASCII.String(nexturlhash)); - if (dbocc != null && dbocc == HarvestProcess.LOADED) { - // the url was already loaded. we need to check the date - final URIMetadataNode oldEntry = this.sb.index.fulltext().getMetadata(nexturlhash); - if (oldEntry != null) { - final Date modDate = oldEntry.moddate(); - // check if modDate is null - if (modDate.after(lastMod)) return; + HarvestProcess dbocc; + try { + dbocc = this.sb.urlExists(ASCII.String(nexturlhash)); + if (dbocc != null && dbocc == HarvestProcess.LOADED) { + // the url was already loaded. we need to check the date + final URIMetadataNode oldEntry = this.sb.index.fulltext().getMetadata(nexturlhash); + if (oldEntry != null) { + final Date modDate = oldEntry.moddate(); + // check if modDate is null + if (modDate.after(lastMod)) return; + } } + } catch (IOException e) { + ConcurrentLog.logException(e); } } diff --git a/source/net/yacy/data/ymark/YMarkMetadata.java b/source/net/yacy/data/ymark/YMarkMetadata.java index ab7b47e7d..d999bf938 100644 --- a/source/net/yacy/data/ymark/YMarkMetadata.java +++ b/source/net/yacy/data/ymark/YMarkMetadata.java @@ -35,6 +35,7 @@ import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.protocol.ClientIdentification; +import net.yacy.cora.util.ConcurrentLog; import net.yacy.crawler.retrieval.Response; import net.yacy.document.Document; import net.yacy.document.Parser.Failure; @@ -82,7 +83,12 @@ public class YMarkMetadata { public YMarkMetadata(final byte[] urlHash, final Segment indexSegment) { this.document = null; this.indexSegment = indexSegment; - this.uri = this.indexSegment.fulltext().getURL(ASCII.String(urlHash)); + try { + this.uri = this.indexSegment.fulltext().getURL(ASCII.String(urlHash)); + } catch (IOException e) { + this.uri = null; + ConcurrentLog.logException(e); + } } public YMarkMetadata(final Document document) { diff --git a/source/net/yacy/peers/Transmission.java b/source/net/yacy/peers/Transmission.java index 8795209c7..970c4532d 100644 --- a/source/net/yacy/peers/Transmission.java +++ b/source/net/yacy/peers/Transmission.java @@ -24,6 +24,7 @@ package net.yacy.peers; +import java.io.IOException; import java.util.ArrayList; import java.util.HashSet; import java.util.Iterator; @@ -174,9 +175,15 @@ public class Transmission { i = c.entries(); while (i.hasNext()) { final WordReference e = i.next(); - if (Transmission.this.segment.fulltext().getLoadTime(ASCII.String(e.urlhash())) >= 0) { - this.references.put(e.urlhash()); - } else { + try { + if (Transmission.this.segment.fulltext().getLoadTime(ASCII.String(e.urlhash())) >= 0) { + this.references.put(e.urlhash()); + } else { + notFoundx.add(e.urlhash()); + this.badReferences.put(e.urlhash()); + } + } catch (IOException e1) { + ConcurrentLog.logException(e1); notFoundx.add(e.urlhash()); this.badReferences.put(e.urlhash()); } diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 0afc77644..09a85ce7c 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -1620,18 +1620,12 @@ public final class Switchboard extends serverSwitch { * @param hash * @return if it exists, the name of the database is returned, if it not exists, null is returned */ - public HarvestProcess urlExists(final String hash) { - if (this.index.getLoadTime(hash) >= 0) return HarvestProcess.LOADED; + public HarvestProcess urlExists(final String hash) throws IOException { + LoadTimeURL md = this.index.fulltext().getDefaultConnector().getLoadTimeURL(hash); + if (md != null && md.date >= 0) return HarvestProcess.LOADED; HarvestProcess hp = this.crawlQueues.exists(ASCII.getBytes(hash)); if (hp != null) return hp; - try { - LoadTimeURL md = this.index.fulltext().getDefaultConnector().getLoadTimeURL(hash); - if (md == null) return null; - return HarvestProcess.LOADED; // todo: can also be in error - } catch (IOException e) { - ConcurrentLog.logException(e); - return null; - } + return null; // todo: can also be in error } public void urlRemove(final Segment segment, final byte[] hash) { @@ -1640,7 +1634,7 @@ public final class Switchboard extends serverSwitch { this.crawlQueues.removeURL(hash); } - public DigestURL getURL(final byte[] urlhash) { + public DigestURL getURL(final byte[] urlhash) throws IOException { if (urlhash == null) return null; if (urlhash.length == 0) return null; final DigestURL url = this.index.fulltext().getURL(ASCII.String(urlhash)); @@ -2977,7 +2971,15 @@ public final class Switchboard extends serverSwitch { // stacking may fail because of double occurrences of that url. Therefore // we must wait here until the url has actually disappeared int t = 100; - while (t-- > 0 && this.index.getLoadTime(ASCII.String(urlhash)) >= 0) { + while (t-- > 0) { + try { + long lt = this.index.getLoadTime(ASCII.String(urlhash)); + if (lt < 0) break; + } catch (IOException e) { + // if this fails, the url may still exist + // we should abandon the whole process + return "exist-test failed: " + e.getMessage(); + } try {Thread.sleep(100);} catch (final InterruptedException e) {} ConcurrentLog.fine("Switchboard", "STACKURL: waiting for deletion, t=" + t); //if (t == 20) this.index.fulltext().commit(true); @@ -3094,9 +3096,17 @@ public final class Switchboard extends serverSwitch { final List requests = new ArrayList(); for (Map.Entry e: urlmap.entrySet()) { final String urlName = e.getValue().toNormalform(true); - if (doublecheck && this.index.getLoadTime(e.getKey()) >= 0) { - this.log.info("addToIndex: double " + urlName); - continue; + if (doublecheck) { + try { + if (this.index.getLoadTime(e.getKey()) >= 0) { + this.log.info("addToIndex: double " + urlName); + continue; + } + } catch (IOException ee) { + // double check fail may mean that the url exist + this.log.info("addToIndex: doublecheck failed for " + urlName + ": " + ee.getMessage()); + continue; + } } final Request request = this.loader.request(e.getValue(), true, true); final CrawlProfile profile = this.crawler.get(ASCII.getBytes(request.profileHandle())); @@ -3168,7 +3178,11 @@ public final class Switchboard extends serverSwitch { Map urlmap = new HashMap(); for (DigestURL url: urls) urlmap.put(ASCII.String(url.hash()), url); for (Map.Entry e: urlmap.entrySet()) { - if (this.index.getLoadTime(e.getKey()) >= 0) continue; // double + try { + if (this.index.getLoadTime(e.getKey()) >= 0) continue; // double + } catch (IOException ee) { + continue; // if the check fails, consider the url as double + } DigestURL url = e.getValue(); final Request request = this.loader.request(url, true, true); final CrawlProfile profile = this.crawler.get(ASCII.getBytes(request.profileHandle())); diff --git a/source/net/yacy/search/index/Fulltext.java b/source/net/yacy/search/index/Fulltext.java index cebdceeb0..dfea34473 100644 --- a/source/net/yacy/search/index/Fulltext.java +++ b/source/net/yacy/search/index/Fulltext.java @@ -473,16 +473,12 @@ public final class Fulltext { return false; } - public DigestURL getURL(final String urlHash) { + public DigestURL getURL(final String urlHash) throws IOException { if (urlHash == null || this.getDefaultConnector() == null) return null; - try { - SolrConnector.LoadTimeURL md = this.getDefaultConnector().getLoadTimeURL(urlHash); - if (md == null) return null; - return new DigestURL(md.url, ASCII.getBytes(urlHash)); - } catch (final IOException e) { - return null; - } + SolrConnector.LoadTimeURL md = this.getDefaultConnector().getLoadTimeURL(urlHash); + if (md == null) return null; + return new DigestURL(md.url, ASCII.getBytes(urlHash)); } /** @@ -490,16 +486,11 @@ public final class Fulltext { * @param urlHash * @return the time in milliseconds since epoch for the load time or -1 if the document does not exist */ - public long getLoadTime(final String urlHash) { + public long getLoadTime(final String urlHash) throws IOException { if (urlHash == null) return -1l; - try { - SolrConnector.LoadTimeURL md = this.getDefaultConnector().getLoadTimeURL(urlHash); - if (md == null) return -1l; - return md.date; - } catch (final Throwable e) { - ConcurrentLog.logException(e); - } - return -1l; + SolrConnector.LoadTimeURL md = this.getDefaultConnector().getLoadTimeURL(urlHash); + if (md == null) return -1l; + return md.date; } public List dumpFiles() { diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index 2b8cc7fdf..9dbb07297 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -356,7 +356,7 @@ public class Segment { * @param urlHash * @return the time in milliseconds since epoch for the load time or -1 if the document does not exist */ - public long getLoadTime(final String urlhash) { + public long getLoadTime(final String urlhash) throws IOException { return this.fulltext.getLoadTime(urlhash); } @@ -683,10 +683,10 @@ public class Segment { if (urlhash == null) return 0; // determine the url string - final DigestURL url = fulltext().getURL(ASCII.String(urlhash)); - if (url == null) return 0; - try { + final DigestURL url = fulltext().getURL(ASCII.String(urlhash)); + if (url == null) return 0; + // parse the resource final Document document = Document.mergeDocuments(url, null, loader.loadDocuments(loader.request(url, true, false), cacheStrategy, Integer.MAX_VALUE, null, agent)); if (document == null) {