better error handling for remote solr queries and exists-checks

orbiter 2014-08-01 11:00:10 +02:00
parent b510b182d8
commit 22ce4fb4dd
24 changed files with 292 additions and 185 deletions
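
The pattern repeated across all 24 files: the exists-checks getURL(), getLoadTime() and urlExists() now declare "throws IOException" instead of swallowing remote-Solr failures and answering "not indexed", so every caller decides explicitly how to degrade. A minimal sketch of the new caller contract (the helper class is hypothetical; Fulltext and its two methods are the YaCy code changed below):

    import java.io.IOException;

    import net.yacy.cora.document.id.DigestURL;
    import net.yacy.search.index.Fulltext;

    public class ExistsCheckSketch {

        // an exists-check: a remote Solr failure is now an explicit
        // IOException, no longer a silent "absent" answer
        public static boolean isLoaded(final Fulltext fulltext, final String urlhash) throws IOException {
            return fulltext.getLoadTime(urlhash) >= 0; // -1 means: not in the index
        }

        // display-side degradation: log and carry on with null; crawler-side
        // callers in this commit reject the URL instead (see CrawlStacker)
        public static DigestURL resolveOrNull(final Fulltext fulltext, final String urlhash) {
            try {
                return fulltext.getURL(urlhash);
            } catch (final IOException e) {
                return null;
            }
        }
    }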

View File

@@ -138,8 +138,13 @@ public class HostBrowser {
         String load = post.get("load", "");
         boolean wait = false;
-        if (loadRight && autoload && path.length() != 0 && pathURI != null && load.length() == 0 && sb.index.getLoadTime(ASCII.String(pathURI.hash())) < 0) {
-            // in case that the url does not exist and loading is wanted turn this request into a loading request
+        try {
+            if (loadRight && autoload && path.length() != 0 && pathURI != null && load.length() == 0 && sb.index.getLoadTime(ASCII.String(pathURI.hash())) < 0) {
+                // in case that the url does not exist and loading is wanted turn this request into a loading request
+                load = path;
+                wait = true;
+            }
+        } catch (IOException e1) {
             load = path;
             wait = true;
         }
@@ -156,8 +161,13 @@ public class HostBrowser {
                 0, 0, 0
             ));
             prop.putHTML("result", reasonString == null ? ("added url to indexer: " + load) : ("not indexed url '" + load + "': " + reasonString));
-            if (wait) for (int i = 0; i < 30; i++) {
-                if (sb.index.getLoadTime(ASCII.String(url.hash())) >= 0) break;
+            if (wait) waitloop: for (int i = 0; i < 30; i++) {
+                try {
+                    if (sb.index.getLoadTime(ASCII.String(url.hash())) >= 0) break;
+                } catch (IOException e1) {
+                    e1.printStackTrace();
+                    break waitloop;
+                }
                 try {Thread.sleep(100);} catch (final InterruptedException e) {}
             }
         } catch (final MalformedURLException e) {

View File

@@ -369,8 +369,7 @@ public class IndexControlRWIs_p {
                     Word.commonHashOrder,
                     urlb.size());
                 if ( post.containsKey("blacklisturls") ) {
-                    final String[] supportedBlacklistTypes =
-                        env.getConfig("BlackLists.types", "").split(",");
+                    final String[] supportedBlacklistTypes = env.getConfig("BlackLists.types", "").split(",");
                     DigestURL url;
                     for ( final byte[] b : urlb ) {
                         try {
@@ -378,28 +377,32 @@ public class IndexControlRWIs_p {
                         } catch (final SpaceExceededException e ) {
                             ConcurrentLog.logException(e);
                         }
-                        url = segment.fulltext().getURL(ASCII.String(b));
-                        segment.fulltext().remove(b);
-                        if ( url != null ) {
-                            for ( final String supportedBlacklistType : supportedBlacklistTypes ) {
-                                if ( ListManager.listSetContains(
-                                    supportedBlacklistType + ".BlackLists",
-                                    blacklist) ) {
-                                    try {
-                                        Switchboard.urlBlacklist.add(
-                                            BlacklistType.valueOf(supportedBlacklistType),
-                                            blacklist,
-                                            url.getHost(),
-                                            url.getFile());
-                                    } catch (PunycodeException e) {
-                                        ConcurrentLog.warn(APP_NAME,
-                                            "Unable to add blacklist entry to blacklist "
-                                                + supportedBlacklistType, e);
-                                    }
-                                }
-                            }
-                            SearchEventCache.cleanupEvents(true);
-                        }
+                        try {
+                            url = segment.fulltext().getURL(ASCII.String(b));
+                            segment.fulltext().remove(b);
+                            if ( url != null ) {
+                                for ( final String supportedBlacklistType : supportedBlacklistTypes ) {
+                                    if ( ListManager.listSetContains(
+                                        supportedBlacklistType + ".BlackLists",
+                                        blacklist) ) {
+                                        try {
+                                            Switchboard.urlBlacklist.add(
+                                                BlacklistType.valueOf(supportedBlacklistType),
+                                                blacklist,
+                                                url.getHost(),
+                                                url.getFile());
+                                        } catch (PunycodeException e) {
+                                            ConcurrentLog.warn(APP_NAME,
+                                                "Unable to add blacklist entry to blacklist "
+                                                    + supportedBlacklistType, e);
+                                        }
+                                    }
+                                }
+                                SearchEventCache.cleanupEvents(true);
+                            }
+                        } catch (IOException e1) {
+                            ConcurrentLog.logException(e1);
+                        }
                     }
                 }
@@ -411,27 +414,29 @@ public class IndexControlRWIs_p {
                         } catch (final SpaceExceededException e ) {
                             ConcurrentLog.logException(e);
                         }
-                        url = segment.fulltext().getURL(ASCII.String(b));
-                        segment.fulltext().remove(b);
-                        if ( url != null ) {
-                            for ( final BlacklistType supportedBlacklistType : BlacklistType.values() ) {
-                                if ( ListManager.listSetContains(
-                                    supportedBlacklistType + ".BlackLists",
-                                    blacklist) ) {
-                                    try {
-                                        Switchboard.urlBlacklist.add(
-                                            supportedBlacklistType,
-                                            blacklist,
-                                            url.getHost(),
-                                            ".*");
-                                    } catch (PunycodeException e) {
-                                        ConcurrentLog.warn(APP_NAME,
-                                            "Unable to add blacklist entry to blacklist "
-                                                + supportedBlacklistType, e);
-                                    }
-                                }
-                            }
-                        }
+                        try {
+                            url = segment.fulltext().getURL(ASCII.String(b));
+                            segment.fulltext().remove(b);
+                            if ( url != null ) {
+                                for ( final BlacklistType supportedBlacklistType : BlacklistType.values() ) {
+                                    if ( ListManager.listSetContains(supportedBlacklistType + ".BlackLists", blacklist) ) {
+                                        try {
+                                            Switchboard.urlBlacklist.add(
+                                                supportedBlacklistType,
+                                                blacklist,
+                                                url.getHost(),
+                                                ".*");
+                                        } catch (PunycodeException e) {
+                                            ConcurrentLog.warn(APP_NAME,
+                                                "Unable to add blacklist entry to blacklist "
+                                                    + supportedBlacklistType, e);
+                                        }
+                                    }
+                                }
+                            }
+                        } catch (IOException e1) {
+                            ConcurrentLog.logException(e1);
+                        }
                     }
                 }
                 try {

View File

@@ -183,14 +183,19 @@ public class IndexControlURLs_p {
         }
         if (post.containsKey("urlhashdelete")) {
-            final DigestURL url = segment.fulltext().getURL(urlhash);
-            if (url == null) {
-                prop.putHTML("result", "No Entry for URL hash " + urlhash + "; nothing deleted.");
-            } else {
-                urlstring = url.toNormalform(true);
-                prop.put("urlstring", "");
-                sb.urlRemove(segment, urlhash.getBytes());
-                prop.putHTML("result", "Removed URL " + urlstring);
+            DigestURL url;
+            try {
+                url = segment.fulltext().getURL(urlhash);
+                if (url == null) {
+                    prop.putHTML("result", "No Entry for URL hash " + urlhash + "; nothing deleted.");
+                } else {
+                    urlstring = url.toNormalform(true);
+                    prop.put("urlstring", "");
+                    sb.urlRemove(segment, urlhash.getBytes());
+                    prop.putHTML("result", "Removed URL " + urlstring);
+                }
+            } catch (IOException e) {
+                prop.putHTML("result", "Error when querying the url hash " + urlhash + ":" + e.getMessage());
             }
         }

View File

@@ -352,17 +352,23 @@ public class Load_RSS_p {
                     author = item.getAuthor();
                     if (author == null) author = item.getCopyright();
                     pubDate = item.getPubDate();
-                    HarvestProcess harvestProcess = sb.urlExists(ASCII.String(messageurl.hash()));
-                    prop.put("showitems_item_" + i + "_state", harvestProcess != null ? 2 : RSSLoader.indexTriggered.containsKey(messageurl.hash()) ? 1 : 0);
-                    prop.put("showitems_item_" + i + "_state_count", i);
-                    prop.putHTML("showitems_item_" + i + "_state_guid", item.getGuid());
-                    prop.putHTML("showitems_item_" + i + "_author", author == null ? "" : author);
-                    prop.putHTML("showitems_item_" + i + "_title", item.getTitle());
-                    prop.putHTML("showitems_item_" + i + "_link", messageurl.toNormalform(true));
-                    prop.putHTML("showitems_item_" + i + "_description", item.getDescriptions().toString());
-                    prop.putHTML("showitems_item_" + i + "_language", item.getLanguage());
-                    prop.putHTML("showitems_item_" + i + "_date", (pubDate == null) ? "" : DateFormat.getDateTimeInstance().format(pubDate));
-                    i++;
+                    HarvestProcess harvestProcess;
+                    try {
+                        harvestProcess = sb.urlExists(ASCII.String(messageurl.hash()));
+                        prop.put("showitems_item_" + i + "_state", harvestProcess != null ? 2 : RSSLoader.indexTriggered.containsKey(messageurl.hash()) ? 1 : 0);
+                        prop.put("showitems_item_" + i + "_state_count", i);
+                        prop.putHTML("showitems_item_" + i + "_state_guid", item.getGuid());
+                        prop.putHTML("showitems_item_" + i + "_author", author == null ? "" : author);
+                        prop.putHTML("showitems_item_" + i + "_title", item.getTitle());
+                        prop.putHTML("showitems_item_" + i + "_link", messageurl.toNormalform(true));
+                        prop.putHTML("showitems_item_" + i + "_description", item.getDescriptions().toString());
+                        prop.putHTML("showitems_item_" + i + "_language", item.getLanguage());
+                        prop.putHTML("showitems_item_" + i + "_date", (pubDate == null) ? "" : DateFormat.getDateTimeInstance().format(pubDate));
+                        i++;
+                    } catch (IOException e) {
+                        ConcurrentLog.logException(e);
+                        continue;
+                    }
                 } catch (final MalformedURLException e) {
                     ConcurrentLog.logException(e);
                     continue;

View File

@@ -35,6 +35,7 @@ import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.federate.solr.connector.SolrConnector;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.sorting.OrderedScoreMap;
+import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.document.SentenceReader;
 import net.yacy.search.Switchboard;
 import net.yacy.search.index.Segment;
@@ -86,10 +87,14 @@ public class citation {
             } catch (final MalformedURLException e) {}
         }
         if (uri == null && hash.length() > 0) {
-            uri = sb.getURL(ASCII.getBytes(hash));
-            if (uri == null) {
-                connector.commit(true); // try again, that url can be fresh
-                uri = sb.getURL(ASCII.getBytes(hash));
+            try {
+                uri = sb.getURL(ASCII.getBytes(hash));
+                if (uri == null) {
+                    connector.commit(true); // try again, that url can be fresh
+                    uri = sb.getURL(ASCII.getBytes(hash));
+                }
+            } catch (IOException e) {
+                ConcurrentLog.logException(e);
             }
         }
         if (uri == null) return prop; // no proper url addressed

View File

@@ -17,6 +17,7 @@
 // along with this program; if not, write to the Free Software
 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

+import java.io.IOException;
 import java.net.MalformedURLException;

 import net.yacy.cora.document.encoding.ASCII;
@@ -25,6 +26,7 @@ import net.yacy.cora.order.Base64Order;
 import net.yacy.cora.protocol.HeaderFramework;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.protocol.ResponseHeader;
+import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.search.Switchboard;
 import net.yacy.search.index.Fulltext;
 import net.yacy.search.schema.HyperlinkEdge;
@@ -59,7 +61,11 @@ public class linkstructure {
         String hostname = null;
         if (about.length() == 12 && Base64Order.enhancedCoder.wellformed(ASCII.getBytes(about))) {
             byte[] urlhash = ASCII.getBytes(about);
-            url = authenticated ? sb.getURL(urlhash) : null;
+            try {
+                url = authenticated ? sb.getURL(urlhash) : null;
+            } catch (IOException e) {
+                ConcurrentLog.logException(e);
+            }
         } else if (url == null && about.length() > 0) { // consider "about" as url or hostname
             url = new DigestURL(about.indexOf("://") >= 0 ? about : "http://" + about); // accept also domains
             hostname = url.getHost();

View File

@@ -65,7 +65,12 @@ public class webstructure {
         } else if (about.length() == 12 && Base64Order.enhancedCoder.wellformed(ASCII.getBytes(about))) {
             urlhash = ASCII.getBytes(about);
             hosthash = about.substring(6);
-            url = authenticated ? sb.getURL(urlhash) : null;
+            try {
+                url = authenticated ? sb.getURL(urlhash) : null;
+            } catch (IOException e) {
+                url = null;
+                ConcurrentLog.logException(e);
+            }
         } else if (about.length() > 0) {
             // consider "about" as url or hostname
             try {
@@ -156,12 +161,17 @@ public class webstructure {
             Iterator<byte[]> i = ids.iterator();
             while (i.hasNext()) {
                 byte[] refhash = i.next();
-                DigestURL refurl = authenticated ? sb.getURL(refhash) : null;
-                prop.put("citations_documents_0_anchors_" + d + "_urle", refurl == null ? 0 : 1);
-                if (refurl != null) prop.putXML("citations_documents_0_anchors_" + d + "_urle_url", refurl.toNormalform(true));
-                prop.put("citations_documents_0_anchors_" + d + "_urle_hash", refhash);
-                prop.put("citations_documents_0_anchors_" + d + "_urle_date", GenericFormatter.SHORT_DAY_FORMATTER.format(new Date())); // superfluous?
-                d++;
+                DigestURL refurl;
+                try {
+                    refurl = authenticated ? sb.getURL(refhash) : null;
+                    prop.put("citations_documents_0_anchors_" + d + "_urle", refurl == null ? 0 : 1);
+                    if (refurl != null) prop.putXML("citations_documents_0_anchors_" + d + "_urle_url", refurl.toNormalform(true));
+                    prop.put("citations_documents_0_anchors_" + d + "_urle_hash", refhash);
+                    prop.put("citations_documents_0_anchors_" + d + "_urle_date", GenericFormatter.SHORT_DAY_FORMATTER.format(new Date())); // superfluous?
+                    d++;
+                } catch (IOException e) {
+                    ConcurrentLog.logException(e);
+                }
             }
             prop.put("citations_documents_0_count", d);
             prop.put("citations_documents_0_anchors", d);

View File

@@ -34,18 +34,16 @@ public class add_ymark {
         if (post.containsKey("urlHash")) {
             final String urlHash = post.get("urlHash",YMarkUtil.EMPTY_STRING);
-            final DigestURL url = sb.index.fulltext().getURL(urlHash);
-            final String folders = post.get(YMarkEntry.BOOKMARK.FOLDERS.key(),YMarkEntry.BOOKMARK.FOLDERS.deflt());
-            final String tags = post.get(YMarkEntry.BOOKMARK.TAGS.key(),YMarkUtil.EMPTY_STRING);
             try {
+                final DigestURL url = sb.index.fulltext().getURL(urlHash);
+                final String folders = post.get(YMarkEntry.BOOKMARK.FOLDERS.key(),YMarkEntry.BOOKMARK.FOLDERS.deflt());
+                final String tags = post.get(YMarkEntry.BOOKMARK.TAGS.key(),YMarkUtil.EMPTY_STRING);
                 ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
                 sb.tables.bookmarks.createBookmark(sb.loader, url, agent, bmk_user, true, tags, folders);
                 prop.put("status", "1");
             } catch (final IOException e) {
-                // TODO Auto-generated catch block
                 ConcurrentLog.logException(e);
             } catch (final Failure e) {
-                // TODO Auto-generated catch block
                 ConcurrentLog.logException(e);
             }

View File

@@ -27,6 +27,7 @@
 // javac -classpath .:../classes transferRWI.java

+import java.io.IOException;
 import java.util.ArrayList;
 import java.util.HashSet;
 import java.util.Iterator;
@@ -234,9 +235,14 @@ public final class transferRWI {
                 }
                 for (String id: testids) {
                     try {
-                        if (sb.index.fulltext().getLoadTime(id) >= 0) {
-                            knownURL.put(ASCII.getBytes(id));
-                        } else {
+                        try {
+                            if (sb.index.fulltext().getLoadTime(id) >= 0) {
+                                knownURL.put(ASCII.getBytes(id));
+                            } else {
+                                unknownURL.put(ASCII.getBytes(id));
+                            }
+                        } catch (IOException e) {
+                            ConcurrentLog.logException(e);
                             unknownURL.put(ASCII.getBytes(id));
                         }
                     } catch (final SpaceExceededException e) {

View File

@@ -144,7 +144,14 @@ public final class transferURL {
                 doublecheck = 0;
                 for (String id : lEm.keySet()) {
-                    if (sb.index.getLoadTime(id) < 0) {
+                    long lt = -1;
+                    try {
+                        lt = sb.index.getLoadTime(id);
+                    } catch (IOException e1) {
+                        lt = -1;
+                        ConcurrentLog.logException(e1);
+                    }
+                    if (lt < 0) {
                         lEntry = lEm.get(id);
                         // write entry to database

View File

@@ -29,6 +29,7 @@ import net.yacy.cora.date.GenericFormatter;
 import net.yacy.cora.document.encoding.ASCII;
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.protocol.RequestHeader;
+import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.crawler.data.NoticedURL;
 import net.yacy.crawler.retrieval.Request;
 import net.yacy.kelondro.data.meta.URIMetadataNode;
@@ -74,7 +75,12 @@ public class urls {
                 if (entry == null) break;

                 // find referrer, if there is one
-                referrer = sb.getURL(entry.referrerhash());
+                try {
+                    referrer = sb.getURL(entry.referrerhash());
+                } catch (IOException e) {
+                    referrer = null;
+                    ConcurrentLog.logException(e);
+                }

                 // place url to notice-url db
                 sb.crawlQueues.delegatedURL.put(ASCII.String(entry.url().hash()), entry.url());
@@ -106,16 +112,20 @@ public class urls {
                 entry = sb.index.fulltext().getMetadata(ASCII.getBytes(urlhashes.substring(12 * i, 12 * (i + 1))));
                 if (entry == null) continue;

                 // find referrer, if there is one
-                referrer = sb.getURL(entry.referrerHash());
-                // create RSS entry
-                prop.put("item_" + c + "_title", entry.dc_title());
-                prop.putXML("item_" + c + "_link", entry.url().toNormalform(true));
-                prop.putXML("item_" + c + "_referrer", (referrer == null) ? "" : referrer.toNormalform(true));
-                prop.putXML("item_" + c + "_description", entry.dc_title());
-                prop.put("item_" + c + "_author", entry.dc_creator());
-                prop.put("item_" + c + "_pubDate", GenericFormatter.SHORT_SECOND_FORMATTER.format(entry.moddate()));
-                prop.put("item_" + c + "_guid", ASCII.String(entry.hash()));
-                c++;
+                try {
+                    referrer = sb.getURL(entry.referrerHash());
+                    // create RSS entry
+                    prop.put("item_" + c + "_title", entry.dc_title());
+                    prop.putXML("item_" + c + "_link", entry.url().toNormalform(true));
+                    prop.putXML("item_" + c + "_referrer", (referrer == null) ? "" : referrer.toNormalform(true));
+                    prop.putXML("item_" + c + "_description", entry.dc_title());
+                    prop.put("item_" + c + "_author", entry.dc_creator());
+                    prop.put("item_" + c + "_pubDate", GenericFormatter.SHORT_SECOND_FORMATTER.format(entry.moddate()));
+                    prop.put("item_" + c + "_guid", ASCII.String(entry.hash()));
+                    c++;
+                } catch (IOException e) {
+                    ConcurrentLog.logException(e);
+                }
             }
             prop.put("item", c);
             prop.putXML("response", "ok");

View File

@@ -588,19 +588,23 @@ public class yacysearch {
                 return prop;
             }
             final String bookmarkHash = post.get("bookmarkref", ""); // urlhash
-            final DigestURL url = indexSegment.fulltext().getURL(bookmarkHash);
-            if ( url != null ) {
-                try {
-                    sb.tables.bookmarks.createBookmark(
-                        sb.loader,
-                        url,
-                        ClientIdentification.yacyInternetCrawlerAgent,
-                        YMarkTables.USER_ADMIN,
-                        true,
-                        "searchresult",
-                        "/search");
-                } catch (final Throwable e ) {
+            try {
+                final DigestURL url = indexSegment.fulltext().getURL(bookmarkHash);
+                if ( url != null ) {
+                    try {
+                        sb.tables.bookmarks.createBookmark(
+                            sb.loader,
+                            url,
+                            ClientIdentification.yacyInternetCrawlerAgent,
+                            YMarkTables.USER_ADMIN,
+                            true,
+                            "searchresult",
+                            "/search");
+                    } catch (final Throwable e ) {
+                    }
                 }
+            } catch (IOException e) {
+                ConcurrentLog.logException(e);
             }
         }

View File

@@ -313,9 +313,6 @@ public abstract class AbstractSolrConnector implements SolrConnector {
         //params.setQuery(CollectionSchema.id.getSolrFieldName() + ":\"" + id + "\"");
         String q = "{!raw f=" + CollectionSchema.id.getSolrFieldName() + "}" + id;
         params.setQuery(q);
-        //params.setQuery("*:*");
-        //params.addFilterQuery(q);
-        //params.set("defType", "raw");
         params.setRows(1);
         params.setStart(0);
         params.setFacet(false);

View File

@@ -405,7 +405,7 @@ public class EmbeddedSolrConnector extends SolrServerConnector implements SolrCo
      * @throws IOException
      */
     @Override
-    public LoadTimeURL getLoadTimeURL(String id) {
+    public LoadTimeURL getLoadTimeURL(String id) throws IOException {
         int responseCount = 0;
         DocListSearcher docListSearcher = null;
         try {
@@ -421,10 +421,10 @@ public class EmbeddedSolrConnector extends SolrServerConnector implements SolrCo
             //}
         } catch (Throwable e) {
             ConcurrentLog.logException(e);
+            throw new IOException(e.getMessage());
         } finally {
             if (docListSearcher != null) docListSearcher.close();
         }
-        return null;
     }

     @Override

View File

@@ -154,7 +154,7 @@ public interface SolrConnector extends Iterable<String> /* Iterable of document
      * @param query
      * @throws IOException
      */
-    public QueryResponse getResponseByParams(final ModifiableSolrParams query) throws IOException, SolrException;
+    public QueryResponse getResponseByParams(final ModifiableSolrParams query) throws IOException;

     /**
      * get the solr document list from a query response
@@ -165,7 +165,7 @@ public interface SolrConnector extends Iterable<String> /* Iterable of document
      * @throws IOException
      * @throws SolrException
      */
-    public SolrDocumentList getDocumentListByParams(ModifiableSolrParams params) throws IOException, SolrException;
+    public SolrDocumentList getDocumentListByParams(ModifiableSolrParams params) throws IOException;

     /**
      * get the number of results for a query response
@@ -174,7 +174,7 @@ public interface SolrConnector extends Iterable<String> /* Iterable of document
      * @throws IOException
      * @throws SolrException
      */
-    public long getDocumentCountByParams(ModifiableSolrParams params) throws IOException, SolrException;
+    public long getDocumentCountByParams(ModifiableSolrParams params) throws IOException;

     /**
      * get a query result from solr
@@ -191,7 +191,7 @@ public interface SolrConnector extends Iterable<String> /* Iterable of document
             final String sort,
             final int offset,
             final int count,
-            final String ... fields) throws IOException, SolrException;
+            final String ... fields) throws IOException;

     /**
      * get the number of results when this query is done.
View File

@@ -33,7 +33,6 @@ import net.yacy.search.schema.CollectionSchema;
 import org.apache.lucene.analysis.NumericTokenStream;
 import org.apache.solr.common.SolrDocumentList;
 import org.apache.solr.common.SolrException;
-import org.apache.solr.common.SolrException.ErrorCode;
 import org.apache.solr.common.SolrInputDocument;
 import org.apache.solr.common.params.ModifiableSolrParams;
 import org.apache.solr.common.util.NamedList;
@@ -289,7 +288,7 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen
      * @throws SolrException
      */
     @Override
-    public SolrDocumentList getDocumentListByParams(ModifiableSolrParams params) throws IOException, SolrException {
+    public SolrDocumentList getDocumentListByParams(ModifiableSolrParams params) throws IOException {
         if (this.server == null) throw new IOException("server disconnected");
         // during the solr query we set the thread name to the query string to get more debugging info in thread dumps
         String q = params.get("q");
@@ -297,18 +296,25 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen
         String threadname = Thread.currentThread().getName();
         if (q != null) Thread.currentThread().setName("solr query: q = " + q + (fq == null ? "" : ", fq = " + fq));
         QueryResponse rsp;
-        try {
-            rsp = this.server.query(params);
-            if (q != null) Thread.currentThread().setName(threadname);
-            if (rsp != null) if (log.isFine()) log.fine(rsp.getResults().getNumFound() + " results for q=" + q);
-            return rsp.getResults();
-        } catch (final SolrServerException e) {
-            clearCaches(); // prevent further OOM if this was caused by OOM
-            throw new SolrException(ErrorCode.UNKNOWN, e);
-        } catch (final Throwable e) {
-            clearCaches(); // prevent further OOM if this was caused by OOM
-            throw new IOException("Error executing query", e);
+        int retry = 10;
+        Throwable error = null;
+        while (retry-- > 0) {
+            try {
+                rsp = this.server.query(params);
+                if (q != null) Thread.currentThread().setName(threadname);
+                if (rsp != null) if (log.isFine()) log.fine(rsp.getResults().getNumFound() + " results for q=" + q);
+                return rsp.getResults();
+            } catch (final SolrServerException e) {
+                error = e;
+                clearCaches(); // prevent further OOM if this was caused by OOM
+            } catch (final Throwable e) {
+                error = e;
+                clearCaches(); // prevent further OOM if this was caused by OOM
+            }
+            ConcurrentLog.severe("SolrServerConnector", "Failed to query remote Solr: " + error.getMessage() + ", query:" + q + (fq == null ? "" : ", fq = " + fq));
+            try {Thread.sleep(1000);} catch (InterruptedException e) {}
         }
+        throw new IOException("Error executing query", error);
     }

     // luke requests: these do not work for attached SolrCloud Server
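
A note on the design choice above: getDocumentListByParams() no longer converts a SolrServerException into a SolrException. It retries the query up to ten times, one second apart, clears the caches after every failure (these failures are often OOM-induced), and only the last cause is wrapped into the final IOException. Reduced to a skeleton (query() stands in for this.server.query(params); the constants 10 and 1000 ms are taken from the patch above):

    int retry = 10;
    Throwable error = null;
    while (retry-- > 0) {
        try {
            return query();      // success: leave the loop immediately
        } catch (final Throwable e) {
            error = e;           // remember the last cause
            clearCaches();       // prevent further OOM if this was caused by OOM
        }
        try { Thread.sleep(1000); } catch (final InterruptedException e) {}
    }
    throw new IOException("Error executing query", error); // after ten failures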

View File

@@ -388,7 +388,10 @@ public final class CrawlStacker {
         try {
             oldEntry = this.indexSegment.fulltext().getDefaultConnector().getLoadTimeURL(urlhash);
         } catch (IOException e) {
+            // if an exception here occurs then there is the danger that urls which had been in the crawler are overwritten a second time
+            // to prevent that, we reject urls in these events
             ConcurrentLog.logException(e);
+            return "exception during double-test: " + e.getMessage();
         }
         final Long oldDate = oldEntry == null ? null : oldEntry.date;
         if (oldDate == null) {

View File

@@ -108,11 +108,16 @@ public class RSSLoader extends Thread {
             }
         }
         for (final Map.Entry<String, DigestURL> e: urlmap.entrySet()) {
-            HarvestProcess harvestProcess = sb.urlExists(e.getKey());
-            if (harvestProcess != null) continue;
-            list.add(e.getValue());
-            indexTriggered.insertIfAbsent(ASCII.getBytes(e.getKey()), new Date());
-            loadCount++;
+            HarvestProcess harvestProcess;
+            try {
+                harvestProcess = sb.urlExists(e.getKey());
+                if (harvestProcess != null) continue;
+                list.add(e.getValue());
+                indexTriggered.insertIfAbsent(ASCII.getBytes(e.getKey()), new Date());
+                loadCount++;
+            } catch (IOException e1) {
+                ConcurrentLog.logException(e1);
+            }
         }
         sb.addToIndex(list, null, null, collections, true);
         // update info for loading

View File

@@ -25,6 +25,7 @@
 package net.yacy.crawler.retrieval;

+import java.io.IOException;
 import java.net.MalformedURLException;
 import java.util.Date;
@@ -82,15 +83,20 @@ public class SitemapImporter extends Thread {
         // check if the url is known and needs to be recrawled
         Date lastMod = entry.lastmod(null);
         if (lastMod != null) {
-            final HarvestProcess dbocc = this.sb.urlExists(ASCII.String(nexturlhash));
-            if (dbocc != null && dbocc == HarvestProcess.LOADED) {
-                // the url was already loaded. we need to check the date
-                final URIMetadataNode oldEntry = this.sb.index.fulltext().getMetadata(nexturlhash);
-                if (oldEntry != null) {
-                    final Date modDate = oldEntry.moddate();
-                    // check if modDate is null
-                    if (modDate.after(lastMod)) return;
+            HarvestProcess dbocc;
+            try {
+                dbocc = this.sb.urlExists(ASCII.String(nexturlhash));
+                if (dbocc != null && dbocc == HarvestProcess.LOADED) {
+                    // the url was already loaded. we need to check the date
+                    final URIMetadataNode oldEntry = this.sb.index.fulltext().getMetadata(nexturlhash);
+                    if (oldEntry != null) {
+                        final Date modDate = oldEntry.moddate();
+                        // check if modDate is null
+                        if (modDate.after(lastMod)) return;
+                    }
                 }
+            } catch (IOException e) {
+                ConcurrentLog.logException(e);
             }
         }

View File

@@ -35,6 +35,7 @@ import net.yacy.cora.document.encoding.ASCII;
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.federate.yacy.CacheStrategy;
 import net.yacy.cora.protocol.ClientIdentification;
+import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.crawler.retrieval.Response;
 import net.yacy.document.Document;
 import net.yacy.document.Parser.Failure;
@@ -82,7 +83,12 @@ public class YMarkMetadata {
     public YMarkMetadata(final byte[] urlHash, final Segment indexSegment) {
         this.document = null;
         this.indexSegment = indexSegment;
-        this.uri = this.indexSegment.fulltext().getURL(ASCII.String(urlHash));
+        try {
+            this.uri = this.indexSegment.fulltext().getURL(ASCII.String(urlHash));
+        } catch (IOException e) {
+            this.uri = null;
+            ConcurrentLog.logException(e);
+        }
     }

     public YMarkMetadata(final Document document) {

View File

@@ -24,6 +24,7 @@
 package net.yacy.peers;

+import java.io.IOException;
 import java.util.ArrayList;
 import java.util.HashSet;
 import java.util.Iterator;
@@ -174,9 +175,15 @@ public class Transmission {
             i = c.entries();
             while (i.hasNext()) {
                 final WordReference e = i.next();
-                if (Transmission.this.segment.fulltext().getLoadTime(ASCII.String(e.urlhash())) >= 0) {
-                    this.references.put(e.urlhash());
-                } else {
+                try {
+                    if (Transmission.this.segment.fulltext().getLoadTime(ASCII.String(e.urlhash())) >= 0) {
+                        this.references.put(e.urlhash());
+                    } else {
+                        notFoundx.add(e.urlhash());
+                        this.badReferences.put(e.urlhash());
+                    }
+                } catch (IOException e1) {
+                    ConcurrentLog.logException(e1);
                     notFoundx.add(e.urlhash());
                     this.badReferences.put(e.urlhash());
                 }

View File

@@ -1620,18 +1620,12 @@ public final class Switchboard extends serverSwitch {
      * @param hash
      * @return if it exists, the name of the database is returned, if it not exists, null is returned
      */
-    public HarvestProcess urlExists(final String hash) {
-        if (this.index.getLoadTime(hash) >= 0) return HarvestProcess.LOADED;
+    public HarvestProcess urlExists(final String hash) throws IOException {
+        LoadTimeURL md = this.index.fulltext().getDefaultConnector().getLoadTimeURL(hash);
+        if (md != null && md.date >= 0) return HarvestProcess.LOADED;
         HarvestProcess hp = this.crawlQueues.exists(ASCII.getBytes(hash));
         if (hp != null) return hp;
-        try {
-            LoadTimeURL md = this.index.fulltext().getDefaultConnector().getLoadTimeURL(hash);
-            if (md == null) return null;
-            return HarvestProcess.LOADED; // todo: can also be in error
-        } catch (IOException e) {
-            ConcurrentLog.logException(e);
-            return null;
-        }
+        return null; // todo: can also be in error
     }

     public void urlRemove(final Segment segment, final byte[] hash) {
@@ -1640,7 +1634,7 @@ public final class Switchboard extends serverSwitch {
         this.crawlQueues.removeURL(hash);
     }

-    public DigestURL getURL(final byte[] urlhash) {
+    public DigestURL getURL(final byte[] urlhash) throws IOException {
         if (urlhash == null) return null;
         if (urlhash.length == 0) return null;
         final DigestURL url = this.index.fulltext().getURL(ASCII.String(urlhash));
@@ -2977,7 +2971,15 @@ public final class Switchboard extends serverSwitch {
         // stacking may fail because of double occurrences of that url. Therefore
         // we must wait here until the url has actually disappeared
         int t = 100;
-        while (t-- > 0 && this.index.getLoadTime(ASCII.String(urlhash)) >= 0) {
+        while (t-- > 0) {
+            try {
+                long lt = this.index.getLoadTime(ASCII.String(urlhash));
+                if (lt < 0) break;
+            } catch (IOException e) {
+                // if this fails, the url may still exist
+                // we should abandon the whole process
+                return "exist-test failed: " + e.getMessage();
+            }
             try {Thread.sleep(100);} catch (final InterruptedException e) {}
             ConcurrentLog.fine("Switchboard", "STACKURL: waiting for deletion, t=" + t);
             //if (t == 20) this.index.fulltext().commit(true);
@@ -3094,9 +3096,17 @@ public final class Switchboard extends serverSwitch {
         final List<Request> requests = new ArrayList<Request>();
         for (Map.Entry<String, DigestURL> e: urlmap.entrySet()) {
             final String urlName = e.getValue().toNormalform(true);
-            if (doublecheck && this.index.getLoadTime(e.getKey()) >= 0) {
-                this.log.info("addToIndex: double " + urlName);
-                continue;
+            if (doublecheck) {
+                try {
+                    if (this.index.getLoadTime(e.getKey()) >= 0) {
+                        this.log.info("addToIndex: double " + urlName);
+                        continue;
+                    }
+                } catch (IOException ee) {
+                    // double check fail may mean that the url exist
+                    this.log.info("addToIndex: doublecheck failed for " + urlName + ": " + ee.getMessage());
+                    continue;
+                }
             }
             final Request request = this.loader.request(e.getValue(), true, true);
             final CrawlProfile profile = this.crawler.get(ASCII.getBytes(request.profileHandle()));
@@ -3168,7 +3178,11 @@ public final class Switchboard extends serverSwitch {
         Map<String, DigestURL> urlmap = new HashMap<String, DigestURL>();
         for (DigestURL url: urls) urlmap.put(ASCII.String(url.hash()), url);
         for (Map.Entry<String, DigestURL> e: urlmap.entrySet()) {
-            if (this.index.getLoadTime(e.getKey()) >= 0) continue; // double
+            try {
+                if (this.index.getLoadTime(e.getKey()) >= 0) continue; // double
+            } catch (IOException ee) {
+                continue; // if the check fails, consider the url as double
+            }
             DigestURL url = e.getValue();
             final Request request = this.loader.request(url, true, true);
             final CrawlProfile profile = this.crawler.get(ASCII.getBytes(request.profileHandle()));
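
Note how the callers in Switchboard degrade differently: stackUrl aborts stacking with "exist-test failed", while addToIndex treats a failed doublecheck as "double" and skips the URL; both err on the side of never loading a document twice. A compact illustration of that policy (the helper is hypothetical; Segment.getLoadTime is the real method changed below):

    import java.io.IOException;

    import net.yacy.search.index.Segment;

    final class DoubleCheckSketch {
        // fail-safe policy used above: when the exists-check itself fails,
        // assume the URL might already exist and do not load it again
        static boolean mayLoad(final Segment segment, final String urlhash) {
            try {
                return segment.getLoadTime(urlhash) < 0; // < 0: unknown to the index
            } catch (final IOException e) {
                return false; // unknown answer: treat as a double, reject the load
            }
        }
    }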

View File

@@ -473,16 +473,12 @@ public final class Fulltext {
         return false;
     }

-    public DigestURL getURL(final String urlHash) {
+    public DigestURL getURL(final String urlHash) throws IOException {
         if (urlHash == null || this.getDefaultConnector() == null) return null;
-        try {
-            SolrConnector.LoadTimeURL md = this.getDefaultConnector().getLoadTimeURL(urlHash);
-            if (md == null) return null;
-            return new DigestURL(md.url, ASCII.getBytes(urlHash));
-        } catch (final IOException e) {
-            return null;
-        }
+        SolrConnector.LoadTimeURL md = this.getDefaultConnector().getLoadTimeURL(urlHash);
+        if (md == null) return null;
+        return new DigestURL(md.url, ASCII.getBytes(urlHash));
     }

     /**
@@ -490,16 +486,11 @@ public final class Fulltext {
      * @param urlHash
      * @return the time in milliseconds since epoch for the load time or -1 if the document does not exist
      */
-    public long getLoadTime(final String urlHash) {
+    public long getLoadTime(final String urlHash) throws IOException {
         if (urlHash == null) return -1l;
-        try {
-            SolrConnector.LoadTimeURL md = this.getDefaultConnector().getLoadTimeURL(urlHash);
-            if (md == null) return -1l;
-            return md.date;
-        } catch (final Throwable e) {
-            ConcurrentLog.logException(e);
-        }
-        return -1l;
+        SolrConnector.LoadTimeURL md = this.getDefaultConnector().getLoadTimeURL(urlHash);
+        if (md == null) return -1l;
+        return md.date;
     }

     public List<File> dumpFiles() {

View File

@@ -356,7 +356,7 @@ public class Segment {
      * @param urlHash
      * @return the time in milliseconds since epoch for the load time or -1 if the document does not exist
      */
-    public long getLoadTime(final String urlhash) {
+    public long getLoadTime(final String urlhash) throws IOException {
         return this.fulltext.getLoadTime(urlhash);
     }

@@ -683,10 +683,10 @@ public class Segment {
         if (urlhash == null) return 0;

         // determine the url string
-        final DigestURL url = fulltext().getURL(ASCII.String(urlhash));
-        if (url == null) return 0;
         try {
+            final DigestURL url = fulltext().getURL(ASCII.String(urlhash));
+            if (url == null) return 0;
             // parse the resource
             final Document document = Document.mergeDocuments(url, null, loader.loadDocuments(loader.request(url, true, false), cacheStrategy, Integer.MAX_VALUE, null, agent));
             if (document == null) {