better error handling for remote solr queries and exists-checks

orbiter 2014-08-01 11:00:10 +02:00
parent b510b182d8
commit 22ce4fb4dd
24 changed files with 292 additions and 185 deletions
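A note on the pattern: every file below follows the same scheme. The Solr-backed exists-checks (urlExists, getURL, getLoadTime, getLoadTimeURL) now declare throws IOException instead of swallowing a remote failure and returning a default value, and each caller decides explicitly how to degrade. A minimal caller-side sketch of that convention, with hypothetical Index/isLoaded names standing in for the YaCy classes:

    import java.io.IOException;

    public class ExistsCheckExample {

        // Stand-in for the exists-check methods touched by this commit
        // (Fulltext.getLoadTime, Segment.getLoadTime, Switchboard.getURL):
        // they now declare "throws IOException" instead of returning a
        // default value when the remote Solr cannot be reached.
        interface Index {
            long getLoadTime(String urlHash) throws IOException; // -1 == not stored
        }

        // Caller-side pattern repeated across the servlets in this commit:
        // catch the IOException, log it, and fall back to a safe default
        // instead of silently treating an outage as "document does not exist".
        static boolean isLoaded(Index index, String urlHash) {
            try {
                return index.getLoadTime(urlHash) >= 0;
            } catch (IOException e) {
                e.printStackTrace(); // the servlets use ConcurrentLog.logException(e)
                return false;        // fail open: treat the document as unknown
            }
        }
    }

The servlets (HostBrowser, Load_RSS_p, webstructure, ...) mostly fail open like this; the crawler paths further down fail closed instead, rejecting a URL when the check itself fails.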

View File

@@ -138,8 +138,13 @@ public class HostBrowser {
String load = post.get("load", "");
boolean wait = false;
if (loadRight && autoload && path.length() != 0 && pathURI != null && load.length() == 0 && sb.index.getLoadTime(ASCII.String(pathURI.hash())) < 0) {
// in case that the url does not exist and loading is wanted turn this request into a loading request
try {
if (loadRight && autoload && path.length() != 0 && pathURI != null && load.length() == 0 && sb.index.getLoadTime(ASCII.String(pathURI.hash())) < 0) {
// in case that the url does not exist and loading is wanted turn this request into a loading request
load = path;
wait = true;
}
} catch (IOException e1) {
load = path;
wait = true;
}
@@ -156,8 +161,13 @@ public class HostBrowser {
0, 0, 0
));
prop.putHTML("result", reasonString == null ? ("added url to indexer: " + load) : ("not indexed url '" + load + "': " + reasonString));
if (wait) for (int i = 0; i < 30; i++) {
if (sb.index.getLoadTime(ASCII.String(url.hash())) >= 0) break;
if (wait) waitloop: for (int i = 0; i < 30; i++) {
try {
if (sb.index.getLoadTime(ASCII.String(url.hash())) >= 0) break;
} catch (IOException e1) {
e1.printStackTrace();
break waitloop;
}
try {Thread.sleep(100);} catch (final InterruptedException e) {}
}
} catch (final MalformedURLException e) {

View File

@@ -369,8 +369,7 @@ public class IndexControlRWIs_p {
Word.commonHashOrder,
urlb.size());
if ( post.containsKey("blacklisturls") ) {
final String[] supportedBlacklistTypes =
env.getConfig("BlackLists.types", "").split(",");
final String[] supportedBlacklistTypes = env.getConfig("BlackLists.types", "").split(",");
DigestURL url;
for ( final byte[] b : urlb ) {
try {
@@ -378,28 +377,32 @@ public class IndexControlRWIs_p {
} catch (final SpaceExceededException e ) {
ConcurrentLog.logException(e);
}
url = segment.fulltext().getURL(ASCII.String(b));
segment.fulltext().remove(b);
if ( url != null ) {
for ( final String supportedBlacklistType : supportedBlacklistTypes ) {
if ( ListManager.listSetContains(
supportedBlacklistType + ".BlackLists",
blacklist) ) {
try {
Switchboard.urlBlacklist.add(
BlacklistType.valueOf(supportedBlacklistType),
blacklist,
url.getHost(),
url.getFile());
} catch (PunycodeException e) {
ConcurrentLog.warn(APP_NAME,
"Unable to add blacklist entry to blacklist "
+ supportedBlacklistType, e);
try {
url = segment.fulltext().getURL(ASCII.String(b));
segment.fulltext().remove(b);
if ( url != null ) {
for ( final String supportedBlacklistType : supportedBlacklistTypes ) {
if ( ListManager.listSetContains(
supportedBlacklistType + ".BlackLists",
blacklist) ) {
try {
Switchboard.urlBlacklist.add(
BlacklistType.valueOf(supportedBlacklistType),
blacklist,
url.getHost(),
url.getFile());
} catch (PunycodeException e) {
ConcurrentLog.warn(APP_NAME,
"Unable to add blacklist entry to blacklist "
+ supportedBlacklistType, e);
}
}
}
}
SearchEventCache.cleanupEvents(true);
}
}
SearchEventCache.cleanupEvents(true);
}
} catch (IOException e1) {
ConcurrentLog.logException(e1);
}
}
}
@@ -411,27 +414,29 @@ public class IndexControlRWIs_p {
} catch (final SpaceExceededException e ) {
ConcurrentLog.logException(e);
}
url = segment.fulltext().getURL(ASCII.String(b));
segment.fulltext().remove(b);
if ( url != null ) {
for ( final BlacklistType supportedBlacklistType : BlacklistType.values() ) {
if ( ListManager.listSetContains(
supportedBlacklistType + ".BlackLists",
blacklist) ) {
try {
Switchboard.urlBlacklist.add(
supportedBlacklistType,
blacklist,
url.getHost(),
".*");
} catch (PunycodeException e) {
ConcurrentLog.warn(APP_NAME,
"Unable to add blacklist entry to blacklist "
+ supportedBlacklistType, e);
try {
url = segment.fulltext().getURL(ASCII.String(b));
segment.fulltext().remove(b);
if ( url != null ) {
for ( final BlacklistType supportedBlacklistType : BlacklistType.values() ) {
if ( ListManager.listSetContains(supportedBlacklistType + ".BlackLists", blacklist) ) {
try {
Switchboard.urlBlacklist.add(
supportedBlacklistType,
blacklist,
url.getHost(),
".*");
} catch (PunycodeException e) {
ConcurrentLog.warn(APP_NAME,
"Unable to add blacklist entry to blacklist "
+ supportedBlacklistType, e);
}
}
}
}
}
}
}
} catch (IOException e1) {
ConcurrentLog.logException(e1);
}
}
}
try {

View File

@@ -183,14 +183,19 @@ public class IndexControlURLs_p {
}
if (post.containsKey("urlhashdelete")) {
final DigestURL url = segment.fulltext().getURL(urlhash);
if (url == null) {
prop.putHTML("result", "No Entry for URL hash " + urlhash + "; nothing deleted.");
} else {
urlstring = url.toNormalform(true);
prop.put("urlstring", "");
sb.urlRemove(segment, urlhash.getBytes());
prop.putHTML("result", "Removed URL " + urlstring);
DigestURL url;
try {
url = segment.fulltext().getURL(urlhash);
if (url == null) {
prop.putHTML("result", "No Entry for URL hash " + urlhash + "; nothing deleted.");
} else {
urlstring = url.toNormalform(true);
prop.put("urlstring", "");
sb.urlRemove(segment, urlhash.getBytes());
prop.putHTML("result", "Removed URL " + urlstring);
}
} catch (IOException e) {
prop.putHTML("result", "Error when querying the url hash " + urlhash + ":" + e.getMessage());
}
}

View File

@@ -352,17 +352,23 @@ public class Load_RSS_p {
author = item.getAuthor();
if (author == null) author = item.getCopyright();
pubDate = item.getPubDate();
HarvestProcess harvestProcess = sb.urlExists(ASCII.String(messageurl.hash()));
prop.put("showitems_item_" + i + "_state", harvestProcess != null ? 2 : RSSLoader.indexTriggered.containsKey(messageurl.hash()) ? 1 : 0);
prop.put("showitems_item_" + i + "_state_count", i);
prop.putHTML("showitems_item_" + i + "_state_guid", item.getGuid());
prop.putHTML("showitems_item_" + i + "_author", author == null ? "" : author);
prop.putHTML("showitems_item_" + i + "_title", item.getTitle());
prop.putHTML("showitems_item_" + i + "_link", messageurl.toNormalform(true));
prop.putHTML("showitems_item_" + i + "_description", item.getDescriptions().toString());
prop.putHTML("showitems_item_" + i + "_language", item.getLanguage());
prop.putHTML("showitems_item_" + i + "_date", (pubDate == null) ? "" : DateFormat.getDateTimeInstance().format(pubDate));
i++;
HarvestProcess harvestProcess;
try {
harvestProcess = sb.urlExists(ASCII.String(messageurl.hash()));
prop.put("showitems_item_" + i + "_state", harvestProcess != null ? 2 : RSSLoader.indexTriggered.containsKey(messageurl.hash()) ? 1 : 0);
prop.put("showitems_item_" + i + "_state_count", i);
prop.putHTML("showitems_item_" + i + "_state_guid", item.getGuid());
prop.putHTML("showitems_item_" + i + "_author", author == null ? "" : author);
prop.putHTML("showitems_item_" + i + "_title", item.getTitle());
prop.putHTML("showitems_item_" + i + "_link", messageurl.toNormalform(true));
prop.putHTML("showitems_item_" + i + "_description", item.getDescriptions().toString());
prop.putHTML("showitems_item_" + i + "_language", item.getLanguage());
prop.putHTML("showitems_item_" + i + "_date", (pubDate == null) ? "" : DateFormat.getDateTimeInstance().format(pubDate));
i++;
} catch (IOException e) {
ConcurrentLog.logException(e);
continue;
}
} catch (final MalformedURLException e) {
ConcurrentLog.logException(e);
continue;

View File

@@ -35,6 +35,7 @@ import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.sorting.OrderedScoreMap;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.SentenceReader;
import net.yacy.search.Switchboard;
import net.yacy.search.index.Segment;
@@ -86,10 +87,14 @@ public class citation {
} catch (final MalformedURLException e) {}
}
if (uri == null && hash.length() > 0) {
uri = sb.getURL(ASCII.getBytes(hash));
if (uri == null) {
connector.commit(true); // try again, that url can be fresh
try {
uri = sb.getURL(ASCII.getBytes(hash));
if (uri == null) {
connector.commit(true); // try again, that url can be fresh
uri = sb.getURL(ASCII.getBytes(hash));
}
} catch (IOException e) {
ConcurrentLog.logException(e);
}
}
if (uri == null) return prop; // no proper url addressed

View File

@@ -17,6 +17,7 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import java.io.IOException;
import java.net.MalformedURLException;
import net.yacy.cora.document.encoding.ASCII;
@@ -25,6 +26,7 @@ import net.yacy.cora.order.Base64Order;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.search.Switchboard;
import net.yacy.search.index.Fulltext;
import net.yacy.search.schema.HyperlinkEdge;
@@ -59,7 +61,11 @@ public class linkstructure {
String hostname = null;
if (about.length() == 12 && Base64Order.enhancedCoder.wellformed(ASCII.getBytes(about))) {
byte[] urlhash = ASCII.getBytes(about);
url = authenticated ? sb.getURL(urlhash) : null;
try {
url = authenticated ? sb.getURL(urlhash) : null;
} catch (IOException e) {
ConcurrentLog.logException(e);
}
} else if (url == null && about.length() > 0) { // consider "about" as url or hostname
url = new DigestURL(about.indexOf("://") >= 0 ? about : "http://" + about); // accept also domains
hostname = url.getHost();

View File

@@ -65,7 +65,12 @@ public class webstructure {
} else if (about.length() == 12 && Base64Order.enhancedCoder.wellformed(ASCII.getBytes(about))) {
urlhash = ASCII.getBytes(about);
hosthash = about.substring(6);
url = authenticated ? sb.getURL(urlhash) : null;
try {
url = authenticated ? sb.getURL(urlhash) : null;
} catch (IOException e) {
url = null;
ConcurrentLog.logException(e);
}
} else if (about.length() > 0) {
// consider "about" as url or hostname
try {
@@ -156,12 +161,17 @@ public class webstructure {
Iterator<byte[]> i = ids.iterator();
while (i.hasNext()) {
byte[] refhash = i.next();
DigestURL refurl = authenticated ? sb.getURL(refhash) : null;
prop.put("citations_documents_0_anchors_" + d + "_urle", refurl == null ? 0 : 1);
if (refurl != null) prop.putXML("citations_documents_0_anchors_" + d + "_urle_url", refurl.toNormalform(true));
prop.put("citations_documents_0_anchors_" + d + "_urle_hash", refhash);
prop.put("citations_documents_0_anchors_" + d + "_urle_date", GenericFormatter.SHORT_DAY_FORMATTER.format(new Date())); // superfluous?
d++;
DigestURL refurl;
try {
refurl = authenticated ? sb.getURL(refhash) : null;
prop.put("citations_documents_0_anchors_" + d + "_urle", refurl == null ? 0 : 1);
if (refurl != null) prop.putXML("citations_documents_0_anchors_" + d + "_urle_url", refurl.toNormalform(true));
prop.put("citations_documents_0_anchors_" + d + "_urle_hash", refhash);
prop.put("citations_documents_0_anchors_" + d + "_urle_date", GenericFormatter.SHORT_DAY_FORMATTER.format(new Date())); // superfluous?
d++;
} catch (IOException e) {
ConcurrentLog.logException(e);
}
}
prop.put("citations_documents_0_count", d);
prop.put("citations_documents_0_anchors", d);

View File

@@ -34,18 +34,16 @@ public class add_ymark {
if (post.containsKey("urlHash")) {
final String urlHash = post.get("urlHash",YMarkUtil.EMPTY_STRING);
final DigestURL url = sb.index.fulltext().getURL(urlHash);
final String folders = post.get(YMarkEntry.BOOKMARK.FOLDERS.key(),YMarkEntry.BOOKMARK.FOLDERS.deflt());
final String tags = post.get(YMarkEntry.BOOKMARK.TAGS.key(),YMarkUtil.EMPTY_STRING);
try {
final DigestURL url = sb.index.fulltext().getURL(urlHash);
final String folders = post.get(YMarkEntry.BOOKMARK.FOLDERS.key(),YMarkEntry.BOOKMARK.FOLDERS.deflt());
final String tags = post.get(YMarkEntry.BOOKMARK.TAGS.key(),YMarkUtil.EMPTY_STRING);
ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
sb.tables.bookmarks.createBookmark(sb.loader, url, agent, bmk_user, true, tags, folders);
prop.put("status", "1");
} catch (final IOException e) {
// TODO Auto-generated catch block
ConcurrentLog.logException(e);
} catch (final Failure e) {
// TODO Auto-generated catch block
ConcurrentLog.logException(e);
}

View File

@@ -27,6 +27,7 @@
// javac -classpath .:../classes transferRWI.java
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
@@ -234,9 +235,14 @@ public final class transferRWI {
}
for (String id: testids) {
try {
if (sb.index.fulltext().getLoadTime(id) >= 0) {
knownURL.put(ASCII.getBytes(id));
} else {
try {
if (sb.index.fulltext().getLoadTime(id) >= 0) {
knownURL.put(ASCII.getBytes(id));
} else {
unknownURL.put(ASCII.getBytes(id));
}
} catch (IOException e) {
ConcurrentLog.logException(e);
unknownURL.put(ASCII.getBytes(id));
}
} catch (final SpaceExceededException e) {

View File

@@ -144,7 +144,14 @@ public final class transferURL {
doublecheck = 0;
for (String id : lEm.keySet()) {
if (sb.index.getLoadTime(id) < 0) {
long lt = -1;
try {
lt = sb.index.getLoadTime(id);
} catch (IOException e1) {
lt = -1;
ConcurrentLog.logException(e1);
}
if (lt < 0) {
lEntry = lEm.get(id);
// write entry to database

View File

@@ -29,6 +29,7 @@ import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.data.NoticedURL;
import net.yacy.crawler.retrieval.Request;
import net.yacy.kelondro.data.meta.URIMetadataNode;
@@ -74,7 +75,12 @@ public class urls {
if (entry == null) break;
// find referrer, if there is one
referrer = sb.getURL(entry.referrerhash());
try {
referrer = sb.getURL(entry.referrerhash());
} catch (IOException e) {
referrer = null;
ConcurrentLog.logException(e);
}
// place url to notice-url db
sb.crawlQueues.delegatedURL.put(ASCII.String(entry.url().hash()), entry.url());
@@ -106,16 +112,20 @@
entry = sb.index.fulltext().getMetadata(ASCII.getBytes(urlhashes.substring(12 * i, 12 * (i + 1))));
if (entry == null) continue;
// find referrer, if there is one
referrer = sb.getURL(entry.referrerHash());
// create RSS entry
prop.put("item_" + c + "_title", entry.dc_title());
prop.putXML("item_" + c + "_link", entry.url().toNormalform(true));
prop.putXML("item_" + c + "_referrer", (referrer == null) ? "" : referrer.toNormalform(true));
prop.putXML("item_" + c + "_description", entry.dc_title());
prop.put("item_" + c + "_author", entry.dc_creator());
prop.put("item_" + c + "_pubDate", GenericFormatter.SHORT_SECOND_FORMATTER.format(entry.moddate()));
prop.put("item_" + c + "_guid", ASCII.String(entry.hash()));
c++;
try {
referrer = sb.getURL(entry.referrerHash());
// create RSS entry
prop.put("item_" + c + "_title", entry.dc_title());
prop.putXML("item_" + c + "_link", entry.url().toNormalform(true));
prop.putXML("item_" + c + "_referrer", (referrer == null) ? "" : referrer.toNormalform(true));
prop.putXML("item_" + c + "_description", entry.dc_title());
prop.put("item_" + c + "_author", entry.dc_creator());
prop.put("item_" + c + "_pubDate", GenericFormatter.SHORT_SECOND_FORMATTER.format(entry.moddate()));
prop.put("item_" + c + "_guid", ASCII.String(entry.hash()));
c++;
} catch (IOException e) {
ConcurrentLog.logException(e);
}
}
prop.put("item", c);
prop.putXML("response", "ok");

View File

@@ -588,19 +588,23 @@ public class yacysearch {
return prop;
}
final String bookmarkHash = post.get("bookmarkref", ""); // urlhash
final DigestURL url = indexSegment.fulltext().getURL(bookmarkHash);
if ( url != null ) {
try {
sb.tables.bookmarks.createBookmark(
sb.loader,
url,
ClientIdentification.yacyInternetCrawlerAgent,
YMarkTables.USER_ADMIN,
true,
"searchresult",
"/search");
} catch (final Throwable e ) {
try {
final DigestURL url = indexSegment.fulltext().getURL(bookmarkHash);
if ( url != null ) {
try {
sb.tables.bookmarks.createBookmark(
sb.loader,
url,
ClientIdentification.yacyInternetCrawlerAgent,
YMarkTables.USER_ADMIN,
true,
"searchresult",
"/search");
} catch (final Throwable e ) {
}
}
} catch (IOException e) {
ConcurrentLog.logException(e);
}
}

View File

@@ -313,9 +313,6 @@ public abstract class AbstractSolrConnector implements SolrConnector {
//params.setQuery(CollectionSchema.id.getSolrFieldName() + ":\"" + id + "\"");
String q = "{!raw f=" + CollectionSchema.id.getSolrFieldName() + "}" + id;
params.setQuery(q);
//params.setQuery("*:*");
//params.addFilterQuery(q);
//params.set("defType", "raw");
params.setRows(1);
params.setStart(0);
params.setFacet(false);

View File

@@ -405,7 +405,7 @@ public class EmbeddedSolrConnector extends SolrServerConnector implements SolrConnector {
* @throws IOException
*/
@Override
public LoadTimeURL getLoadTimeURL(String id) {
public LoadTimeURL getLoadTimeURL(String id) throws IOException {
int responseCount = 0;
DocListSearcher docListSearcher = null;
try {
@@ -421,10 +421,10 @@ public class EmbeddedSolrConnector extends SolrServerConnector implements SolrConnector {
//}
} catch (Throwable e) {
ConcurrentLog.logException(e);
throw new IOException(e.getMessage());
} finally {
if (docListSearcher != null) docListSearcher.close();
}
return null;
}
@Override

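Previously this method logged any failure and returned null, so a broken searcher was indistinguishable from a missing document; now it rethrows as a checked IOException, and a null return reliably means "not found". A sketch of that wrap-and-rethrow shape under hypothetical names (the commit itself keeps only e.getMessage(); chaining the cause, as below, is an optional refinement):

    import java.io.IOException;

    public class RethrowExample {

        // Wrap-and-rethrow: convert any internal failure into a checked
        // IOException so that callers can tell "lookup failed" apart
        // from "document absent".
        static String lookup(String id) throws IOException {
            try {
                return queryBackend(id); // hypothetical backend call
            } catch (Throwable e) {
                throw new IOException(e.getMessage(), e); // keep the cause chained
            }
        }

        // Placeholder standing in for the embedded Solr searcher.
        private static String queryBackend(String id) {
            return null; // "not found"
        }
    }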
View File

@@ -154,7 +154,7 @@ public interface SolrConnector extends Iterable<String> /* Iterable of document
* @param query
* @throws IOException
*/
public QueryResponse getResponseByParams(final ModifiableSolrParams query) throws IOException, SolrException;
public QueryResponse getResponseByParams(final ModifiableSolrParams query) throws IOException;
/**
* get the solr document list from a query response
@@ -165,7 +165,7 @@ public interface SolrConnector extends Iterable<String> /* Iterable of document
* @throws IOException
* @throws SolrException
*/
public SolrDocumentList getDocumentListByParams(ModifiableSolrParams params) throws IOException, SolrException;
public SolrDocumentList getDocumentListByParams(ModifiableSolrParams params) throws IOException;
/**
* get the number of results for a query response
@@ -174,7 +174,7 @@ public interface SolrConnector extends Iterable<String> /* Iterable of document
* @throws IOException
* @throws SolrException
*/
public long getDocumentCountByParams(ModifiableSolrParams params) throws IOException, SolrException;
public long getDocumentCountByParams(ModifiableSolrParams params) throws IOException;
/**
* get a query result from solr
@@ -191,7 +191,7 @@ public interface SolrConnector extends Iterable<String> /* Iterable of document
final String sort,
final int offset,
final int count,
final String ... fields) throws IOException, SolrException;
final String ... fields) throws IOException;
/**
* get the number of results when this query is done.

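org.apache.solr.common.SolrException extends RuntimeException, so dropping it from these throws clauses changes no behavior; it merely narrows the documented contract to the one checked failure, IOException. A hypothetical caller of the narrowed interface, using only methods confirmed by this diff:

    import java.io.IOException;

    import org.apache.solr.common.params.ModifiableSolrParams;

    import net.yacy.cora.federate.solr.connector.SolrConnector;

    public class CountExample {

        // Only the checked IOException must be caught or declared here;
        // SolrException is unchecked and may still surface at runtime.
        static long countHits(SolrConnector connector, String query) throws IOException {
            ModifiableSolrParams params = new ModifiableSolrParams();
            params.set("q", query);
            params.set("rows", 0); // we want the hit count, not the documents
            return connector.getDocumentCountByParams(params);
        }
    }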
View File

@@ -33,7 +33,6 @@ import net.yacy.search.schema.CollectionSchema;
import org.apache.lucene.analysis.NumericTokenStream;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.util.NamedList;
@@ -289,7 +288,7 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implements SolrConnector {
* @throws SolrException
*/
@Override
public SolrDocumentList getDocumentListByParams(ModifiableSolrParams params) throws IOException, SolrException {
public SolrDocumentList getDocumentListByParams(ModifiableSolrParams params) throws IOException {
if (this.server == null) throw new IOException("server disconnected");
// during the solr query we set the thread name to the query string to get more debugging info in thread dumps
String q = params.get("q");
@@ -297,18 +296,25 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implements SolrConnector {
String threadname = Thread.currentThread().getName();
if (q != null) Thread.currentThread().setName("solr query: q = " + q + (fq == null ? "" : ", fq = " + fq));
QueryResponse rsp;
try {
rsp = this.server.query(params);
if (q != null) Thread.currentThread().setName(threadname);
if (rsp != null) if (log.isFine()) log.fine(rsp.getResults().getNumFound() + " results for q=" + q);
return rsp.getResults();
} catch (final SolrServerException e) {
clearCaches(); // prevent further OOM if this was caused by OOM
throw new SolrException(ErrorCode.UNKNOWN, e);
} catch (final Throwable e) {
clearCaches(); // prevent further OOM if this was caused by OOM
throw new IOException("Error executing query", e);
int retry = 10;
Throwable error = null;
while (retry-- > 0) {
try {
rsp = this.server.query(params);
if (q != null) Thread.currentThread().setName(threadname);
if (rsp != null) if (log.isFine()) log.fine(rsp.getResults().getNumFound() + " results for q=" + q);
return rsp.getResults();
} catch (final SolrServerException e) {
error = e;
clearCaches(); // prevent further OOM if this was caused by OOM
} catch (final Throwable e) {
error = e;
clearCaches(); // prevent further OOM if this was caused by OOM
}
ConcurrentLog.severe("SolrServerConnector", "Failed to query remote Solr: " + error.getMessage() + ", query:" + q + (fq == null ? "" : ", fq = " + fq));
try {Thread.sleep(1000);} catch (InterruptedException e) {}
}
throw new IOException("Error executing query", error);
}
// luke requests: these do not work for attached SolrCloud Server

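The rewritten getDocumentListByParams no longer gives up on the first SolrServerException: it retries up to ten times, clearing the caches and sleeping one second between attempts, and only converts the last error into an IOException once the budget is exhausted. The same loop in generic form, as a sketch:

    import java.io.IOException;
    import java.util.concurrent.Callable;

    public class RetryExample {

        // Fixed-budget retry with a constant delay, mirroring the loop above:
        // remember the most recent failure and rethrow it as a checked
        // IOException when all attempts are used up.
        static <T> T retry(Callable<T> action, int attempts, long delayMillis) throws IOException {
            Throwable error = null;
            while (attempts-- > 0) {
                try {
                    return action.call();
                } catch (Throwable e) {
                    error = e; // keep the latest cause for the final report
                }
                try { Thread.sleep(delayMillis); } catch (InterruptedException e) {}
            }
            throw new IOException("giving up after repeated failures", error);
        }
    }

The commit's version uses attempts = 10 and delayMillis = 1000, and logs each failed attempt via ConcurrentLog.severe before sleeping.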
View File

@@ -388,7 +388,10 @@ public final class CrawlStacker {
try {
oldEntry = this.indexSegment.fulltext().getDefaultConnector().getLoadTimeURL(urlhash);
} catch (IOException e) {
// if an exception here occurs then there is the danger that urls which had been in the crawler are overwritten a second time
// to prevent that, we reject urls in these events
ConcurrentLog.logException(e);
return "exception during double-test: " + e.getMessage();
}
final Long oldDate = oldEntry == null ? null : oldEntry.date;
if (oldDate == null) {

View File

@@ -108,11 +108,16 @@ public class RSSLoader extends Thread {
}
}
for (final Map.Entry<String, DigestURL> e: urlmap.entrySet()) {
HarvestProcess harvestProcess = sb.urlExists(e.getKey());
if (harvestProcess != null) continue;
list.add(e.getValue());
indexTriggered.insertIfAbsent(ASCII.getBytes(e.getKey()), new Date());
loadCount++;
HarvestProcess harvestProcess;
try {
harvestProcess = sb.urlExists(e.getKey());
if (harvestProcess != null) continue;
list.add(e.getValue());
indexTriggered.insertIfAbsent(ASCII.getBytes(e.getKey()), new Date());
loadCount++;
} catch (IOException e1) {
ConcurrentLog.logException(e1);
}
}
sb.addToIndex(list, null, null, collections, true);
// update info for loading

View File

@@ -25,6 +25,7 @@
package net.yacy.crawler.retrieval;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.Date;
@@ -82,15 +83,20 @@ public class SitemapImporter extends Thread {
// check if the url is known and needs to be recrawled
Date lastMod = entry.lastmod(null);
if (lastMod != null) {
final HarvestProcess dbocc = this.sb.urlExists(ASCII.String(nexturlhash));
if (dbocc != null && dbocc == HarvestProcess.LOADED) {
// the url was already loaded. we need to check the date
final URIMetadataNode oldEntry = this.sb.index.fulltext().getMetadata(nexturlhash);
if (oldEntry != null) {
final Date modDate = oldEntry.moddate();
// check if modDate is null
if (modDate.after(lastMod)) return;
HarvestProcess dbocc;
try {
dbocc = this.sb.urlExists(ASCII.String(nexturlhash));
if (dbocc != null && dbocc == HarvestProcess.LOADED) {
// the url was already loaded. we need to check the date
final URIMetadataNode oldEntry = this.sb.index.fulltext().getMetadata(nexturlhash);
if (oldEntry != null) {
final Date modDate = oldEntry.moddate();
// check if modDate is null
if (modDate.after(lastMod)) return;
}
}
} catch (IOException e) {
ConcurrentLog.logException(e);
}
}

View File

@@ -35,6 +35,7 @@ import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.retrieval.Response;
import net.yacy.document.Document;
import net.yacy.document.Parser.Failure;
@@ -82,7 +83,12 @@ public class YMarkMetadata {
public YMarkMetadata(final byte[] urlHash, final Segment indexSegment) {
this.document = null;
this.indexSegment = indexSegment;
this.uri = this.indexSegment.fulltext().getURL(ASCII.String(urlHash));
try {
this.uri = this.indexSegment.fulltext().getURL(ASCII.String(urlHash));
} catch (IOException e) {
this.uri = null;
ConcurrentLog.logException(e);
}
}
public YMarkMetadata(final Document document) {

View File

@@ -24,6 +24,7 @@
package net.yacy.peers;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
@@ -174,9 +175,15 @@ public class Transmission {
i = c.entries();
while (i.hasNext()) {
final WordReference e = i.next();
if (Transmission.this.segment.fulltext().getLoadTime(ASCII.String(e.urlhash())) >= 0) {
this.references.put(e.urlhash());
} else {
try {
if (Transmission.this.segment.fulltext().getLoadTime(ASCII.String(e.urlhash())) >= 0) {
this.references.put(e.urlhash());
} else {
notFoundx.add(e.urlhash());
this.badReferences.put(e.urlhash());
}
} catch (IOException e1) {
ConcurrentLog.logException(e1);
notFoundx.add(e.urlhash());
this.badReferences.put(e.urlhash());
}

View File

@@ -1620,18 +1620,12 @@ public final class Switchboard extends serverSwitch {
* @param hash
* @return if it exists, the name of the database is returned, if it not exists, null is returned
*/
public HarvestProcess urlExists(final String hash) {
if (this.index.getLoadTime(hash) >= 0) return HarvestProcess.LOADED;
public HarvestProcess urlExists(final String hash) throws IOException {
LoadTimeURL md = this.index.fulltext().getDefaultConnector().getLoadTimeURL(hash);
if (md != null && md.date >= 0) return HarvestProcess.LOADED;
HarvestProcess hp = this.crawlQueues.exists(ASCII.getBytes(hash));
if (hp != null) return hp;
try {
LoadTimeURL md = this.index.fulltext().getDefaultConnector().getLoadTimeURL(hash);
if (md == null) return null;
return HarvestProcess.LOADED; // todo: can also be in error
} catch (IOException e) {
ConcurrentLog.logException(e);
return null;
}
return null; // todo: can also be in error
}
public void urlRemove(final Segment segment, final byte[] hash) {
@@ -1640,7 +1634,7 @@ public final class Switchboard extends serverSwitch {
this.crawlQueues.removeURL(hash);
}
public DigestURL getURL(final byte[] urlhash) {
public DigestURL getURL(final byte[] urlhash) throws IOException {
if (urlhash == null) return null;
if (urlhash.length == 0) return null;
final DigestURL url = this.index.fulltext().getURL(ASCII.String(urlhash));
@@ -2977,7 +2971,15 @@
// stacking may fail because of double occurrences of that url. Therefore
// we must wait here until the url has actually disappeared
int t = 100;
while (t-- > 0 && this.index.getLoadTime(ASCII.String(urlhash)) >= 0) {
while (t-- > 0) {
try {
long lt = this.index.getLoadTime(ASCII.String(urlhash));
if (lt < 0) break;
} catch (IOException e) {
// if this fails, the url may still exist
// we should abandon the whole process
return "exist-test failed: " + e.getMessage();
}
try {Thread.sleep(100);} catch (final InterruptedException e) {}
ConcurrentLog.fine("Switchboard", "STACKURL: waiting for deletion, t=" + t);
//if (t == 20) this.index.fulltext().commit(true);
@@ -3094,9 +3096,17 @@
final List<Request> requests = new ArrayList<Request>();
for (Map.Entry<String, DigestURL> e: urlmap.entrySet()) {
final String urlName = e.getValue().toNormalform(true);
if (doublecheck && this.index.getLoadTime(e.getKey()) >= 0) {
this.log.info("addToIndex: double " + urlName);
continue;
if (doublecheck) {
try {
if (this.index.getLoadTime(e.getKey()) >= 0) {
this.log.info("addToIndex: double " + urlName);
continue;
}
} catch (IOException ee) {
// double check fail may mean that the url exist
this.log.info("addToIndex: doublecheck failed for " + urlName + ": " + ee.getMessage());
continue;
}
}
final Request request = this.loader.request(e.getValue(), true, true);
final CrawlProfile profile = this.crawler.get(ASCII.getBytes(request.profileHandle()));
@@ -3168,7 +3178,11 @@
Map<String, DigestURL> urlmap = new HashMap<String, DigestURL>();
for (DigestURL url: urls) urlmap.put(ASCII.String(url.hash()), url);
for (Map.Entry<String, DigestURL> e: urlmap.entrySet()) {
if (this.index.getLoadTime(e.getKey()) >= 0) continue; // double
try {
if (this.index.getLoadTime(e.getKey()) >= 0) continue; // double
} catch (IOException ee) {
continue; // if the check fails, consider the url as double
}
DigestURL url = e.getValue();
final Request request = this.loader.request(url, true, true);
final CrawlProfile profile = this.crawler.get(ASCII.getBytes(request.profileHandle()));

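urlExists loses its internal try/catch and asks the Solr connector directly, so an unreachable index now surfaces to the caller instead of masquerading as "URL unknown". Callers must distinguish three outcomes, sketched here with a local stand-in for the HarvestProcess enum (the real enum lives in YaCy's crawler package and has more states than the LOADED value visible in this diff):

    import java.io.IOException;

    public class UrlExistsExample {

        enum HarvestProcess { LOADED } // local stand-in, not the full enum

        interface Switchboard {
            HarvestProcess urlExists(String hash) throws IOException;
        }

        // Three outcomes instead of two: known, unknown, or "could not ask".
        static String describe(Switchboard sb, String hash) {
            try {
                HarvestProcess hp = sb.urlExists(hash);
                return hp == null ? "unknown" : "known: " + hp;
            } catch (IOException e) {
                return "exists-check failed: " + e.getMessage();
            }
        }
    }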
View File

@@ -473,16 +473,12 @@ public final class Fulltext {
return false;
}
public DigestURL getURL(final String urlHash) {
public DigestURL getURL(final String urlHash) throws IOException {
if (urlHash == null || this.getDefaultConnector() == null) return null;
try {
SolrConnector.LoadTimeURL md = this.getDefaultConnector().getLoadTimeURL(urlHash);
if (md == null) return null;
return new DigestURL(md.url, ASCII.getBytes(urlHash));
} catch (final IOException e) {
return null;
}
SolrConnector.LoadTimeURL md = this.getDefaultConnector().getLoadTimeURL(urlHash);
if (md == null) return null;
return new DigestURL(md.url, ASCII.getBytes(urlHash));
}
/**
@@ -490,16 +486,11 @@
* @param urlHash
* @return the time in milliseconds since epoch for the load time or -1 if the document does not exist
*/
public long getLoadTime(final String urlHash) {
public long getLoadTime(final String urlHash) throws IOException {
if (urlHash == null) return -1l;
try {
SolrConnector.LoadTimeURL md = this.getDefaultConnector().getLoadTimeURL(urlHash);
if (md == null) return -1l;
return md.date;
} catch (final Throwable e) {
ConcurrentLog.logException(e);
}
return -1l;
SolrConnector.LoadTimeURL md = this.getDefaultConnector().getLoadTimeURL(urlHash);
if (md == null) return -1l;
return md.date;
}
public List<File> dumpFiles() {

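This is the semantic core of the commit: getLoadTime previously mapped every failure to -1, the same value that means "document absent", so a Solr outage could make callers re-fetch and overwrite documents that were already stored. With the exception propagated, the crawler-side double-checks above fail closed instead, roughly:

    import java.io.IOException;

    public class DoubleCheckExample {

        interface Fulltext {
            long getLoadTime(String urlHash) throws IOException; // -1 == absent
        }

        // Fail-closed double-check, as in CrawlStacker and Switchboard above:
        // if the check itself fails, reject the URL rather than risk
        // overwriting a document that may already be stored.
        static String checkAcceptance(Fulltext fulltext, String urlHash) {
            try {
                if (fulltext.getLoadTime(urlHash) >= 0) return "double occurrence";
            } catch (IOException e) {
                return "exception during double-test: " + e.getMessage();
            }
            return null; // null means acceptable, following YaCy's convention
        }
    }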
View File

@@ -356,7 +356,7 @@ public class Segment {
* @param urlHash
* @return the time in milliseconds since epoch for the load time or -1 if the document does not exist
*/
public long getLoadTime(final String urlhash) {
public long getLoadTime(final String urlhash) throws IOException {
return this.fulltext.getLoadTime(urlhash);
}
@@ -683,10 +683,10 @@
if (urlhash == null) return 0;
// determine the url string
final DigestURL url = fulltext().getURL(ASCII.String(urlhash));
if (url == null) return 0;
try {
final DigestURL url = fulltext().getURL(ASCII.String(urlhash));
if (url == null) return 0;
// parse the resource
final Document document = Document.mergeDocuments(url, null, loader.loadDocuments(loader.request(url, true, false), cacheStrategy, Integer.MAX_VALUE, null, agent));
if (document == null) {