mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
better error handling for remote solr queries and exists-checks
This commit is contained in:
parent
b510b182d8
commit
22ce4fb4dd
|
@ -138,8 +138,13 @@ public class HostBrowser {
|
|||
|
||||
String load = post.get("load", "");
|
||||
boolean wait = false;
|
||||
if (loadRight && autoload && path.length() != 0 && pathURI != null && load.length() == 0 && sb.index.getLoadTime(ASCII.String(pathURI.hash())) < 0) {
|
||||
// in case that the url does not exist and loading is wanted turn this request into a loading request
|
||||
try {
|
||||
if (loadRight && autoload && path.length() != 0 && pathURI != null && load.length() == 0 && sb.index.getLoadTime(ASCII.String(pathURI.hash())) < 0) {
|
||||
// in case that the url does not exist and loading is wanted turn this request into a loading request
|
||||
load = path;
|
||||
wait = true;
|
||||
}
|
||||
} catch (IOException e1) {
|
||||
load = path;
|
||||
wait = true;
|
||||
}
|
||||
|
@ -156,8 +161,13 @@ public class HostBrowser {
|
|||
0, 0, 0
|
||||
));
|
||||
prop.putHTML("result", reasonString == null ? ("added url to indexer: " + load) : ("not indexed url '" + load + "': " + reasonString));
|
||||
if (wait) for (int i = 0; i < 30; i++) {
|
||||
if (sb.index.getLoadTime(ASCII.String(url.hash())) >= 0) break;
|
||||
if (wait) waitloop: for (int i = 0; i < 30; i++) {
|
||||
try {
|
||||
if (sb.index.getLoadTime(ASCII.String(url.hash())) >= 0) break;
|
||||
} catch (IOException e1) {
|
||||
e1.printStackTrace();
|
||||
break waitloop;
|
||||
}
|
||||
try {Thread.sleep(100);} catch (final InterruptedException e) {}
|
||||
}
|
||||
} catch (final MalformedURLException e) {
|
||||
|
|
|
@ -369,8 +369,7 @@ public class IndexControlRWIs_p {
|
|||
Word.commonHashOrder,
|
||||
urlb.size());
|
||||
if ( post.containsKey("blacklisturls") ) {
|
||||
final String[] supportedBlacklistTypes =
|
||||
env.getConfig("BlackLists.types", "").split(",");
|
||||
final String[] supportedBlacklistTypes = env.getConfig("BlackLists.types", "").split(",");
|
||||
DigestURL url;
|
||||
for ( final byte[] b : urlb ) {
|
||||
try {
|
||||
|
@ -378,28 +377,32 @@ public class IndexControlRWIs_p {
|
|||
} catch (final SpaceExceededException e ) {
|
||||
ConcurrentLog.logException(e);
|
||||
}
|
||||
url = segment.fulltext().getURL(ASCII.String(b));
|
||||
segment.fulltext().remove(b);
|
||||
if ( url != null ) {
|
||||
for ( final String supportedBlacklistType : supportedBlacklistTypes ) {
|
||||
if ( ListManager.listSetContains(
|
||||
supportedBlacklistType + ".BlackLists",
|
||||
blacklist) ) {
|
||||
try {
|
||||
Switchboard.urlBlacklist.add(
|
||||
BlacklistType.valueOf(supportedBlacklistType),
|
||||
blacklist,
|
||||
url.getHost(),
|
||||
url.getFile());
|
||||
} catch (PunycodeException e) {
|
||||
ConcurrentLog.warn(APP_NAME,
|
||||
"Unable to add blacklist entry to blacklist "
|
||||
+ supportedBlacklistType, e);
|
||||
try {
|
||||
url = segment.fulltext().getURL(ASCII.String(b));
|
||||
segment.fulltext().remove(b);
|
||||
if ( url != null ) {
|
||||
for ( final String supportedBlacklistType : supportedBlacklistTypes ) {
|
||||
if ( ListManager.listSetContains(
|
||||
supportedBlacklistType + ".BlackLists",
|
||||
blacklist) ) {
|
||||
try {
|
||||
Switchboard.urlBlacklist.add(
|
||||
BlacklistType.valueOf(supportedBlacklistType),
|
||||
blacklist,
|
||||
url.getHost(),
|
||||
url.getFile());
|
||||
} catch (PunycodeException e) {
|
||||
ConcurrentLog.warn(APP_NAME,
|
||||
"Unable to add blacklist entry to blacklist "
|
||||
+ supportedBlacklistType, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
SearchEventCache.cleanupEvents(true);
|
||||
}
|
||||
}
|
||||
SearchEventCache.cleanupEvents(true);
|
||||
}
|
||||
} catch (IOException e1) {
|
||||
ConcurrentLog.logException(e1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -411,27 +414,29 @@ public class IndexControlRWIs_p {
|
|||
} catch (final SpaceExceededException e ) {
|
||||
ConcurrentLog.logException(e);
|
||||
}
|
||||
url = segment.fulltext().getURL(ASCII.String(b));
|
||||
segment.fulltext().remove(b);
|
||||
if ( url != null ) {
|
||||
for ( final BlacklistType supportedBlacklistType : BlacklistType.values() ) {
|
||||
if ( ListManager.listSetContains(
|
||||
supportedBlacklistType + ".BlackLists",
|
||||
blacklist) ) {
|
||||
try {
|
||||
Switchboard.urlBlacklist.add(
|
||||
supportedBlacklistType,
|
||||
blacklist,
|
||||
url.getHost(),
|
||||
".*");
|
||||
} catch (PunycodeException e) {
|
||||
ConcurrentLog.warn(APP_NAME,
|
||||
"Unable to add blacklist entry to blacklist "
|
||||
+ supportedBlacklistType, e);
|
||||
try {
|
||||
url = segment.fulltext().getURL(ASCII.String(b));
|
||||
segment.fulltext().remove(b);
|
||||
if ( url != null ) {
|
||||
for ( final BlacklistType supportedBlacklistType : BlacklistType.values() ) {
|
||||
if ( ListManager.listSetContains(supportedBlacklistType + ".BlackLists", blacklist) ) {
|
||||
try {
|
||||
Switchboard.urlBlacklist.add(
|
||||
supportedBlacklistType,
|
||||
blacklist,
|
||||
url.getHost(),
|
||||
".*");
|
||||
} catch (PunycodeException e) {
|
||||
ConcurrentLog.warn(APP_NAME,
|
||||
"Unable to add blacklist entry to blacklist "
|
||||
+ supportedBlacklistType, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (IOException e1) {
|
||||
ConcurrentLog.logException(e1);
|
||||
}
|
||||
}
|
||||
}
|
||||
try {
|
||||
|
|
|
@ -183,14 +183,19 @@ public class IndexControlURLs_p {
|
|||
}
|
||||
|
||||
if (post.containsKey("urlhashdelete")) {
|
||||
final DigestURL url = segment.fulltext().getURL(urlhash);
|
||||
if (url == null) {
|
||||
prop.putHTML("result", "No Entry for URL hash " + urlhash + "; nothing deleted.");
|
||||
} else {
|
||||
urlstring = url.toNormalform(true);
|
||||
prop.put("urlstring", "");
|
||||
sb.urlRemove(segment, urlhash.getBytes());
|
||||
prop.putHTML("result", "Removed URL " + urlstring);
|
||||
DigestURL url;
|
||||
try {
|
||||
url = segment.fulltext().getURL(urlhash);
|
||||
if (url == null) {
|
||||
prop.putHTML("result", "No Entry for URL hash " + urlhash + "; nothing deleted.");
|
||||
} else {
|
||||
urlstring = url.toNormalform(true);
|
||||
prop.put("urlstring", "");
|
||||
sb.urlRemove(segment, urlhash.getBytes());
|
||||
prop.putHTML("result", "Removed URL " + urlstring);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
prop.putHTML("result", "Error when querying the url hash " + urlhash + ":" + e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -352,17 +352,23 @@ public class Load_RSS_p {
|
|||
author = item.getAuthor();
|
||||
if (author == null) author = item.getCopyright();
|
||||
pubDate = item.getPubDate();
|
||||
HarvestProcess harvestProcess = sb.urlExists(ASCII.String(messageurl.hash()));
|
||||
prop.put("showitems_item_" + i + "_state", harvestProcess != null ? 2 : RSSLoader.indexTriggered.containsKey(messageurl.hash()) ? 1 : 0);
|
||||
prop.put("showitems_item_" + i + "_state_count", i);
|
||||
prop.putHTML("showitems_item_" + i + "_state_guid", item.getGuid());
|
||||
prop.putHTML("showitems_item_" + i + "_author", author == null ? "" : author);
|
||||
prop.putHTML("showitems_item_" + i + "_title", item.getTitle());
|
||||
prop.putHTML("showitems_item_" + i + "_link", messageurl.toNormalform(true));
|
||||
prop.putHTML("showitems_item_" + i + "_description", item.getDescriptions().toString());
|
||||
prop.putHTML("showitems_item_" + i + "_language", item.getLanguage());
|
||||
prop.putHTML("showitems_item_" + i + "_date", (pubDate == null) ? "" : DateFormat.getDateTimeInstance().format(pubDate));
|
||||
i++;
|
||||
HarvestProcess harvestProcess;
|
||||
try {
|
||||
harvestProcess = sb.urlExists(ASCII.String(messageurl.hash()));
|
||||
prop.put("showitems_item_" + i + "_state", harvestProcess != null ? 2 : RSSLoader.indexTriggered.containsKey(messageurl.hash()) ? 1 : 0);
|
||||
prop.put("showitems_item_" + i + "_state_count", i);
|
||||
prop.putHTML("showitems_item_" + i + "_state_guid", item.getGuid());
|
||||
prop.putHTML("showitems_item_" + i + "_author", author == null ? "" : author);
|
||||
prop.putHTML("showitems_item_" + i + "_title", item.getTitle());
|
||||
prop.putHTML("showitems_item_" + i + "_link", messageurl.toNormalform(true));
|
||||
prop.putHTML("showitems_item_" + i + "_description", item.getDescriptions().toString());
|
||||
prop.putHTML("showitems_item_" + i + "_language", item.getLanguage());
|
||||
prop.putHTML("showitems_item_" + i + "_date", (pubDate == null) ? "" : DateFormat.getDateTimeInstance().format(pubDate));
|
||||
i++;
|
||||
} catch (IOException e) {
|
||||
ConcurrentLog.logException(e);
|
||||
continue;
|
||||
}
|
||||
} catch (final MalformedURLException e) {
|
||||
ConcurrentLog.logException(e);
|
||||
continue;
|
||||
|
|
|
@ -35,6 +35,7 @@ import net.yacy.cora.document.id.DigestURL;
|
|||
import net.yacy.cora.federate.solr.connector.SolrConnector;
|
||||
import net.yacy.cora.protocol.RequestHeader;
|
||||
import net.yacy.cora.sorting.OrderedScoreMap;
|
||||
import net.yacy.cora.util.ConcurrentLog;
|
||||
import net.yacy.document.SentenceReader;
|
||||
import net.yacy.search.Switchboard;
|
||||
import net.yacy.search.index.Segment;
|
||||
|
@ -86,10 +87,14 @@ public class citation {
|
|||
} catch (final MalformedURLException e) {}
|
||||
}
|
||||
if (uri == null && hash.length() > 0) {
|
||||
uri = sb.getURL(ASCII.getBytes(hash));
|
||||
if (uri == null) {
|
||||
connector.commit(true); // try again, that url can be fresh
|
||||
try {
|
||||
uri = sb.getURL(ASCII.getBytes(hash));
|
||||
if (uri == null) {
|
||||
connector.commit(true); // try again, that url can be fresh
|
||||
uri = sb.getURL(ASCII.getBytes(hash));
|
||||
}
|
||||
} catch (IOException e) {
|
||||
ConcurrentLog.logException(e);
|
||||
}
|
||||
}
|
||||
if (uri == null) return prop; // no proper url addressed
|
||||
|
|
|
@ -17,6 +17,7 @@
|
|||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.MalformedURLException;
|
||||
|
||||
import net.yacy.cora.document.encoding.ASCII;
|
||||
|
@ -25,6 +26,7 @@ import net.yacy.cora.order.Base64Order;
|
|||
import net.yacy.cora.protocol.HeaderFramework;
|
||||
import net.yacy.cora.protocol.RequestHeader;
|
||||
import net.yacy.cora.protocol.ResponseHeader;
|
||||
import net.yacy.cora.util.ConcurrentLog;
|
||||
import net.yacy.search.Switchboard;
|
||||
import net.yacy.search.index.Fulltext;
|
||||
import net.yacy.search.schema.HyperlinkEdge;
|
||||
|
@ -59,7 +61,11 @@ public class linkstructure {
|
|||
String hostname = null;
|
||||
if (about.length() == 12 && Base64Order.enhancedCoder.wellformed(ASCII.getBytes(about))) {
|
||||
byte[] urlhash = ASCII.getBytes(about);
|
||||
url = authenticated ? sb.getURL(urlhash) : null;
|
||||
try {
|
||||
url = authenticated ? sb.getURL(urlhash) : null;
|
||||
} catch (IOException e) {
|
||||
ConcurrentLog.logException(e);
|
||||
}
|
||||
} else if (url == null && about.length() > 0) { // consider "about" as url or hostname
|
||||
url = new DigestURL(about.indexOf("://") >= 0 ? about : "http://" + about); // accept also domains
|
||||
hostname = url.getHost();
|
||||
|
|
|
@ -65,7 +65,12 @@ public class webstructure {
|
|||
} else if (about.length() == 12 && Base64Order.enhancedCoder.wellformed(ASCII.getBytes(about))) {
|
||||
urlhash = ASCII.getBytes(about);
|
||||
hosthash = about.substring(6);
|
||||
url = authenticated ? sb.getURL(urlhash) : null;
|
||||
try {
|
||||
url = authenticated ? sb.getURL(urlhash) : null;
|
||||
} catch (IOException e) {
|
||||
url = null;
|
||||
ConcurrentLog.logException(e);
|
||||
}
|
||||
} else if (about.length() > 0) {
|
||||
// consider "about" as url or hostname
|
||||
try {
|
||||
|
@ -156,12 +161,17 @@ public class webstructure {
|
|||
Iterator<byte[]> i = ids.iterator();
|
||||
while (i.hasNext()) {
|
||||
byte[] refhash = i.next();
|
||||
DigestURL refurl = authenticated ? sb.getURL(refhash) : null;
|
||||
prop.put("citations_documents_0_anchors_" + d + "_urle", refurl == null ? 0 : 1);
|
||||
if (refurl != null) prop.putXML("citations_documents_0_anchors_" + d + "_urle_url", refurl.toNormalform(true));
|
||||
prop.put("citations_documents_0_anchors_" + d + "_urle_hash", refhash);
|
||||
prop.put("citations_documents_0_anchors_" + d + "_urle_date", GenericFormatter.SHORT_DAY_FORMATTER.format(new Date())); // superfluous?
|
||||
d++;
|
||||
DigestURL refurl;
|
||||
try {
|
||||
refurl = authenticated ? sb.getURL(refhash) : null;
|
||||
prop.put("citations_documents_0_anchors_" + d + "_urle", refurl == null ? 0 : 1);
|
||||
if (refurl != null) prop.putXML("citations_documents_0_anchors_" + d + "_urle_url", refurl.toNormalform(true));
|
||||
prop.put("citations_documents_0_anchors_" + d + "_urle_hash", refhash);
|
||||
prop.put("citations_documents_0_anchors_" + d + "_urle_date", GenericFormatter.SHORT_DAY_FORMATTER.format(new Date())); // superfluous?
|
||||
d++;
|
||||
} catch (IOException e) {
|
||||
ConcurrentLog.logException(e);
|
||||
}
|
||||
}
|
||||
prop.put("citations_documents_0_count", d);
|
||||
prop.put("citations_documents_0_anchors", d);
|
||||
|
|
|
@ -34,18 +34,16 @@ public class add_ymark {
|
|||
|
||||
if (post.containsKey("urlHash")) {
|
||||
final String urlHash = post.get("urlHash",YMarkUtil.EMPTY_STRING);
|
||||
final DigestURL url = sb.index.fulltext().getURL(urlHash);
|
||||
final String folders = post.get(YMarkEntry.BOOKMARK.FOLDERS.key(),YMarkEntry.BOOKMARK.FOLDERS.deflt());
|
||||
final String tags = post.get(YMarkEntry.BOOKMARK.TAGS.key(),YMarkUtil.EMPTY_STRING);
|
||||
try {
|
||||
final DigestURL url = sb.index.fulltext().getURL(urlHash);
|
||||
final String folders = post.get(YMarkEntry.BOOKMARK.FOLDERS.key(),YMarkEntry.BOOKMARK.FOLDERS.deflt());
|
||||
final String tags = post.get(YMarkEntry.BOOKMARK.TAGS.key(),YMarkUtil.EMPTY_STRING);
|
||||
ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
|
||||
sb.tables.bookmarks.createBookmark(sb.loader, url, agent, bmk_user, true, tags, folders);
|
||||
prop.put("status", "1");
|
||||
} catch (final IOException e) {
|
||||
// TODO Auto-generated catch block
|
||||
ConcurrentLog.logException(e);
|
||||
} catch (final Failure e) {
|
||||
// TODO Auto-generated catch block
|
||||
ConcurrentLog.logException(e);
|
||||
}
|
||||
|
||||
|
|
|
@ -27,6 +27,7 @@
|
|||
// javac -classpath .:../classes transferRWI.java
|
||||
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
|
@ -234,9 +235,14 @@ public final class transferRWI {
|
|||
}
|
||||
for (String id: testids) {
|
||||
try {
|
||||
if (sb.index.fulltext().getLoadTime(id) >= 0) {
|
||||
knownURL.put(ASCII.getBytes(id));
|
||||
} else {
|
||||
try {
|
||||
if (sb.index.fulltext().getLoadTime(id) >= 0) {
|
||||
knownURL.put(ASCII.getBytes(id));
|
||||
} else {
|
||||
unknownURL.put(ASCII.getBytes(id));
|
||||
}
|
||||
} catch (IOException e) {
|
||||
ConcurrentLog.logException(e);
|
||||
unknownURL.put(ASCII.getBytes(id));
|
||||
}
|
||||
} catch (final SpaceExceededException e) {
|
||||
|
|
|
@ -144,7 +144,14 @@ public final class transferURL {
|
|||
|
||||
doublecheck = 0;
|
||||
for (String id : lEm.keySet()) {
|
||||
if (sb.index.getLoadTime(id) < 0) {
|
||||
long lt = -1;
|
||||
try {
|
||||
lt = sb.index.getLoadTime(id);
|
||||
} catch (IOException e1) {
|
||||
lt = -1;
|
||||
ConcurrentLog.logException(e1);
|
||||
}
|
||||
if (lt < 0) {
|
||||
lEntry = lEm.get(id);
|
||||
|
||||
// write entry to database
|
||||
|
|
|
@ -29,6 +29,7 @@ import net.yacy.cora.date.GenericFormatter;
|
|||
import net.yacy.cora.document.encoding.ASCII;
|
||||
import net.yacy.cora.document.id.DigestURL;
|
||||
import net.yacy.cora.protocol.RequestHeader;
|
||||
import net.yacy.cora.util.ConcurrentLog;
|
||||
import net.yacy.crawler.data.NoticedURL;
|
||||
import net.yacy.crawler.retrieval.Request;
|
||||
import net.yacy.kelondro.data.meta.URIMetadataNode;
|
||||
|
@ -74,7 +75,12 @@ public class urls {
|
|||
if (entry == null) break;
|
||||
|
||||
// find referrer, if there is one
|
||||
referrer = sb.getURL(entry.referrerhash());
|
||||
try {
|
||||
referrer = sb.getURL(entry.referrerhash());
|
||||
} catch (IOException e) {
|
||||
referrer = null;
|
||||
ConcurrentLog.logException(e);
|
||||
}
|
||||
|
||||
// place url to notice-url db
|
||||
sb.crawlQueues.delegatedURL.put(ASCII.String(entry.url().hash()), entry.url());
|
||||
|
@ -106,16 +112,20 @@ public class urls {
|
|||
entry = sb.index.fulltext().getMetadata(ASCII.getBytes(urlhashes.substring(12 * i, 12 * (i + 1))));
|
||||
if (entry == null) continue;
|
||||
// find referrer, if there is one
|
||||
referrer = sb.getURL(entry.referrerHash());
|
||||
// create RSS entry
|
||||
prop.put("item_" + c + "_title", entry.dc_title());
|
||||
prop.putXML("item_" + c + "_link", entry.url().toNormalform(true));
|
||||
prop.putXML("item_" + c + "_referrer", (referrer == null) ? "" : referrer.toNormalform(true));
|
||||
prop.putXML("item_" + c + "_description", entry.dc_title());
|
||||
prop.put("item_" + c + "_author", entry.dc_creator());
|
||||
prop.put("item_" + c + "_pubDate", GenericFormatter.SHORT_SECOND_FORMATTER.format(entry.moddate()));
|
||||
prop.put("item_" + c + "_guid", ASCII.String(entry.hash()));
|
||||
c++;
|
||||
try {
|
||||
referrer = sb.getURL(entry.referrerHash());
|
||||
// create RSS entry
|
||||
prop.put("item_" + c + "_title", entry.dc_title());
|
||||
prop.putXML("item_" + c + "_link", entry.url().toNormalform(true));
|
||||
prop.putXML("item_" + c + "_referrer", (referrer == null) ? "" : referrer.toNormalform(true));
|
||||
prop.putXML("item_" + c + "_description", entry.dc_title());
|
||||
prop.put("item_" + c + "_author", entry.dc_creator());
|
||||
prop.put("item_" + c + "_pubDate", GenericFormatter.SHORT_SECOND_FORMATTER.format(entry.moddate()));
|
||||
prop.put("item_" + c + "_guid", ASCII.String(entry.hash()));
|
||||
c++;
|
||||
} catch (IOException e) {
|
||||
ConcurrentLog.logException(e);
|
||||
}
|
||||
}
|
||||
prop.put("item", c);
|
||||
prop.putXML("response", "ok");
|
||||
|
|
|
@ -588,19 +588,23 @@ public class yacysearch {
|
|||
return prop;
|
||||
}
|
||||
final String bookmarkHash = post.get("bookmarkref", ""); // urlhash
|
||||
final DigestURL url = indexSegment.fulltext().getURL(bookmarkHash);
|
||||
if ( url != null ) {
|
||||
try {
|
||||
sb.tables.bookmarks.createBookmark(
|
||||
sb.loader,
|
||||
url,
|
||||
ClientIdentification.yacyInternetCrawlerAgent,
|
||||
YMarkTables.USER_ADMIN,
|
||||
true,
|
||||
"searchresult",
|
||||
"/search");
|
||||
} catch (final Throwable e ) {
|
||||
try {
|
||||
final DigestURL url = indexSegment.fulltext().getURL(bookmarkHash);
|
||||
if ( url != null ) {
|
||||
try {
|
||||
sb.tables.bookmarks.createBookmark(
|
||||
sb.loader,
|
||||
url,
|
||||
ClientIdentification.yacyInternetCrawlerAgent,
|
||||
YMarkTables.USER_ADMIN,
|
||||
true,
|
||||
"searchresult",
|
||||
"/search");
|
||||
} catch (final Throwable e ) {
|
||||
}
|
||||
}
|
||||
} catch (IOException e) {
|
||||
ConcurrentLog.logException(e);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -313,9 +313,6 @@ public abstract class AbstractSolrConnector implements SolrConnector {
|
|||
//params.setQuery(CollectionSchema.id.getSolrFieldName() + ":\"" + id + "\"");
|
||||
String q = "{!raw f=" + CollectionSchema.id.getSolrFieldName() + "}" + id;
|
||||
params.setQuery(q);
|
||||
//params.setQuery("*:*");
|
||||
//params.addFilterQuery(q);
|
||||
//params.set("defType", "raw");
|
||||
params.setRows(1);
|
||||
params.setStart(0);
|
||||
params.setFacet(false);
|
||||
|
|
|
@ -405,7 +405,7 @@ public class EmbeddedSolrConnector extends SolrServerConnector implements SolrCo
|
|||
* @throws IOException
|
||||
*/
|
||||
@Override
|
||||
public LoadTimeURL getLoadTimeURL(String id) {
|
||||
public LoadTimeURL getLoadTimeURL(String id) throws IOException {
|
||||
int responseCount = 0;
|
||||
DocListSearcher docListSearcher = null;
|
||||
try {
|
||||
|
@ -421,10 +421,10 @@ public class EmbeddedSolrConnector extends SolrServerConnector implements SolrCo
|
|||
//}
|
||||
} catch (Throwable e) {
|
||||
ConcurrentLog.logException(e);
|
||||
throw new IOException(e.getMessage());
|
||||
} finally {
|
||||
if (docListSearcher != null) docListSearcher.close();
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -154,7 +154,7 @@ public interface SolrConnector extends Iterable<String> /* Iterable of document
|
|||
* @param query
|
||||
* @throws IOException
|
||||
*/
|
||||
public QueryResponse getResponseByParams(final ModifiableSolrParams query) throws IOException, SolrException;
|
||||
public QueryResponse getResponseByParams(final ModifiableSolrParams query) throws IOException;
|
||||
|
||||
/**
|
||||
* get the solr document list from a query response
|
||||
|
@ -165,7 +165,7 @@ public interface SolrConnector extends Iterable<String> /* Iterable of document
|
|||
* @throws IOException
|
||||
* @throws SolrException
|
||||
*/
|
||||
public SolrDocumentList getDocumentListByParams(ModifiableSolrParams params) throws IOException, SolrException;
|
||||
public SolrDocumentList getDocumentListByParams(ModifiableSolrParams params) throws IOException;
|
||||
|
||||
/**
|
||||
* get the number of results for a query response
|
||||
|
@ -174,7 +174,7 @@ public interface SolrConnector extends Iterable<String> /* Iterable of document
|
|||
* @throws IOException
|
||||
* @throws SolrException
|
||||
*/
|
||||
public long getDocumentCountByParams(ModifiableSolrParams params) throws IOException, SolrException;
|
||||
public long getDocumentCountByParams(ModifiableSolrParams params) throws IOException;
|
||||
|
||||
/**
|
||||
* get a query result from solr
|
||||
|
@ -191,7 +191,7 @@ public interface SolrConnector extends Iterable<String> /* Iterable of document
|
|||
final String sort,
|
||||
final int offset,
|
||||
final int count,
|
||||
final String ... fields) throws IOException, SolrException;
|
||||
final String ... fields) throws IOException;
|
||||
|
||||
/**
|
||||
* get the number of results when this query is done.
|
||||
|
|
|
@ -33,7 +33,6 @@ import net.yacy.search.schema.CollectionSchema;
|
|||
import org.apache.lucene.analysis.NumericTokenStream;
|
||||
import org.apache.solr.common.SolrDocumentList;
|
||||
import org.apache.solr.common.SolrException;
|
||||
import org.apache.solr.common.SolrException.ErrorCode;
|
||||
import org.apache.solr.common.SolrInputDocument;
|
||||
import org.apache.solr.common.params.ModifiableSolrParams;
|
||||
import org.apache.solr.common.util.NamedList;
|
||||
|
@ -289,7 +288,7 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen
|
|||
* @throws SolrException
|
||||
*/
|
||||
@Override
|
||||
public SolrDocumentList getDocumentListByParams(ModifiableSolrParams params) throws IOException, SolrException {
|
||||
public SolrDocumentList getDocumentListByParams(ModifiableSolrParams params) throws IOException {
|
||||
if (this.server == null) throw new IOException("server disconnected");
|
||||
// during the solr query we set the thread name to the query string to get more debugging info in thread dumps
|
||||
String q = params.get("q");
|
||||
|
@ -297,18 +296,25 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen
|
|||
String threadname = Thread.currentThread().getName();
|
||||
if (q != null) Thread.currentThread().setName("solr query: q = " + q + (fq == null ? "" : ", fq = " + fq));
|
||||
QueryResponse rsp;
|
||||
try {
|
||||
rsp = this.server.query(params);
|
||||
if (q != null) Thread.currentThread().setName(threadname);
|
||||
if (rsp != null) if (log.isFine()) log.fine(rsp.getResults().getNumFound() + " results for q=" + q);
|
||||
return rsp.getResults();
|
||||
} catch (final SolrServerException e) {
|
||||
clearCaches(); // prevent further OOM if this was caused by OOM
|
||||
throw new SolrException(ErrorCode.UNKNOWN, e);
|
||||
} catch (final Throwable e) {
|
||||
clearCaches(); // prevent further OOM if this was caused by OOM
|
||||
throw new IOException("Error executing query", e);
|
||||
int retry = 10;
|
||||
Throwable error = null;
|
||||
while (retry-- > 0) {
|
||||
try {
|
||||
rsp = this.server.query(params);
|
||||
if (q != null) Thread.currentThread().setName(threadname);
|
||||
if (rsp != null) if (log.isFine()) log.fine(rsp.getResults().getNumFound() + " results for q=" + q);
|
||||
return rsp.getResults();
|
||||
} catch (final SolrServerException e) {
|
||||
error = e;
|
||||
clearCaches(); // prevent further OOM if this was caused by OOM
|
||||
} catch (final Throwable e) {
|
||||
error = e;
|
||||
clearCaches(); // prevent further OOM if this was caused by OOM
|
||||
}
|
||||
ConcurrentLog.severe("SolrServerConnector", "Failed to query remote Solr: " + error.getMessage() + ", query:" + q + (fq == null ? "" : ", fq = " + fq));
|
||||
try {Thread.sleep(1000);} catch (InterruptedException e) {}
|
||||
}
|
||||
throw new IOException("Error executing query", error);
|
||||
}
|
||||
|
||||
// luke requests: these do not work for attached SolrCloud Server
|
||||
|
|
|
@ -388,7 +388,10 @@ public final class CrawlStacker {
|
|||
try {
|
||||
oldEntry = this.indexSegment.fulltext().getDefaultConnector().getLoadTimeURL(urlhash);
|
||||
} catch (IOException e) {
|
||||
// if an exception here occurs then there is the danger that urls which had been in the crawler are overwritten a second time
|
||||
// to prevent that, we reject urls in these events
|
||||
ConcurrentLog.logException(e);
|
||||
return "exception during double-test: " + e.getMessage();
|
||||
}
|
||||
final Long oldDate = oldEntry == null ? null : oldEntry.date;
|
||||
if (oldDate == null) {
|
||||
|
|
|
@ -108,11 +108,16 @@ public class RSSLoader extends Thread {
|
|||
}
|
||||
}
|
||||
for (final Map.Entry<String, DigestURL> e: urlmap.entrySet()) {
|
||||
HarvestProcess harvestProcess = sb.urlExists(e.getKey());
|
||||
if (harvestProcess != null) continue;
|
||||
list.add(e.getValue());
|
||||
indexTriggered.insertIfAbsent(ASCII.getBytes(e.getKey()), new Date());
|
||||
loadCount++;
|
||||
HarvestProcess harvestProcess;
|
||||
try {
|
||||
harvestProcess = sb.urlExists(e.getKey());
|
||||
if (harvestProcess != null) continue;
|
||||
list.add(e.getValue());
|
||||
indexTriggered.insertIfAbsent(ASCII.getBytes(e.getKey()), new Date());
|
||||
loadCount++;
|
||||
} catch (IOException e1) {
|
||||
ConcurrentLog.logException(e1);
|
||||
}
|
||||
}
|
||||
sb.addToIndex(list, null, null, collections, true);
|
||||
// update info for loading
|
||||
|
|
|
@ -25,6 +25,7 @@
|
|||
|
||||
package net.yacy.crawler.retrieval;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.MalformedURLException;
|
||||
import java.util.Date;
|
||||
|
||||
|
@ -82,15 +83,20 @@ public class SitemapImporter extends Thread {
|
|||
// check if the url is known and needs to be recrawled
|
||||
Date lastMod = entry.lastmod(null);
|
||||
if (lastMod != null) {
|
||||
final HarvestProcess dbocc = this.sb.urlExists(ASCII.String(nexturlhash));
|
||||
if (dbocc != null && dbocc == HarvestProcess.LOADED) {
|
||||
// the url was already loaded. we need to check the date
|
||||
final URIMetadataNode oldEntry = this.sb.index.fulltext().getMetadata(nexturlhash);
|
||||
if (oldEntry != null) {
|
||||
final Date modDate = oldEntry.moddate();
|
||||
// check if modDate is null
|
||||
if (modDate.after(lastMod)) return;
|
||||
HarvestProcess dbocc;
|
||||
try {
|
||||
dbocc = this.sb.urlExists(ASCII.String(nexturlhash));
|
||||
if (dbocc != null && dbocc == HarvestProcess.LOADED) {
|
||||
// the url was already loaded. we need to check the date
|
||||
final URIMetadataNode oldEntry = this.sb.index.fulltext().getMetadata(nexturlhash);
|
||||
if (oldEntry != null) {
|
||||
final Date modDate = oldEntry.moddate();
|
||||
// check if modDate is null
|
||||
if (modDate.after(lastMod)) return;
|
||||
}
|
||||
}
|
||||
} catch (IOException e) {
|
||||
ConcurrentLog.logException(e);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -35,6 +35,7 @@ import net.yacy.cora.document.encoding.ASCII;
|
|||
import net.yacy.cora.document.id.DigestURL;
|
||||
import net.yacy.cora.federate.yacy.CacheStrategy;
|
||||
import net.yacy.cora.protocol.ClientIdentification;
|
||||
import net.yacy.cora.util.ConcurrentLog;
|
||||
import net.yacy.crawler.retrieval.Response;
|
||||
import net.yacy.document.Document;
|
||||
import net.yacy.document.Parser.Failure;
|
||||
|
@ -82,7 +83,12 @@ public class YMarkMetadata {
|
|||
public YMarkMetadata(final byte[] urlHash, final Segment indexSegment) {
|
||||
this.document = null;
|
||||
this.indexSegment = indexSegment;
|
||||
this.uri = this.indexSegment.fulltext().getURL(ASCII.String(urlHash));
|
||||
try {
|
||||
this.uri = this.indexSegment.fulltext().getURL(ASCII.String(urlHash));
|
||||
} catch (IOException e) {
|
||||
this.uri = null;
|
||||
ConcurrentLog.logException(e);
|
||||
}
|
||||
}
|
||||
|
||||
public YMarkMetadata(final Document document) {
|
||||
|
|
|
@ -24,6 +24,7 @@
|
|||
|
||||
package net.yacy.peers;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
|
@ -174,9 +175,15 @@ public class Transmission {
|
|||
i = c.entries();
|
||||
while (i.hasNext()) {
|
||||
final WordReference e = i.next();
|
||||
if (Transmission.this.segment.fulltext().getLoadTime(ASCII.String(e.urlhash())) >= 0) {
|
||||
this.references.put(e.urlhash());
|
||||
} else {
|
||||
try {
|
||||
if (Transmission.this.segment.fulltext().getLoadTime(ASCII.String(e.urlhash())) >= 0) {
|
||||
this.references.put(e.urlhash());
|
||||
} else {
|
||||
notFoundx.add(e.urlhash());
|
||||
this.badReferences.put(e.urlhash());
|
||||
}
|
||||
} catch (IOException e1) {
|
||||
ConcurrentLog.logException(e1);
|
||||
notFoundx.add(e.urlhash());
|
||||
this.badReferences.put(e.urlhash());
|
||||
}
|
||||
|
|
|
@ -1620,18 +1620,12 @@ public final class Switchboard extends serverSwitch {
|
|||
* @param hash
|
||||
* @return if it exists, the name of the database is returned, if it not exists, null is returned
|
||||
*/
|
||||
public HarvestProcess urlExists(final String hash) {
|
||||
if (this.index.getLoadTime(hash) >= 0) return HarvestProcess.LOADED;
|
||||
public HarvestProcess urlExists(final String hash) throws IOException {
|
||||
LoadTimeURL md = this.index.fulltext().getDefaultConnector().getLoadTimeURL(hash);
|
||||
if (md != null && md.date >= 0) return HarvestProcess.LOADED;
|
||||
HarvestProcess hp = this.crawlQueues.exists(ASCII.getBytes(hash));
|
||||
if (hp != null) return hp;
|
||||
try {
|
||||
LoadTimeURL md = this.index.fulltext().getDefaultConnector().getLoadTimeURL(hash);
|
||||
if (md == null) return null;
|
||||
return HarvestProcess.LOADED; // todo: can also be in error
|
||||
} catch (IOException e) {
|
||||
ConcurrentLog.logException(e);
|
||||
return null;
|
||||
}
|
||||
return null; // todo: can also be in error
|
||||
}
|
||||
|
||||
public void urlRemove(final Segment segment, final byte[] hash) {
|
||||
|
@ -1640,7 +1634,7 @@ public final class Switchboard extends serverSwitch {
|
|||
this.crawlQueues.removeURL(hash);
|
||||
}
|
||||
|
||||
public DigestURL getURL(final byte[] urlhash) {
|
||||
public DigestURL getURL(final byte[] urlhash) throws IOException {
|
||||
if (urlhash == null) return null;
|
||||
if (urlhash.length == 0) return null;
|
||||
final DigestURL url = this.index.fulltext().getURL(ASCII.String(urlhash));
|
||||
|
@ -2977,7 +2971,15 @@ public final class Switchboard extends serverSwitch {
|
|||
// stacking may fail because of double occurrences of that url. Therefore
|
||||
// we must wait here until the url has actually disappeared
|
||||
int t = 100;
|
||||
while (t-- > 0 && this.index.getLoadTime(ASCII.String(urlhash)) >= 0) {
|
||||
while (t-- > 0) {
|
||||
try {
|
||||
long lt = this.index.getLoadTime(ASCII.String(urlhash));
|
||||
if (lt < 0) break;
|
||||
} catch (IOException e) {
|
||||
// if this fails, the url may still exist
|
||||
// we should abandon the whole process
|
||||
return "exist-test failed: " + e.getMessage();
|
||||
}
|
||||
try {Thread.sleep(100);} catch (final InterruptedException e) {}
|
||||
ConcurrentLog.fine("Switchboard", "STACKURL: waiting for deletion, t=" + t);
|
||||
//if (t == 20) this.index.fulltext().commit(true);
|
||||
|
@ -3094,9 +3096,17 @@ public final class Switchboard extends serverSwitch {
|
|||
final List<Request> requests = new ArrayList<Request>();
|
||||
for (Map.Entry<String, DigestURL> e: urlmap.entrySet()) {
|
||||
final String urlName = e.getValue().toNormalform(true);
|
||||
if (doublecheck && this.index.getLoadTime(e.getKey()) >= 0) {
|
||||
this.log.info("addToIndex: double " + urlName);
|
||||
continue;
|
||||
if (doublecheck) {
|
||||
try {
|
||||
if (this.index.getLoadTime(e.getKey()) >= 0) {
|
||||
this.log.info("addToIndex: double " + urlName);
|
||||
continue;
|
||||
}
|
||||
} catch (IOException ee) {
|
||||
// double check fail may mean that the url exist
|
||||
this.log.info("addToIndex: doublecheck failed for " + urlName + ": " + ee.getMessage());
|
||||
continue;
|
||||
}
|
||||
}
|
||||
final Request request = this.loader.request(e.getValue(), true, true);
|
||||
final CrawlProfile profile = this.crawler.get(ASCII.getBytes(request.profileHandle()));
|
||||
|
@ -3168,7 +3178,11 @@ public final class Switchboard extends serverSwitch {
|
|||
Map<String, DigestURL> urlmap = new HashMap<String, DigestURL>();
|
||||
for (DigestURL url: urls) urlmap.put(ASCII.String(url.hash()), url);
|
||||
for (Map.Entry<String, DigestURL> e: urlmap.entrySet()) {
|
||||
if (this.index.getLoadTime(e.getKey()) >= 0) continue; // double
|
||||
try {
|
||||
if (this.index.getLoadTime(e.getKey()) >= 0) continue; // double
|
||||
} catch (IOException ee) {
|
||||
continue; // if the check fails, consider the url as double
|
||||
}
|
||||
DigestURL url = e.getValue();
|
||||
final Request request = this.loader.request(url, true, true);
|
||||
final CrawlProfile profile = this.crawler.get(ASCII.getBytes(request.profileHandle()));
|
||||
|
|
|
@ -473,16 +473,12 @@ public final class Fulltext {
|
|||
return false;
|
||||
}
|
||||
|
||||
public DigestURL getURL(final String urlHash) {
|
||||
public DigestURL getURL(final String urlHash) throws IOException {
|
||||
if (urlHash == null || this.getDefaultConnector() == null) return null;
|
||||
|
||||
try {
|
||||
SolrConnector.LoadTimeURL md = this.getDefaultConnector().getLoadTimeURL(urlHash);
|
||||
if (md == null) return null;
|
||||
return new DigestURL(md.url, ASCII.getBytes(urlHash));
|
||||
} catch (final IOException e) {
|
||||
return null;
|
||||
}
|
||||
SolrConnector.LoadTimeURL md = this.getDefaultConnector().getLoadTimeURL(urlHash);
|
||||
if (md == null) return null;
|
||||
return new DigestURL(md.url, ASCII.getBytes(urlHash));
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -490,16 +486,11 @@ public final class Fulltext {
|
|||
* @param urlHash
|
||||
* @return the time in milliseconds since epoch for the load time or -1 if the document does not exist
|
||||
*/
|
||||
public long getLoadTime(final String urlHash) {
|
||||
public long getLoadTime(final String urlHash) throws IOException {
|
||||
if (urlHash == null) return -1l;
|
||||
try {
|
||||
SolrConnector.LoadTimeURL md = this.getDefaultConnector().getLoadTimeURL(urlHash);
|
||||
if (md == null) return -1l;
|
||||
return md.date;
|
||||
} catch (final Throwable e) {
|
||||
ConcurrentLog.logException(e);
|
||||
}
|
||||
return -1l;
|
||||
SolrConnector.LoadTimeURL md = this.getDefaultConnector().getLoadTimeURL(urlHash);
|
||||
if (md == null) return -1l;
|
||||
return md.date;
|
||||
}
|
||||
|
||||
public List<File> dumpFiles() {
|
||||
|
|
|
@ -356,7 +356,7 @@ public class Segment {
|
|||
* @param urlHash
|
||||
* @return the time in milliseconds since epoch for the load time or -1 if the document does not exist
|
||||
*/
|
||||
public long getLoadTime(final String urlhash) {
|
||||
public long getLoadTime(final String urlhash) throws IOException {
|
||||
return this.fulltext.getLoadTime(urlhash);
|
||||
}
|
||||
|
||||
|
@ -683,10 +683,10 @@ public class Segment {
|
|||
|
||||
if (urlhash == null) return 0;
|
||||
// determine the url string
|
||||
final DigestURL url = fulltext().getURL(ASCII.String(urlhash));
|
||||
if (url == null) return 0;
|
||||
|
||||
try {
|
||||
final DigestURL url = fulltext().getURL(ASCII.String(urlhash));
|
||||
if (url == null) return 0;
|
||||
|
||||
// parse the resource
|
||||
final Document document = Document.mergeDocuments(url, null, loader.loadDocuments(loader.request(url, true, false), cacheStrategy, Integer.MAX_VALUE, null, agent));
|
||||
if (document == null) {
|
||||
|
|
Loading…
Reference in New Issue
Block a user