mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
better error handling for remote solr queries and exists-checks
This commit is contained in:
parent
b510b182d8
commit
22ce4fb4dd
|
@ -138,8 +138,13 @@ public class HostBrowser {
|
||||||
|
|
||||||
String load = post.get("load", "");
|
String load = post.get("load", "");
|
||||||
boolean wait = false;
|
boolean wait = false;
|
||||||
if (loadRight && autoload && path.length() != 0 && pathURI != null && load.length() == 0 && sb.index.getLoadTime(ASCII.String(pathURI.hash())) < 0) {
|
try {
|
||||||
// in case that the url does not exist and loading is wanted turn this request into a loading request
|
if (loadRight && autoload && path.length() != 0 && pathURI != null && load.length() == 0 && sb.index.getLoadTime(ASCII.String(pathURI.hash())) < 0) {
|
||||||
|
// in case that the url does not exist and loading is wanted turn this request into a loading request
|
||||||
|
load = path;
|
||||||
|
wait = true;
|
||||||
|
}
|
||||||
|
} catch (IOException e1) {
|
||||||
load = path;
|
load = path;
|
||||||
wait = true;
|
wait = true;
|
||||||
}
|
}
|
||||||
|
@ -156,8 +161,13 @@ public class HostBrowser {
|
||||||
0, 0, 0
|
0, 0, 0
|
||||||
));
|
));
|
||||||
prop.putHTML("result", reasonString == null ? ("added url to indexer: " + load) : ("not indexed url '" + load + "': " + reasonString));
|
prop.putHTML("result", reasonString == null ? ("added url to indexer: " + load) : ("not indexed url '" + load + "': " + reasonString));
|
||||||
if (wait) for (int i = 0; i < 30; i++) {
|
if (wait) waitloop: for (int i = 0; i < 30; i++) {
|
||||||
if (sb.index.getLoadTime(ASCII.String(url.hash())) >= 0) break;
|
try {
|
||||||
|
if (sb.index.getLoadTime(ASCII.String(url.hash())) >= 0) break;
|
||||||
|
} catch (IOException e1) {
|
||||||
|
e1.printStackTrace();
|
||||||
|
break waitloop;
|
||||||
|
}
|
||||||
try {Thread.sleep(100);} catch (final InterruptedException e) {}
|
try {Thread.sleep(100);} catch (final InterruptedException e) {}
|
||||||
}
|
}
|
||||||
} catch (final MalformedURLException e) {
|
} catch (final MalformedURLException e) {
|
||||||
|
|
|
@ -369,8 +369,7 @@ public class IndexControlRWIs_p {
|
||||||
Word.commonHashOrder,
|
Word.commonHashOrder,
|
||||||
urlb.size());
|
urlb.size());
|
||||||
if ( post.containsKey("blacklisturls") ) {
|
if ( post.containsKey("blacklisturls") ) {
|
||||||
final String[] supportedBlacklistTypes =
|
final String[] supportedBlacklistTypes = env.getConfig("BlackLists.types", "").split(",");
|
||||||
env.getConfig("BlackLists.types", "").split(",");
|
|
||||||
DigestURL url;
|
DigestURL url;
|
||||||
for ( final byte[] b : urlb ) {
|
for ( final byte[] b : urlb ) {
|
||||||
try {
|
try {
|
||||||
|
@ -378,28 +377,32 @@ public class IndexControlRWIs_p {
|
||||||
} catch (final SpaceExceededException e ) {
|
} catch (final SpaceExceededException e ) {
|
||||||
ConcurrentLog.logException(e);
|
ConcurrentLog.logException(e);
|
||||||
}
|
}
|
||||||
url = segment.fulltext().getURL(ASCII.String(b));
|
try {
|
||||||
segment.fulltext().remove(b);
|
url = segment.fulltext().getURL(ASCII.String(b));
|
||||||
if ( url != null ) {
|
segment.fulltext().remove(b);
|
||||||
for ( final String supportedBlacklistType : supportedBlacklistTypes ) {
|
if ( url != null ) {
|
||||||
if ( ListManager.listSetContains(
|
for ( final String supportedBlacklistType : supportedBlacklistTypes ) {
|
||||||
supportedBlacklistType + ".BlackLists",
|
if ( ListManager.listSetContains(
|
||||||
blacklist) ) {
|
supportedBlacklistType + ".BlackLists",
|
||||||
try {
|
blacklist) ) {
|
||||||
Switchboard.urlBlacklist.add(
|
try {
|
||||||
BlacklistType.valueOf(supportedBlacklistType),
|
Switchboard.urlBlacklist.add(
|
||||||
blacklist,
|
BlacklistType.valueOf(supportedBlacklistType),
|
||||||
url.getHost(),
|
blacklist,
|
||||||
url.getFile());
|
url.getHost(),
|
||||||
} catch (PunycodeException e) {
|
url.getFile());
|
||||||
ConcurrentLog.warn(APP_NAME,
|
} catch (PunycodeException e) {
|
||||||
"Unable to add blacklist entry to blacklist "
|
ConcurrentLog.warn(APP_NAME,
|
||||||
+ supportedBlacklistType, e);
|
"Unable to add blacklist entry to blacklist "
|
||||||
|
+ supportedBlacklistType, e);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
SearchEventCache.cleanupEvents(true);
|
||||||
SearchEventCache.cleanupEvents(true);
|
}
|
||||||
}
|
} catch (IOException e1) {
|
||||||
|
ConcurrentLog.logException(e1);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -411,27 +414,29 @@ public class IndexControlRWIs_p {
|
||||||
} catch (final SpaceExceededException e ) {
|
} catch (final SpaceExceededException e ) {
|
||||||
ConcurrentLog.logException(e);
|
ConcurrentLog.logException(e);
|
||||||
}
|
}
|
||||||
url = segment.fulltext().getURL(ASCII.String(b));
|
try {
|
||||||
segment.fulltext().remove(b);
|
url = segment.fulltext().getURL(ASCII.String(b));
|
||||||
if ( url != null ) {
|
segment.fulltext().remove(b);
|
||||||
for ( final BlacklistType supportedBlacklistType : BlacklistType.values() ) {
|
if ( url != null ) {
|
||||||
if ( ListManager.listSetContains(
|
for ( final BlacklistType supportedBlacklistType : BlacklistType.values() ) {
|
||||||
supportedBlacklistType + ".BlackLists",
|
if ( ListManager.listSetContains(supportedBlacklistType + ".BlackLists", blacklist) ) {
|
||||||
blacklist) ) {
|
try {
|
||||||
try {
|
Switchboard.urlBlacklist.add(
|
||||||
Switchboard.urlBlacklist.add(
|
supportedBlacklistType,
|
||||||
supportedBlacklistType,
|
blacklist,
|
||||||
blacklist,
|
url.getHost(),
|
||||||
url.getHost(),
|
".*");
|
||||||
".*");
|
} catch (PunycodeException e) {
|
||||||
} catch (PunycodeException e) {
|
ConcurrentLog.warn(APP_NAME,
|
||||||
ConcurrentLog.warn(APP_NAME,
|
"Unable to add blacklist entry to blacklist "
|
||||||
"Unable to add blacklist entry to blacklist "
|
+ supportedBlacklistType, e);
|
||||||
+ supportedBlacklistType, e);
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
} catch (IOException e1) {
|
||||||
|
ConcurrentLog.logException(e1);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
try {
|
try {
|
||||||
|
|
|
@ -183,14 +183,19 @@ public class IndexControlURLs_p {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (post.containsKey("urlhashdelete")) {
|
if (post.containsKey("urlhashdelete")) {
|
||||||
final DigestURL url = segment.fulltext().getURL(urlhash);
|
DigestURL url;
|
||||||
if (url == null) {
|
try {
|
||||||
prop.putHTML("result", "No Entry for URL hash " + urlhash + "; nothing deleted.");
|
url = segment.fulltext().getURL(urlhash);
|
||||||
} else {
|
if (url == null) {
|
||||||
urlstring = url.toNormalform(true);
|
prop.putHTML("result", "No Entry for URL hash " + urlhash + "; nothing deleted.");
|
||||||
prop.put("urlstring", "");
|
} else {
|
||||||
sb.urlRemove(segment, urlhash.getBytes());
|
urlstring = url.toNormalform(true);
|
||||||
prop.putHTML("result", "Removed URL " + urlstring);
|
prop.put("urlstring", "");
|
||||||
|
sb.urlRemove(segment, urlhash.getBytes());
|
||||||
|
prop.putHTML("result", "Removed URL " + urlstring);
|
||||||
|
}
|
||||||
|
} catch (IOException e) {
|
||||||
|
prop.putHTML("result", "Error when querying the url hash " + urlhash + ":" + e.getMessage());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -352,17 +352,23 @@ public class Load_RSS_p {
|
||||||
author = item.getAuthor();
|
author = item.getAuthor();
|
||||||
if (author == null) author = item.getCopyright();
|
if (author == null) author = item.getCopyright();
|
||||||
pubDate = item.getPubDate();
|
pubDate = item.getPubDate();
|
||||||
HarvestProcess harvestProcess = sb.urlExists(ASCII.String(messageurl.hash()));
|
HarvestProcess harvestProcess;
|
||||||
prop.put("showitems_item_" + i + "_state", harvestProcess != null ? 2 : RSSLoader.indexTriggered.containsKey(messageurl.hash()) ? 1 : 0);
|
try {
|
||||||
prop.put("showitems_item_" + i + "_state_count", i);
|
harvestProcess = sb.urlExists(ASCII.String(messageurl.hash()));
|
||||||
prop.putHTML("showitems_item_" + i + "_state_guid", item.getGuid());
|
prop.put("showitems_item_" + i + "_state", harvestProcess != null ? 2 : RSSLoader.indexTriggered.containsKey(messageurl.hash()) ? 1 : 0);
|
||||||
prop.putHTML("showitems_item_" + i + "_author", author == null ? "" : author);
|
prop.put("showitems_item_" + i + "_state_count", i);
|
||||||
prop.putHTML("showitems_item_" + i + "_title", item.getTitle());
|
prop.putHTML("showitems_item_" + i + "_state_guid", item.getGuid());
|
||||||
prop.putHTML("showitems_item_" + i + "_link", messageurl.toNormalform(true));
|
prop.putHTML("showitems_item_" + i + "_author", author == null ? "" : author);
|
||||||
prop.putHTML("showitems_item_" + i + "_description", item.getDescriptions().toString());
|
prop.putHTML("showitems_item_" + i + "_title", item.getTitle());
|
||||||
prop.putHTML("showitems_item_" + i + "_language", item.getLanguage());
|
prop.putHTML("showitems_item_" + i + "_link", messageurl.toNormalform(true));
|
||||||
prop.putHTML("showitems_item_" + i + "_date", (pubDate == null) ? "" : DateFormat.getDateTimeInstance().format(pubDate));
|
prop.putHTML("showitems_item_" + i + "_description", item.getDescriptions().toString());
|
||||||
i++;
|
prop.putHTML("showitems_item_" + i + "_language", item.getLanguage());
|
||||||
|
prop.putHTML("showitems_item_" + i + "_date", (pubDate == null) ? "" : DateFormat.getDateTimeInstance().format(pubDate));
|
||||||
|
i++;
|
||||||
|
} catch (IOException e) {
|
||||||
|
ConcurrentLog.logException(e);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
} catch (final MalformedURLException e) {
|
} catch (final MalformedURLException e) {
|
||||||
ConcurrentLog.logException(e);
|
ConcurrentLog.logException(e);
|
||||||
continue;
|
continue;
|
||||||
|
|
|
@ -35,6 +35,7 @@ import net.yacy.cora.document.id.DigestURL;
|
||||||
import net.yacy.cora.federate.solr.connector.SolrConnector;
|
import net.yacy.cora.federate.solr.connector.SolrConnector;
|
||||||
import net.yacy.cora.protocol.RequestHeader;
|
import net.yacy.cora.protocol.RequestHeader;
|
||||||
import net.yacy.cora.sorting.OrderedScoreMap;
|
import net.yacy.cora.sorting.OrderedScoreMap;
|
||||||
|
import net.yacy.cora.util.ConcurrentLog;
|
||||||
import net.yacy.document.SentenceReader;
|
import net.yacy.document.SentenceReader;
|
||||||
import net.yacy.search.Switchboard;
|
import net.yacy.search.Switchboard;
|
||||||
import net.yacy.search.index.Segment;
|
import net.yacy.search.index.Segment;
|
||||||
|
@ -86,10 +87,14 @@ public class citation {
|
||||||
} catch (final MalformedURLException e) {}
|
} catch (final MalformedURLException e) {}
|
||||||
}
|
}
|
||||||
if (uri == null && hash.length() > 0) {
|
if (uri == null && hash.length() > 0) {
|
||||||
uri = sb.getURL(ASCII.getBytes(hash));
|
try {
|
||||||
if (uri == null) {
|
|
||||||
connector.commit(true); // try again, that url can be fresh
|
|
||||||
uri = sb.getURL(ASCII.getBytes(hash));
|
uri = sb.getURL(ASCII.getBytes(hash));
|
||||||
|
if (uri == null) {
|
||||||
|
connector.commit(true); // try again, that url can be fresh
|
||||||
|
uri = sb.getURL(ASCII.getBytes(hash));
|
||||||
|
}
|
||||||
|
} catch (IOException e) {
|
||||||
|
ConcurrentLog.logException(e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (uri == null) return prop; // no proper url addressed
|
if (uri == null) return prop; // no proper url addressed
|
||||||
|
|
|
@ -17,6 +17,7 @@
|
||||||
// along with this program; if not, write to the Free Software
|
// along with this program; if not, write to the Free Software
|
||||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
import java.net.MalformedURLException;
|
import java.net.MalformedURLException;
|
||||||
|
|
||||||
import net.yacy.cora.document.encoding.ASCII;
|
import net.yacy.cora.document.encoding.ASCII;
|
||||||
|
@ -25,6 +26,7 @@ import net.yacy.cora.order.Base64Order;
|
||||||
import net.yacy.cora.protocol.HeaderFramework;
|
import net.yacy.cora.protocol.HeaderFramework;
|
||||||
import net.yacy.cora.protocol.RequestHeader;
|
import net.yacy.cora.protocol.RequestHeader;
|
||||||
import net.yacy.cora.protocol.ResponseHeader;
|
import net.yacy.cora.protocol.ResponseHeader;
|
||||||
|
import net.yacy.cora.util.ConcurrentLog;
|
||||||
import net.yacy.search.Switchboard;
|
import net.yacy.search.Switchboard;
|
||||||
import net.yacy.search.index.Fulltext;
|
import net.yacy.search.index.Fulltext;
|
||||||
import net.yacy.search.schema.HyperlinkEdge;
|
import net.yacy.search.schema.HyperlinkEdge;
|
||||||
|
@ -59,7 +61,11 @@ public class linkstructure {
|
||||||
String hostname = null;
|
String hostname = null;
|
||||||
if (about.length() == 12 && Base64Order.enhancedCoder.wellformed(ASCII.getBytes(about))) {
|
if (about.length() == 12 && Base64Order.enhancedCoder.wellformed(ASCII.getBytes(about))) {
|
||||||
byte[] urlhash = ASCII.getBytes(about);
|
byte[] urlhash = ASCII.getBytes(about);
|
||||||
url = authenticated ? sb.getURL(urlhash) : null;
|
try {
|
||||||
|
url = authenticated ? sb.getURL(urlhash) : null;
|
||||||
|
} catch (IOException e) {
|
||||||
|
ConcurrentLog.logException(e);
|
||||||
|
}
|
||||||
} else if (url == null && about.length() > 0) { // consider "about" as url or hostname
|
} else if (url == null && about.length() > 0) { // consider "about" as url or hostname
|
||||||
url = new DigestURL(about.indexOf("://") >= 0 ? about : "http://" + about); // accept also domains
|
url = new DigestURL(about.indexOf("://") >= 0 ? about : "http://" + about); // accept also domains
|
||||||
hostname = url.getHost();
|
hostname = url.getHost();
|
||||||
|
|
|
@ -65,7 +65,12 @@ public class webstructure {
|
||||||
} else if (about.length() == 12 && Base64Order.enhancedCoder.wellformed(ASCII.getBytes(about))) {
|
} else if (about.length() == 12 && Base64Order.enhancedCoder.wellformed(ASCII.getBytes(about))) {
|
||||||
urlhash = ASCII.getBytes(about);
|
urlhash = ASCII.getBytes(about);
|
||||||
hosthash = about.substring(6);
|
hosthash = about.substring(6);
|
||||||
url = authenticated ? sb.getURL(urlhash) : null;
|
try {
|
||||||
|
url = authenticated ? sb.getURL(urlhash) : null;
|
||||||
|
} catch (IOException e) {
|
||||||
|
url = null;
|
||||||
|
ConcurrentLog.logException(e);
|
||||||
|
}
|
||||||
} else if (about.length() > 0) {
|
} else if (about.length() > 0) {
|
||||||
// consider "about" as url or hostname
|
// consider "about" as url or hostname
|
||||||
try {
|
try {
|
||||||
|
@ -156,12 +161,17 @@ public class webstructure {
|
||||||
Iterator<byte[]> i = ids.iterator();
|
Iterator<byte[]> i = ids.iterator();
|
||||||
while (i.hasNext()) {
|
while (i.hasNext()) {
|
||||||
byte[] refhash = i.next();
|
byte[] refhash = i.next();
|
||||||
DigestURL refurl = authenticated ? sb.getURL(refhash) : null;
|
DigestURL refurl;
|
||||||
prop.put("citations_documents_0_anchors_" + d + "_urle", refurl == null ? 0 : 1);
|
try {
|
||||||
if (refurl != null) prop.putXML("citations_documents_0_anchors_" + d + "_urle_url", refurl.toNormalform(true));
|
refurl = authenticated ? sb.getURL(refhash) : null;
|
||||||
prop.put("citations_documents_0_anchors_" + d + "_urle_hash", refhash);
|
prop.put("citations_documents_0_anchors_" + d + "_urle", refurl == null ? 0 : 1);
|
||||||
prop.put("citations_documents_0_anchors_" + d + "_urle_date", GenericFormatter.SHORT_DAY_FORMATTER.format(new Date())); // superfluous?
|
if (refurl != null) prop.putXML("citations_documents_0_anchors_" + d + "_urle_url", refurl.toNormalform(true));
|
||||||
d++;
|
prop.put("citations_documents_0_anchors_" + d + "_urle_hash", refhash);
|
||||||
|
prop.put("citations_documents_0_anchors_" + d + "_urle_date", GenericFormatter.SHORT_DAY_FORMATTER.format(new Date())); // superfluous?
|
||||||
|
d++;
|
||||||
|
} catch (IOException e) {
|
||||||
|
ConcurrentLog.logException(e);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
prop.put("citations_documents_0_count", d);
|
prop.put("citations_documents_0_count", d);
|
||||||
prop.put("citations_documents_0_anchors", d);
|
prop.put("citations_documents_0_anchors", d);
|
||||||
|
|
|
@ -34,18 +34,16 @@ public class add_ymark {
|
||||||
|
|
||||||
if (post.containsKey("urlHash")) {
|
if (post.containsKey("urlHash")) {
|
||||||
final String urlHash = post.get("urlHash",YMarkUtil.EMPTY_STRING);
|
final String urlHash = post.get("urlHash",YMarkUtil.EMPTY_STRING);
|
||||||
final DigestURL url = sb.index.fulltext().getURL(urlHash);
|
|
||||||
final String folders = post.get(YMarkEntry.BOOKMARK.FOLDERS.key(),YMarkEntry.BOOKMARK.FOLDERS.deflt());
|
|
||||||
final String tags = post.get(YMarkEntry.BOOKMARK.TAGS.key(),YMarkUtil.EMPTY_STRING);
|
|
||||||
try {
|
try {
|
||||||
|
final DigestURL url = sb.index.fulltext().getURL(urlHash);
|
||||||
|
final String folders = post.get(YMarkEntry.BOOKMARK.FOLDERS.key(),YMarkEntry.BOOKMARK.FOLDERS.deflt());
|
||||||
|
final String tags = post.get(YMarkEntry.BOOKMARK.TAGS.key(),YMarkUtil.EMPTY_STRING);
|
||||||
ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
|
ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
|
||||||
sb.tables.bookmarks.createBookmark(sb.loader, url, agent, bmk_user, true, tags, folders);
|
sb.tables.bookmarks.createBookmark(sb.loader, url, agent, bmk_user, true, tags, folders);
|
||||||
prop.put("status", "1");
|
prop.put("status", "1");
|
||||||
} catch (final IOException e) {
|
} catch (final IOException e) {
|
||||||
// TODO Auto-generated catch block
|
|
||||||
ConcurrentLog.logException(e);
|
ConcurrentLog.logException(e);
|
||||||
} catch (final Failure e) {
|
} catch (final Failure e) {
|
||||||
// TODO Auto-generated catch block
|
|
||||||
ConcurrentLog.logException(e);
|
ConcurrentLog.logException(e);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -27,6 +27,7 @@
|
||||||
// javac -classpath .:../classes transferRWI.java
|
// javac -classpath .:../classes transferRWI.java
|
||||||
|
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
|
@ -234,9 +235,14 @@ public final class transferRWI {
|
||||||
}
|
}
|
||||||
for (String id: testids) {
|
for (String id: testids) {
|
||||||
try {
|
try {
|
||||||
if (sb.index.fulltext().getLoadTime(id) >= 0) {
|
try {
|
||||||
knownURL.put(ASCII.getBytes(id));
|
if (sb.index.fulltext().getLoadTime(id) >= 0) {
|
||||||
} else {
|
knownURL.put(ASCII.getBytes(id));
|
||||||
|
} else {
|
||||||
|
unknownURL.put(ASCII.getBytes(id));
|
||||||
|
}
|
||||||
|
} catch (IOException e) {
|
||||||
|
ConcurrentLog.logException(e);
|
||||||
unknownURL.put(ASCII.getBytes(id));
|
unknownURL.put(ASCII.getBytes(id));
|
||||||
}
|
}
|
||||||
} catch (final SpaceExceededException e) {
|
} catch (final SpaceExceededException e) {
|
||||||
|
|
|
@ -144,7 +144,14 @@ public final class transferURL {
|
||||||
|
|
||||||
doublecheck = 0;
|
doublecheck = 0;
|
||||||
for (String id : lEm.keySet()) {
|
for (String id : lEm.keySet()) {
|
||||||
if (sb.index.getLoadTime(id) < 0) {
|
long lt = -1;
|
||||||
|
try {
|
||||||
|
lt = sb.index.getLoadTime(id);
|
||||||
|
} catch (IOException e1) {
|
||||||
|
lt = -1;
|
||||||
|
ConcurrentLog.logException(e1);
|
||||||
|
}
|
||||||
|
if (lt < 0) {
|
||||||
lEntry = lEm.get(id);
|
lEntry = lEm.get(id);
|
||||||
|
|
||||||
// write entry to database
|
// write entry to database
|
||||||
|
|
|
@ -29,6 +29,7 @@ import net.yacy.cora.date.GenericFormatter;
|
||||||
import net.yacy.cora.document.encoding.ASCII;
|
import net.yacy.cora.document.encoding.ASCII;
|
||||||
import net.yacy.cora.document.id.DigestURL;
|
import net.yacy.cora.document.id.DigestURL;
|
||||||
import net.yacy.cora.protocol.RequestHeader;
|
import net.yacy.cora.protocol.RequestHeader;
|
||||||
|
import net.yacy.cora.util.ConcurrentLog;
|
||||||
import net.yacy.crawler.data.NoticedURL;
|
import net.yacy.crawler.data.NoticedURL;
|
||||||
import net.yacy.crawler.retrieval.Request;
|
import net.yacy.crawler.retrieval.Request;
|
||||||
import net.yacy.kelondro.data.meta.URIMetadataNode;
|
import net.yacy.kelondro.data.meta.URIMetadataNode;
|
||||||
|
@ -74,7 +75,12 @@ public class urls {
|
||||||
if (entry == null) break;
|
if (entry == null) break;
|
||||||
|
|
||||||
// find referrer, if there is one
|
// find referrer, if there is one
|
||||||
referrer = sb.getURL(entry.referrerhash());
|
try {
|
||||||
|
referrer = sb.getURL(entry.referrerhash());
|
||||||
|
} catch (IOException e) {
|
||||||
|
referrer = null;
|
||||||
|
ConcurrentLog.logException(e);
|
||||||
|
}
|
||||||
|
|
||||||
// place url to notice-url db
|
// place url to notice-url db
|
||||||
sb.crawlQueues.delegatedURL.put(ASCII.String(entry.url().hash()), entry.url());
|
sb.crawlQueues.delegatedURL.put(ASCII.String(entry.url().hash()), entry.url());
|
||||||
|
@ -106,16 +112,20 @@ public class urls {
|
||||||
entry = sb.index.fulltext().getMetadata(ASCII.getBytes(urlhashes.substring(12 * i, 12 * (i + 1))));
|
entry = sb.index.fulltext().getMetadata(ASCII.getBytes(urlhashes.substring(12 * i, 12 * (i + 1))));
|
||||||
if (entry == null) continue;
|
if (entry == null) continue;
|
||||||
// find referrer, if there is one
|
// find referrer, if there is one
|
||||||
referrer = sb.getURL(entry.referrerHash());
|
try {
|
||||||
// create RSS entry
|
referrer = sb.getURL(entry.referrerHash());
|
||||||
prop.put("item_" + c + "_title", entry.dc_title());
|
// create RSS entry
|
||||||
prop.putXML("item_" + c + "_link", entry.url().toNormalform(true));
|
prop.put("item_" + c + "_title", entry.dc_title());
|
||||||
prop.putXML("item_" + c + "_referrer", (referrer == null) ? "" : referrer.toNormalform(true));
|
prop.putXML("item_" + c + "_link", entry.url().toNormalform(true));
|
||||||
prop.putXML("item_" + c + "_description", entry.dc_title());
|
prop.putXML("item_" + c + "_referrer", (referrer == null) ? "" : referrer.toNormalform(true));
|
||||||
prop.put("item_" + c + "_author", entry.dc_creator());
|
prop.putXML("item_" + c + "_description", entry.dc_title());
|
||||||
prop.put("item_" + c + "_pubDate", GenericFormatter.SHORT_SECOND_FORMATTER.format(entry.moddate()));
|
prop.put("item_" + c + "_author", entry.dc_creator());
|
||||||
prop.put("item_" + c + "_guid", ASCII.String(entry.hash()));
|
prop.put("item_" + c + "_pubDate", GenericFormatter.SHORT_SECOND_FORMATTER.format(entry.moddate()));
|
||||||
c++;
|
prop.put("item_" + c + "_guid", ASCII.String(entry.hash()));
|
||||||
|
c++;
|
||||||
|
} catch (IOException e) {
|
||||||
|
ConcurrentLog.logException(e);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
prop.put("item", c);
|
prop.put("item", c);
|
||||||
prop.putXML("response", "ok");
|
prop.putXML("response", "ok");
|
||||||
|
|
|
@ -588,19 +588,23 @@ public class yacysearch {
|
||||||
return prop;
|
return prop;
|
||||||
}
|
}
|
||||||
final String bookmarkHash = post.get("bookmarkref", ""); // urlhash
|
final String bookmarkHash = post.get("bookmarkref", ""); // urlhash
|
||||||
final DigestURL url = indexSegment.fulltext().getURL(bookmarkHash);
|
try {
|
||||||
if ( url != null ) {
|
final DigestURL url = indexSegment.fulltext().getURL(bookmarkHash);
|
||||||
try {
|
if ( url != null ) {
|
||||||
sb.tables.bookmarks.createBookmark(
|
try {
|
||||||
sb.loader,
|
sb.tables.bookmarks.createBookmark(
|
||||||
url,
|
sb.loader,
|
||||||
ClientIdentification.yacyInternetCrawlerAgent,
|
url,
|
||||||
YMarkTables.USER_ADMIN,
|
ClientIdentification.yacyInternetCrawlerAgent,
|
||||||
true,
|
YMarkTables.USER_ADMIN,
|
||||||
"searchresult",
|
true,
|
||||||
"/search");
|
"searchresult",
|
||||||
} catch (final Throwable e ) {
|
"/search");
|
||||||
|
} catch (final Throwable e ) {
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
} catch (IOException e) {
|
||||||
|
ConcurrentLog.logException(e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -313,9 +313,6 @@ public abstract class AbstractSolrConnector implements SolrConnector {
|
||||||
//params.setQuery(CollectionSchema.id.getSolrFieldName() + ":\"" + id + "\"");
|
//params.setQuery(CollectionSchema.id.getSolrFieldName() + ":\"" + id + "\"");
|
||||||
String q = "{!raw f=" + CollectionSchema.id.getSolrFieldName() + "}" + id;
|
String q = "{!raw f=" + CollectionSchema.id.getSolrFieldName() + "}" + id;
|
||||||
params.setQuery(q);
|
params.setQuery(q);
|
||||||
//params.setQuery("*:*");
|
|
||||||
//params.addFilterQuery(q);
|
|
||||||
//params.set("defType", "raw");
|
|
||||||
params.setRows(1);
|
params.setRows(1);
|
||||||
params.setStart(0);
|
params.setStart(0);
|
||||||
params.setFacet(false);
|
params.setFacet(false);
|
||||||
|
|
|
@ -405,7 +405,7 @@ public class EmbeddedSolrConnector extends SolrServerConnector implements SolrCo
|
||||||
* @throws IOException
|
* @throws IOException
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
public LoadTimeURL getLoadTimeURL(String id) {
|
public LoadTimeURL getLoadTimeURL(String id) throws IOException {
|
||||||
int responseCount = 0;
|
int responseCount = 0;
|
||||||
DocListSearcher docListSearcher = null;
|
DocListSearcher docListSearcher = null;
|
||||||
try {
|
try {
|
||||||
|
@ -421,10 +421,10 @@ public class EmbeddedSolrConnector extends SolrServerConnector implements SolrCo
|
||||||
//}
|
//}
|
||||||
} catch (Throwable e) {
|
} catch (Throwable e) {
|
||||||
ConcurrentLog.logException(e);
|
ConcurrentLog.logException(e);
|
||||||
|
throw new IOException(e.getMessage());
|
||||||
} finally {
|
} finally {
|
||||||
if (docListSearcher != null) docListSearcher.close();
|
if (docListSearcher != null) docListSearcher.close();
|
||||||
}
|
}
|
||||||
return null;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -154,7 +154,7 @@ public interface SolrConnector extends Iterable<String> /* Iterable of document
|
||||||
* @param query
|
* @param query
|
||||||
* @throws IOException
|
* @throws IOException
|
||||||
*/
|
*/
|
||||||
public QueryResponse getResponseByParams(final ModifiableSolrParams query) throws IOException, SolrException;
|
public QueryResponse getResponseByParams(final ModifiableSolrParams query) throws IOException;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* get the solr document list from a query response
|
* get the solr document list from a query response
|
||||||
|
@ -165,7 +165,7 @@ public interface SolrConnector extends Iterable<String> /* Iterable of document
|
||||||
* @throws IOException
|
* @throws IOException
|
||||||
* @throws SolrException
|
* @throws SolrException
|
||||||
*/
|
*/
|
||||||
public SolrDocumentList getDocumentListByParams(ModifiableSolrParams params) throws IOException, SolrException;
|
public SolrDocumentList getDocumentListByParams(ModifiableSolrParams params) throws IOException;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* get the number of results for a query response
|
* get the number of results for a query response
|
||||||
|
@ -174,7 +174,7 @@ public interface SolrConnector extends Iterable<String> /* Iterable of document
|
||||||
* @throws IOException
|
* @throws IOException
|
||||||
* @throws SolrException
|
* @throws SolrException
|
||||||
*/
|
*/
|
||||||
public long getDocumentCountByParams(ModifiableSolrParams params) throws IOException, SolrException;
|
public long getDocumentCountByParams(ModifiableSolrParams params) throws IOException;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* get a query result from solr
|
* get a query result from solr
|
||||||
|
@ -191,7 +191,7 @@ public interface SolrConnector extends Iterable<String> /* Iterable of document
|
||||||
final String sort,
|
final String sort,
|
||||||
final int offset,
|
final int offset,
|
||||||
final int count,
|
final int count,
|
||||||
final String ... fields) throws IOException, SolrException;
|
final String ... fields) throws IOException;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* get the number of results when this query is done.
|
* get the number of results when this query is done.
|
||||||
|
|
|
@ -33,7 +33,6 @@ import net.yacy.search.schema.CollectionSchema;
|
||||||
import org.apache.lucene.analysis.NumericTokenStream;
|
import org.apache.lucene.analysis.NumericTokenStream;
|
||||||
import org.apache.solr.common.SolrDocumentList;
|
import org.apache.solr.common.SolrDocumentList;
|
||||||
import org.apache.solr.common.SolrException;
|
import org.apache.solr.common.SolrException;
|
||||||
import org.apache.solr.common.SolrException.ErrorCode;
|
|
||||||
import org.apache.solr.common.SolrInputDocument;
|
import org.apache.solr.common.SolrInputDocument;
|
||||||
import org.apache.solr.common.params.ModifiableSolrParams;
|
import org.apache.solr.common.params.ModifiableSolrParams;
|
||||||
import org.apache.solr.common.util.NamedList;
|
import org.apache.solr.common.util.NamedList;
|
||||||
|
@ -289,7 +288,7 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen
|
||||||
* @throws SolrException
|
* @throws SolrException
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
public SolrDocumentList getDocumentListByParams(ModifiableSolrParams params) throws IOException, SolrException {
|
public SolrDocumentList getDocumentListByParams(ModifiableSolrParams params) throws IOException {
|
||||||
if (this.server == null) throw new IOException("server disconnected");
|
if (this.server == null) throw new IOException("server disconnected");
|
||||||
// during the solr query we set the thread name to the query string to get more debugging info in thread dumps
|
// during the solr query we set the thread name to the query string to get more debugging info in thread dumps
|
||||||
String q = params.get("q");
|
String q = params.get("q");
|
||||||
|
@ -297,18 +296,25 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen
|
||||||
String threadname = Thread.currentThread().getName();
|
String threadname = Thread.currentThread().getName();
|
||||||
if (q != null) Thread.currentThread().setName("solr query: q = " + q + (fq == null ? "" : ", fq = " + fq));
|
if (q != null) Thread.currentThread().setName("solr query: q = " + q + (fq == null ? "" : ", fq = " + fq));
|
||||||
QueryResponse rsp;
|
QueryResponse rsp;
|
||||||
try {
|
int retry = 10;
|
||||||
rsp = this.server.query(params);
|
Throwable error = null;
|
||||||
if (q != null) Thread.currentThread().setName(threadname);
|
while (retry-- > 0) {
|
||||||
if (rsp != null) if (log.isFine()) log.fine(rsp.getResults().getNumFound() + " results for q=" + q);
|
try {
|
||||||
return rsp.getResults();
|
rsp = this.server.query(params);
|
||||||
} catch (final SolrServerException e) {
|
if (q != null) Thread.currentThread().setName(threadname);
|
||||||
clearCaches(); // prevent further OOM if this was caused by OOM
|
if (rsp != null) if (log.isFine()) log.fine(rsp.getResults().getNumFound() + " results for q=" + q);
|
||||||
throw new SolrException(ErrorCode.UNKNOWN, e);
|
return rsp.getResults();
|
||||||
} catch (final Throwable e) {
|
} catch (final SolrServerException e) {
|
||||||
clearCaches(); // prevent further OOM if this was caused by OOM
|
error = e;
|
||||||
throw new IOException("Error executing query", e);
|
clearCaches(); // prevent further OOM if this was caused by OOM
|
||||||
|
} catch (final Throwable e) {
|
||||||
|
error = e;
|
||||||
|
clearCaches(); // prevent further OOM if this was caused by OOM
|
||||||
|
}
|
||||||
|
ConcurrentLog.severe("SolrServerConnector", "Failed to query remote Solr: " + error.getMessage() + ", query:" + q + (fq == null ? "" : ", fq = " + fq));
|
||||||
|
try {Thread.sleep(1000);} catch (InterruptedException e) {}
|
||||||
}
|
}
|
||||||
|
throw new IOException("Error executing query", error);
|
||||||
}
|
}
|
||||||
|
|
||||||
// luke requests: these do not work for attached SolrCloud Server
|
// luke requests: these do not work for attached SolrCloud Server
|
||||||
|
|
|
@ -388,7 +388,10 @@ public final class CrawlStacker {
|
||||||
try {
|
try {
|
||||||
oldEntry = this.indexSegment.fulltext().getDefaultConnector().getLoadTimeURL(urlhash);
|
oldEntry = this.indexSegment.fulltext().getDefaultConnector().getLoadTimeURL(urlhash);
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
|
// if an exception here occurs then there is the danger that urls which had been in the crawler are overwritten a second time
|
||||||
|
// to prevent that, we reject urls in these events
|
||||||
ConcurrentLog.logException(e);
|
ConcurrentLog.logException(e);
|
||||||
|
return "exception during double-test: " + e.getMessage();
|
||||||
}
|
}
|
||||||
final Long oldDate = oldEntry == null ? null : oldEntry.date;
|
final Long oldDate = oldEntry == null ? null : oldEntry.date;
|
||||||
if (oldDate == null) {
|
if (oldDate == null) {
|
||||||
|
|
|
@ -108,11 +108,16 @@ public class RSSLoader extends Thread {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for (final Map.Entry<String, DigestURL> e: urlmap.entrySet()) {
|
for (final Map.Entry<String, DigestURL> e: urlmap.entrySet()) {
|
||||||
HarvestProcess harvestProcess = sb.urlExists(e.getKey());
|
HarvestProcess harvestProcess;
|
||||||
if (harvestProcess != null) continue;
|
try {
|
||||||
list.add(e.getValue());
|
harvestProcess = sb.urlExists(e.getKey());
|
||||||
indexTriggered.insertIfAbsent(ASCII.getBytes(e.getKey()), new Date());
|
if (harvestProcess != null) continue;
|
||||||
loadCount++;
|
list.add(e.getValue());
|
||||||
|
indexTriggered.insertIfAbsent(ASCII.getBytes(e.getKey()), new Date());
|
||||||
|
loadCount++;
|
||||||
|
} catch (IOException e1) {
|
||||||
|
ConcurrentLog.logException(e1);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
sb.addToIndex(list, null, null, collections, true);
|
sb.addToIndex(list, null, null, collections, true);
|
||||||
// update info for loading
|
// update info for loading
|
||||||
|
|
|
@ -25,6 +25,7 @@
|
||||||
|
|
||||||
package net.yacy.crawler.retrieval;
|
package net.yacy.crawler.retrieval;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
import java.net.MalformedURLException;
|
import java.net.MalformedURLException;
|
||||||
import java.util.Date;
|
import java.util.Date;
|
||||||
|
|
||||||
|
@ -82,15 +83,20 @@ public class SitemapImporter extends Thread {
|
||||||
// check if the url is known and needs to be recrawled
|
// check if the url is known and needs to be recrawled
|
||||||
Date lastMod = entry.lastmod(null);
|
Date lastMod = entry.lastmod(null);
|
||||||
if (lastMod != null) {
|
if (lastMod != null) {
|
||||||
final HarvestProcess dbocc = this.sb.urlExists(ASCII.String(nexturlhash));
|
HarvestProcess dbocc;
|
||||||
if (dbocc != null && dbocc == HarvestProcess.LOADED) {
|
try {
|
||||||
// the url was already loaded. we need to check the date
|
dbocc = this.sb.urlExists(ASCII.String(nexturlhash));
|
||||||
final URIMetadataNode oldEntry = this.sb.index.fulltext().getMetadata(nexturlhash);
|
if (dbocc != null && dbocc == HarvestProcess.LOADED) {
|
||||||
if (oldEntry != null) {
|
// the url was already loaded. we need to check the date
|
||||||
final Date modDate = oldEntry.moddate();
|
final URIMetadataNode oldEntry = this.sb.index.fulltext().getMetadata(nexturlhash);
|
||||||
// check if modDate is null
|
if (oldEntry != null) {
|
||||||
if (modDate.after(lastMod)) return;
|
final Date modDate = oldEntry.moddate();
|
||||||
|
// check if modDate is null
|
||||||
|
if (modDate.after(lastMod)) return;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
} catch (IOException e) {
|
||||||
|
ConcurrentLog.logException(e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -35,6 +35,7 @@ import net.yacy.cora.document.encoding.ASCII;
|
||||||
import net.yacy.cora.document.id.DigestURL;
|
import net.yacy.cora.document.id.DigestURL;
|
||||||
import net.yacy.cora.federate.yacy.CacheStrategy;
|
import net.yacy.cora.federate.yacy.CacheStrategy;
|
||||||
import net.yacy.cora.protocol.ClientIdentification;
|
import net.yacy.cora.protocol.ClientIdentification;
|
||||||
|
import net.yacy.cora.util.ConcurrentLog;
|
||||||
import net.yacy.crawler.retrieval.Response;
|
import net.yacy.crawler.retrieval.Response;
|
||||||
import net.yacy.document.Document;
|
import net.yacy.document.Document;
|
||||||
import net.yacy.document.Parser.Failure;
|
import net.yacy.document.Parser.Failure;
|
||||||
|
@ -82,7 +83,12 @@ public class YMarkMetadata {
|
||||||
public YMarkMetadata(final byte[] urlHash, final Segment indexSegment) {
|
public YMarkMetadata(final byte[] urlHash, final Segment indexSegment) {
|
||||||
this.document = null;
|
this.document = null;
|
||||||
this.indexSegment = indexSegment;
|
this.indexSegment = indexSegment;
|
||||||
this.uri = this.indexSegment.fulltext().getURL(ASCII.String(urlHash));
|
try {
|
||||||
|
this.uri = this.indexSegment.fulltext().getURL(ASCII.String(urlHash));
|
||||||
|
} catch (IOException e) {
|
||||||
|
this.uri = null;
|
||||||
|
ConcurrentLog.logException(e);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public YMarkMetadata(final Document document) {
|
public YMarkMetadata(final Document document) {
|
||||||
|
|
|
@ -24,6 +24,7 @@
|
||||||
|
|
||||||
package net.yacy.peers;
|
package net.yacy.peers;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
|
@ -174,9 +175,15 @@ public class Transmission {
|
||||||
i = c.entries();
|
i = c.entries();
|
||||||
while (i.hasNext()) {
|
while (i.hasNext()) {
|
||||||
final WordReference e = i.next();
|
final WordReference e = i.next();
|
||||||
if (Transmission.this.segment.fulltext().getLoadTime(ASCII.String(e.urlhash())) >= 0) {
|
try {
|
||||||
this.references.put(e.urlhash());
|
if (Transmission.this.segment.fulltext().getLoadTime(ASCII.String(e.urlhash())) >= 0) {
|
||||||
} else {
|
this.references.put(e.urlhash());
|
||||||
|
} else {
|
||||||
|
notFoundx.add(e.urlhash());
|
||||||
|
this.badReferences.put(e.urlhash());
|
||||||
|
}
|
||||||
|
} catch (IOException e1) {
|
||||||
|
ConcurrentLog.logException(e1);
|
||||||
notFoundx.add(e.urlhash());
|
notFoundx.add(e.urlhash());
|
||||||
this.badReferences.put(e.urlhash());
|
this.badReferences.put(e.urlhash());
|
||||||
}
|
}
|
||||||
|
|
|
@ -1620,18 +1620,12 @@ public final class Switchboard extends serverSwitch {
|
||||||
* @param hash
|
* @param hash
|
||||||
* @return if it exists, the name of the database is returned, if it not exists, null is returned
|
* @return if it exists, the name of the database is returned, if it not exists, null is returned
|
||||||
*/
|
*/
|
||||||
public HarvestProcess urlExists(final String hash) {
|
public HarvestProcess urlExists(final String hash) throws IOException {
|
||||||
if (this.index.getLoadTime(hash) >= 0) return HarvestProcess.LOADED;
|
LoadTimeURL md = this.index.fulltext().getDefaultConnector().getLoadTimeURL(hash);
|
||||||
|
if (md != null && md.date >= 0) return HarvestProcess.LOADED;
|
||||||
HarvestProcess hp = this.crawlQueues.exists(ASCII.getBytes(hash));
|
HarvestProcess hp = this.crawlQueues.exists(ASCII.getBytes(hash));
|
||||||
if (hp != null) return hp;
|
if (hp != null) return hp;
|
||||||
try {
|
return null; // todo: can also be in error
|
||||||
LoadTimeURL md = this.index.fulltext().getDefaultConnector().getLoadTimeURL(hash);
|
|
||||||
if (md == null) return null;
|
|
||||||
return HarvestProcess.LOADED; // todo: can also be in error
|
|
||||||
} catch (IOException e) {
|
|
||||||
ConcurrentLog.logException(e);
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public void urlRemove(final Segment segment, final byte[] hash) {
|
public void urlRemove(final Segment segment, final byte[] hash) {
|
||||||
|
@ -1640,7 +1634,7 @@ public final class Switchboard extends serverSwitch {
|
||||||
this.crawlQueues.removeURL(hash);
|
this.crawlQueues.removeURL(hash);
|
||||||
}
|
}
|
||||||
|
|
||||||
public DigestURL getURL(final byte[] urlhash) {
|
public DigestURL getURL(final byte[] urlhash) throws IOException {
|
||||||
if (urlhash == null) return null;
|
if (urlhash == null) return null;
|
||||||
if (urlhash.length == 0) return null;
|
if (urlhash.length == 0) return null;
|
||||||
final DigestURL url = this.index.fulltext().getURL(ASCII.String(urlhash));
|
final DigestURL url = this.index.fulltext().getURL(ASCII.String(urlhash));
|
||||||
|
@ -2977,7 +2971,15 @@ public final class Switchboard extends serverSwitch {
|
||||||
// stacking may fail because of double occurrences of that url. Therefore
|
// stacking may fail because of double occurrences of that url. Therefore
|
||||||
// we must wait here until the url has actually disappeared
|
// we must wait here until the url has actually disappeared
|
||||||
int t = 100;
|
int t = 100;
|
||||||
while (t-- > 0 && this.index.getLoadTime(ASCII.String(urlhash)) >= 0) {
|
while (t-- > 0) {
|
||||||
|
try {
|
||||||
|
long lt = this.index.getLoadTime(ASCII.String(urlhash));
|
||||||
|
if (lt < 0) break;
|
||||||
|
} catch (IOException e) {
|
||||||
|
// if this fails, the url may still exist
|
||||||
|
// we should abandon the whole process
|
||||||
|
return "exist-test failed: " + e.getMessage();
|
||||||
|
}
|
||||||
try {Thread.sleep(100);} catch (final InterruptedException e) {}
|
try {Thread.sleep(100);} catch (final InterruptedException e) {}
|
||||||
ConcurrentLog.fine("Switchboard", "STACKURL: waiting for deletion, t=" + t);
|
ConcurrentLog.fine("Switchboard", "STACKURL: waiting for deletion, t=" + t);
|
||||||
//if (t == 20) this.index.fulltext().commit(true);
|
//if (t == 20) this.index.fulltext().commit(true);
|
||||||
|
@ -3094,9 +3096,17 @@ public final class Switchboard extends serverSwitch {
|
||||||
final List<Request> requests = new ArrayList<Request>();
|
final List<Request> requests = new ArrayList<Request>();
|
||||||
for (Map.Entry<String, DigestURL> e: urlmap.entrySet()) {
|
for (Map.Entry<String, DigestURL> e: urlmap.entrySet()) {
|
||||||
final String urlName = e.getValue().toNormalform(true);
|
final String urlName = e.getValue().toNormalform(true);
|
||||||
if (doublecheck && this.index.getLoadTime(e.getKey()) >= 0) {
|
if (doublecheck) {
|
||||||
this.log.info("addToIndex: double " + urlName);
|
try {
|
||||||
continue;
|
if (this.index.getLoadTime(e.getKey()) >= 0) {
|
||||||
|
this.log.info("addToIndex: double " + urlName);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
} catch (IOException ee) {
|
||||||
|
// double check fail may mean that the url exist
|
||||||
|
this.log.info("addToIndex: doublecheck failed for " + urlName + ": " + ee.getMessage());
|
||||||
|
continue;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
final Request request = this.loader.request(e.getValue(), true, true);
|
final Request request = this.loader.request(e.getValue(), true, true);
|
||||||
final CrawlProfile profile = this.crawler.get(ASCII.getBytes(request.profileHandle()));
|
final CrawlProfile profile = this.crawler.get(ASCII.getBytes(request.profileHandle()));
|
||||||
|
@ -3168,7 +3178,11 @@ public final class Switchboard extends serverSwitch {
|
||||||
Map<String, DigestURL> urlmap = new HashMap<String, DigestURL>();
|
Map<String, DigestURL> urlmap = new HashMap<String, DigestURL>();
|
||||||
for (DigestURL url: urls) urlmap.put(ASCII.String(url.hash()), url);
|
for (DigestURL url: urls) urlmap.put(ASCII.String(url.hash()), url);
|
||||||
for (Map.Entry<String, DigestURL> e: urlmap.entrySet()) {
|
for (Map.Entry<String, DigestURL> e: urlmap.entrySet()) {
|
||||||
if (this.index.getLoadTime(e.getKey()) >= 0) continue; // double
|
try {
|
||||||
|
if (this.index.getLoadTime(e.getKey()) >= 0) continue; // double
|
||||||
|
} catch (IOException ee) {
|
||||||
|
continue; // if the check fails, consider the url as double
|
||||||
|
}
|
||||||
DigestURL url = e.getValue();
|
DigestURL url = e.getValue();
|
||||||
final Request request = this.loader.request(url, true, true);
|
final Request request = this.loader.request(url, true, true);
|
||||||
final CrawlProfile profile = this.crawler.get(ASCII.getBytes(request.profileHandle()));
|
final CrawlProfile profile = this.crawler.get(ASCII.getBytes(request.profileHandle()));
|
||||||
|
|
|
@ -473,16 +473,12 @@ public final class Fulltext {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
public DigestURL getURL(final String urlHash) {
|
public DigestURL getURL(final String urlHash) throws IOException {
|
||||||
if (urlHash == null || this.getDefaultConnector() == null) return null;
|
if (urlHash == null || this.getDefaultConnector() == null) return null;
|
||||||
|
|
||||||
try {
|
SolrConnector.LoadTimeURL md = this.getDefaultConnector().getLoadTimeURL(urlHash);
|
||||||
SolrConnector.LoadTimeURL md = this.getDefaultConnector().getLoadTimeURL(urlHash);
|
if (md == null) return null;
|
||||||
if (md == null) return null;
|
return new DigestURL(md.url, ASCII.getBytes(urlHash));
|
||||||
return new DigestURL(md.url, ASCII.getBytes(urlHash));
|
|
||||||
} catch (final IOException e) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -490,16 +486,11 @@ public final class Fulltext {
|
||||||
* @param urlHash
|
* @param urlHash
|
||||||
* @return the time in milliseconds since epoch for the load time or -1 if the document does not exist
|
* @return the time in milliseconds since epoch for the load time or -1 if the document does not exist
|
||||||
*/
|
*/
|
||||||
public long getLoadTime(final String urlHash) {
|
public long getLoadTime(final String urlHash) throws IOException {
|
||||||
if (urlHash == null) return -1l;
|
if (urlHash == null) return -1l;
|
||||||
try {
|
SolrConnector.LoadTimeURL md = this.getDefaultConnector().getLoadTimeURL(urlHash);
|
||||||
SolrConnector.LoadTimeURL md = this.getDefaultConnector().getLoadTimeURL(urlHash);
|
if (md == null) return -1l;
|
||||||
if (md == null) return -1l;
|
return md.date;
|
||||||
return md.date;
|
|
||||||
} catch (final Throwable e) {
|
|
||||||
ConcurrentLog.logException(e);
|
|
||||||
}
|
|
||||||
return -1l;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<File> dumpFiles() {
|
public List<File> dumpFiles() {
|
||||||
|
|
|
@ -356,7 +356,7 @@ public class Segment {
|
||||||
* @param urlHash
|
* @param urlHash
|
||||||
* @return the time in milliseconds since epoch for the load time or -1 if the document does not exist
|
* @return the time in milliseconds since epoch for the load time or -1 if the document does not exist
|
||||||
*/
|
*/
|
||||||
public long getLoadTime(final String urlhash) {
|
public long getLoadTime(final String urlhash) throws IOException {
|
||||||
return this.fulltext.getLoadTime(urlhash);
|
return this.fulltext.getLoadTime(urlhash);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -683,10 +683,10 @@ public class Segment {
|
||||||
|
|
||||||
if (urlhash == null) return 0;
|
if (urlhash == null) return 0;
|
||||||
// determine the url string
|
// determine the url string
|
||||||
final DigestURL url = fulltext().getURL(ASCII.String(urlhash));
|
|
||||||
if (url == null) return 0;
|
|
||||||
|
|
||||||
try {
|
try {
|
||||||
|
final DigestURL url = fulltext().getURL(ASCII.String(urlhash));
|
||||||
|
if (url == null) return 0;
|
||||||
|
|
||||||
// parse the resource
|
// parse the resource
|
||||||
final Document document = Document.mergeDocuments(url, null, loader.loadDocuments(loader.request(url, true, false), cacheStrategy, Integer.MAX_VALUE, null, agent));
|
final Document document = Document.mergeDocuments(url, null, loader.loadDocuments(loader.request(url, true, false), cacheStrategy, Integer.MAX_VALUE, null, agent));
|
||||||
if (document == null) {
|
if (document == null) {
|
||||||
|
|
Loading…
Reference in New Issue
Block a user