proper deletion of loadtime index

This commit is contained in:
Michael Peter Christen 2021-12-22 01:22:46 +01:00
parent bd3f2483a1
commit 9c38b1254e
3 changed files with 36 additions and 26 deletions

View File

@ -416,6 +416,7 @@ public class Crawler_p {
if ("smb.ftp".indexOf(u.getProtocol()) >= 0 || "pdf".equals(MultiProtocolURL.getFileExtension(u.getFileName()))) anysmbftporpdf = true; if ("smb.ftp".indexOf(u.getProtocol()) >= 0 || "pdf".equals(MultiProtocolURL.getFileExtension(u.getFileName()))) anysmbftporpdf = true;
} }
sb.index.fulltext().remove(deleteIDs); sb.index.fulltext().remove(deleteIDs);
deleteIDs.forEach(urlhash -> {try {sb.index.loadTimeIndex().remove(urlhash.getBytes());} catch (IOException e) {}});
sb.crawlQueues.removeHosts(hosthashes); sb.crawlQueues.removeHosts(hosthashes);
sb.index.fulltext().commit(true); sb.index.fulltext().commit(true);
@ -437,6 +438,7 @@ public class Crawler_p {
String basepath = u.toNormalform(true); String basepath = u.toNormalform(true);
if (!basepath.endsWith("/")) {final int p = basepath.lastIndexOf("/"); if (p > 0) basepath = basepath.substring(0, p + 1);} if (!basepath.endsWith("/")) {final int p = basepath.lastIndexOf("/"); if (p > 0) basepath = basepath.substring(0, p + 1);}
final int count = sb.index.fulltext().remove(basepath, deleteageDate); final int count = sb.index.fulltext().remove(basepath, deleteageDate);
try {sb.index.loadTimeIndex().clear();} catch (IOException e) {}
if (count > 0) ConcurrentLog.info("Crawler_p", "deleted " + count + " documents for host " + u.getHost()); if (count > 0) ConcurrentLog.info("Crawler_p", "deleted " + count + " documents for host " + u.getHost());
} }
} }

View File

@ -143,7 +143,6 @@ public class IndexControlURLs_p {
if ( post.get("deleteFirstSeen", "").equals("on")) { if ( post.get("deleteFirstSeen", "").equals("on")) {
try { try {
segment.firstSeenIndex().clear(); segment.firstSeenIndex().clear();
segment.loadTimeIndex().clear();
} catch (final IOException e) {} } catch (final IOException e) {}
} }
if ( post.get("deleteCrawlQueues", "").equals("on") ) { if ( post.get("deleteCrawlQueues", "").equals("on") ) {
@ -166,6 +165,7 @@ public class IndexControlURLs_p {
ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName)); ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
int i = segment.removeAllUrlReferences(urlhash.getBytes(), sb.loader, agent, CacheStrategy.IFEXIST); int i = segment.removeAllUrlReferences(urlhash.getBytes(), sb.loader, agent, CacheStrategy.IFEXIST);
try {segment.loadTimeIndex().remove(urlhash.getBytes());} catch (IOException e) {}
prop.put("result", "Deleted URL and " + i + " references from " + i + " word indexes."); prop.put("result", "Deleted URL and " + i + " references from " + i + " word indexes.");
} }
@ -183,6 +183,7 @@ public class IndexControlURLs_p {
sb.urlRemove(segment, urlhash.getBytes()); sb.urlRemove(segment, urlhash.getBytes());
prop.putHTML("result", "Removed URL " + url); prop.putHTML("result", "Removed URL " + url);
} }
segment.loadTimeIndex().remove(urlhash.getBytes());
} catch (IOException e) { } catch (IOException e) {
prop.putHTML("result", "Error when querying the url hash " + urlhash + ":" + e.getMessage()); prop.putHTML("result", "Error when querying the url hash " + urlhash + ":" + e.getMessage());
} }
@ -201,6 +202,7 @@ public class IndexControlURLs_p {
prop.put("result", "No input given; nothing deleted."); prop.put("result", "No input given; nothing deleted.");
} else { } else {
sb.urlRemove(segment, urlhash.getBytes()); sb.urlRemove(segment, urlhash.getBytes());
try {segment.loadTimeIndex().remove(urlhash.getBytes());} catch (IOException e) {}
prop.putHTML("result", "Removed URL " + urlstring); prop.putHTML("result", "Removed URL " + urlstring);
} }
} }
@ -267,6 +269,7 @@ public class IndexControlURLs_p {
Set<String> hostnames = new HashSet<String>(); Set<String> hostnames = new HashSet<String>();
hostnames.add(domain); hostnames.add(domain);
segment.fulltext().deleteStaleDomainNames(hostnames, null); segment.fulltext().deleteStaleDomainNames(hostnames, null);
try {segment.loadTimeIndex().clear();} catch (IOException e) {} // clear the whole load-time index so that leftover entries do not cause reloading to be rejected
// trigger the loading of the table // trigger the loading of the table
post.put("statistics", ""); post.put("statistics", "");
} }

View File

@ -161,6 +161,7 @@ public class IndexDeletion_p {
} else { } else {
sb.remove(ids); sb.remove(ids);
defaultConnector.commit(false); defaultConnector.commit(false);
ids.forEach(urlhash -> {try {sb.index.loadTimeIndex().remove(urlhash.getBytes());} catch (IOException e) {}});
sb.tables.recordAPICall(post, "IndexDeletion_p.html", WorkTables.TABLE_API_TYPE_DELETION, "deletion, docs matching with " + urldelete); sb.tables.recordAPICall(post, "IndexDeletion_p.html", WorkTables.TABLE_API_TYPE_DELETION, "deletion, docs matching with " + urldelete);
prop.put("urldelete-active", 2); prop.put("urldelete-active", 2);
} }
@ -177,6 +178,7 @@ public class IndexDeletion_p {
try { try {
defaultConnector.deleteByQuery(regexquery); defaultConnector.deleteByQuery(regexquery);
defaultConnector.commit(false); defaultConnector.commit(false);
try {sb.index.loadTimeIndex().clear();} catch (IOException e) {}
sb.tables.recordAPICall(post, "IndexDeletion_p.html", WorkTables.TABLE_API_TYPE_DELETION, "deletion, regex match = " + urldelete); sb.tables.recordAPICall(post, "IndexDeletion_p.html", WorkTables.TABLE_API_TYPE_DELETION, "deletion, regex match = " + urldelete);
} catch (final IOException e) { } catch (final IOException e) {
} }
@ -206,6 +208,7 @@ public class IndexDeletion_p {
try { try {
defaultConnector.deleteByQuery(collection1Query); defaultConnector.deleteByQuery(collection1Query);
defaultConnector.commit(false); defaultConnector.commit(false);
try {sb.index.loadTimeIndex().clear();} catch (IOException e) {}
if (webgraphConnector != null) webgraphConnector.deleteByQuery(webgraphQuery); if (webgraphConnector != null) webgraphConnector.deleteByQuery(webgraphQuery);
sb.tables.recordAPICall(post, "IndexDeletion_p.html", WorkTables.TABLE_API_TYPE_DELETION, "deletion, docs older than " + timedelete_number + " " + timedelete_unit); sb.tables.recordAPICall(post, "IndexDeletion_p.html", WorkTables.TABLE_API_TYPE_DELETION, "deletion, docs older than " + timedelete_number + " " + timedelete_unit);
} catch (final IOException e) { } catch (final IOException e) {
@ -232,6 +235,7 @@ public class IndexDeletion_p {
try { try {
defaultConnector.deleteByQuery(query); defaultConnector.deleteByQuery(query);
defaultConnector.commit(false); defaultConnector.commit(false);
try {sb.index.loadTimeIndex().clear();} catch (IOException e) {}
sb.tables.recordAPICall(post, "IndexDeletion_p.html", WorkTables.TABLE_API_TYPE_DELETION, "deletion, collection " + collectiondelete); sb.tables.recordAPICall(post, "IndexDeletion_p.html", WorkTables.TABLE_API_TYPE_DELETION, "deletion, collection " + collectiondelete);
} catch (final IOException e) { } catch (final IOException e) {
} }
@ -258,6 +262,7 @@ public class IndexDeletion_p {
ConcurrentLog.info("IndexDeletion", "delete by query \"" + querydelete + "\", size before deletion = " + connector.getSize()); ConcurrentLog.info("IndexDeletion", "delete by query \"" + querydelete + "\", size before deletion = " + connector.getSize());
connector.deleteByQuery(querydelete); connector.deleteByQuery(querydelete);
connector.commit(false); connector.commit(false);
try {sb.index.loadTimeIndex().clear();} catch (IOException e) {}
ConcurrentLog.info("IndexDeletion", "delete by query \"" + querydelete + "\", size after commit = " + connector.getSize()); ConcurrentLog.info("IndexDeletion", "delete by query \"" + querydelete + "\", size after commit = " + connector.getSize());
sb.tables.recordAPICall(post, "IndexDeletion_p.html", WorkTables.TABLE_API_TYPE_DELETION, "deletion, solr query, q = " + querydelete); sb.tables.recordAPICall(post, "IndexDeletion_p.html", WorkTables.TABLE_API_TYPE_DELETION, "deletion, solr query, q = " + querydelete);
} catch (final IOException e) { } catch (final IOException e) {