proper deletion of loadtime index

This commit is contained in:
Michael Peter Christen 2021-12-22 01:22:46 +01:00
parent bd3f2483a1
commit 9c38b1254e
3 changed files with 36 additions and 26 deletions

View File

@ -416,6 +416,7 @@ public class Crawler_p {
if ("smb.ftp".indexOf(u.getProtocol()) >= 0 || "pdf".equals(MultiProtocolURL.getFileExtension(u.getFileName()))) anysmbftporpdf = true; if ("smb.ftp".indexOf(u.getProtocol()) >= 0 || "pdf".equals(MultiProtocolURL.getFileExtension(u.getFileName()))) anysmbftporpdf = true;
} }
sb.index.fulltext().remove(deleteIDs); sb.index.fulltext().remove(deleteIDs);
deleteIDs.forEach(urlhash -> {try {sb.index.loadTimeIndex().remove(urlhash.getBytes());} catch (IOException e) {}});
sb.crawlQueues.removeHosts(hosthashes); sb.crawlQueues.removeHosts(hosthashes);
sb.index.fulltext().commit(true); sb.index.fulltext().commit(true);
@ -437,6 +438,7 @@ public class Crawler_p {
String basepath = u.toNormalform(true); String basepath = u.toNormalform(true);
if (!basepath.endsWith("/")) {final int p = basepath.lastIndexOf("/"); if (p > 0) basepath = basepath.substring(0, p + 1);} if (!basepath.endsWith("/")) {final int p = basepath.lastIndexOf("/"); if (p > 0) basepath = basepath.substring(0, p + 1);}
final int count = sb.index.fulltext().remove(basepath, deleteageDate); final int count = sb.index.fulltext().remove(basepath, deleteageDate);
try {sb.index.loadTimeIndex().clear();} catch (IOException e) {}
if (count > 0) ConcurrentLog.info("Crawler_p", "deleted " + count + " documents for host " + u.getHost()); if (count > 0) ConcurrentLog.info("Crawler_p", "deleted " + count + " documents for host " + u.getHost());
} }
} }

View File

@ -143,7 +143,6 @@ public class IndexControlURLs_p {
if ( post.get("deleteFirstSeen", "").equals("on")) { if ( post.get("deleteFirstSeen", "").equals("on")) {
try { try {
segment.firstSeenIndex().clear(); segment.firstSeenIndex().clear();
segment.loadTimeIndex().clear();
} catch (final IOException e) {} } catch (final IOException e) {}
} }
if ( post.get("deleteCrawlQueues", "").equals("on") ) { if ( post.get("deleteCrawlQueues", "").equals("on") ) {
@ -166,6 +165,7 @@ public class IndexControlURLs_p {
ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName)); ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
int i = segment.removeAllUrlReferences(urlhash.getBytes(), sb.loader, agent, CacheStrategy.IFEXIST); int i = segment.removeAllUrlReferences(urlhash.getBytes(), sb.loader, agent, CacheStrategy.IFEXIST);
try {segment.loadTimeIndex().remove(urlhash.getBytes());} catch (IOException e) {}
prop.put("result", "Deleted URL and " + i + " references from " + i + " word indexes."); prop.put("result", "Deleted URL and " + i + " references from " + i + " word indexes.");
} }
@ -183,6 +183,7 @@ public class IndexControlURLs_p {
sb.urlRemove(segment, urlhash.getBytes()); sb.urlRemove(segment, urlhash.getBytes());
prop.putHTML("result", "Removed URL " + url); prop.putHTML("result", "Removed URL " + url);
} }
segment.loadTimeIndex().remove(urlhash.getBytes());
} catch (IOException e) { } catch (IOException e) {
prop.putHTML("result", "Error when querying the url hash " + urlhash + ":" + e.getMessage()); prop.putHTML("result", "Error when querying the url hash " + urlhash + ":" + e.getMessage());
} }
@ -201,6 +202,7 @@ public class IndexControlURLs_p {
prop.put("result", "No input given; nothing deleted."); prop.put("result", "No input given; nothing deleted.");
} else { } else {
sb.urlRemove(segment, urlhash.getBytes()); sb.urlRemove(segment, urlhash.getBytes());
try {segment.loadTimeIndex().remove(urlhash.getBytes());} catch (IOException e) {}
prop.putHTML("result", "Removed URL " + urlstring); prop.putHTML("result", "Removed URL " + urlstring);
} }
} }
@ -267,6 +269,7 @@ public class IndexControlURLs_p {
Set<String> hostnames = new HashSet<String>(); Set<String> hostnames = new HashSet<String>();
hostnames.add(domain); hostnames.add(domain);
segment.fulltext().deleteStaleDomainNames(hostnames, null); segment.fulltext().deleteStaleDomainNames(hostnames, null);
try {segment.loadTimeIndex().clear();} catch (IOException e) {} // delete all to prevent that existing entries reject reloading
// trigger the loading of the table // trigger the loading of the table
post.put("statistics", ""); post.put("statistics", "");
} }

View File

@ -52,7 +52,7 @@ public class IndexDeletion_p {
// return variable that accumulates replacements // return variable that accumulates replacements
final Switchboard sb = (Switchboard) env; final Switchboard sb = (Switchboard) env;
final serverObjects prop = new serverObjects(); final serverObjects prop = new serverObjects();
/* Acquire a transaction token for the next POST form submission */ /* Acquire a transaction token for the next POST form submission */
prop.put(TransactionManager.TRANSACTION_TOKEN_PARAM, TransactionManager.getTransactionToken(header)); prop.put(TransactionManager.TRANSACTION_TOKEN_PARAM, TransactionManager.getTransactionToken(header));
@ -61,8 +61,8 @@ public class IndexDeletion_p {
if (post == null || post.size() == 0) defaultConnector.commit(false); // we must do a commit here because the user cannot see a proper count. if (post == null || post.size() == 0) defaultConnector.commit(false); // we must do a commit here because the user cannot see a proper count.
String schemaName = CollectionSchema.CORE_NAME; String schemaName = CollectionSchema.CORE_NAME;
if (post != null) schemaName = post.get("core", schemaName); if (post != null) schemaName = post.get("core", schemaName);
// Delete by URL Matching // Delete by URL Matching
String urldelete = post == null ? "" : post.get("urldelete", ""); String urldelete = post == null ? "" : post.get("urldelete", "");
boolean urldelete_mm_subpath_checked = post == null ? true : post.get("urldelete-mm", "subpath").equals("subpath"); boolean urldelete_mm_subpath_checked = post == null ? true : post.get("urldelete-mm", "subpath").equals("subpath");
@ -70,7 +70,7 @@ public class IndexDeletion_p {
prop.put("urldelete-mm-subpath-checked", urldelete_mm_subpath_checked ? 1 : 0); prop.put("urldelete-mm-subpath-checked", urldelete_mm_subpath_checked ? 1 : 0);
prop.put("urldelete-mm-regexp-checked", urldelete_mm_subpath_checked ? 0 : 1); prop.put("urldelete-mm-regexp-checked", urldelete_mm_subpath_checked ? 0 : 1);
prop.put("urldelete-active", 0); prop.put("urldelete-active", 0);
// Delete by Age // Delete by Age
int timedelete_number = post == null ? 14 : post.getInt("timedelete-number", 14); int timedelete_number = post == null ? 14 : post.getInt("timedelete-number", 14);
String timedelete_unit = post == null ? "day" : post.get("timedelete-unit", "day"); String timedelete_unit = post == null ? "day" : post.get("timedelete-unit", "day");
@ -84,7 +84,7 @@ public class IndexDeletion_p {
prop.put("timedelete-source-loaddate-checked", timedelete_source_loaddate_checked ? 1 : 0); prop.put("timedelete-source-loaddate-checked", timedelete_source_loaddate_checked ? 1 : 0);
prop.put("timedelete-source-lastmodified-checked", timedelete_source_loaddate_checked ? 0 : 1); prop.put("timedelete-source-lastmodified-checked", timedelete_source_loaddate_checked ? 0 : 1);
prop.put("timedelete-active", 0); prop.put("timedelete-active", 0);
// Delete Collections // Delete Collections
boolean collectiondelete_mode_unassigned_checked = post == null ? true : post.get("collectiondelete-mode", "unassigned").equals("unassigned"); boolean collectiondelete_mode_unassigned_checked = post == null ? true : post.get("collectiondelete-mode", "unassigned").equals("unassigned");
String collectiondelete = post == null ? "" : post.get("collectiondelete", ""); String collectiondelete = post == null ? "" : post.get("collectiondelete", "");
@ -112,7 +112,7 @@ public class IndexDeletion_p {
prop.put("collectiondelete-mode-assigned-checked", collectiondelete_mode_unassigned_checked ? 0 : 1); prop.put("collectiondelete-mode-assigned-checked", collectiondelete_mode_unassigned_checked ? 0 : 1);
prop.putHTML("collectiondelete-select_collectiondelete", collectiondelete); prop.putHTML("collectiondelete-select_collectiondelete", collectiondelete);
prop.put("collectiondelete-active", 0); prop.put("collectiondelete-active", 0);
// Delete by Solr Query // Delete by Solr Query
prop.put("querydelete", ""); prop.put("querydelete", "");
String querydelete = post == null ? "" : post.get("querydelete", ""); String querydelete = post == null ? "" : post.get("querydelete", "");
@ -121,16 +121,16 @@ public class IndexDeletion_p {
prop.putHTML("querydelete", querydelete); prop.putHTML("querydelete", querydelete);
prop.put("querydelete-active", 0); prop.put("querydelete-active", 0);
int count = post == null ? -1 : post.getInt("count", -1); int count = post == null ? -1 : post.getInt("count", -1);
if (post != null && (post.containsKey("simulate-urldelete") || post.containsKey("engage-urldelete"))) { if (post != null && (post.containsKey("simulate-urldelete") || post.containsKey("engage-urldelete"))) {
/* Check the transaction is valid */ /* Check the transaction is valid */
TransactionManager.checkPostTransaction(header, post); TransactionManager.checkPostTransaction(header, post);
boolean simulate = post.containsKey("simulate-urldelete"); boolean simulate = post.containsKey("simulate-urldelete");
// parse the input // parse the input
urldelete = urldelete.trim(); urldelete = urldelete.trim();
if (urldelete_mm_subpath_checked) { if (urldelete_mm_subpath_checked) {
// collect using url stubs // collect using url stubs
Set<String> ids = new HashSet<String>(); Set<String> ids = new HashSet<String>();
@ -154,13 +154,14 @@ public class IndexDeletion_p {
} }
} catch (final MalformedURLException e) {} } catch (final MalformedURLException e) {}
} }
if (simulate) { if (simulate) {
count = ids.size(); count = ids.size();
prop.put("urldelete-active", count == 0 ? 2 : 1); prop.put("urldelete-active", count == 0 ? 2 : 1);
} else { } else {
sb.remove(ids); sb.remove(ids);
defaultConnector.commit(false); defaultConnector.commit(false);
ids.forEach(urlhash -> {try {sb.index.loadTimeIndex().remove(urlhash.getBytes());} catch (IOException e) {}});
sb.tables.recordAPICall(post, "IndexDeletion_p.html", WorkTables.TABLE_API_TYPE_DELETION, "deletion, docs matching with " + urldelete); sb.tables.recordAPICall(post, "IndexDeletion_p.html", WorkTables.TABLE_API_TYPE_DELETION, "deletion, docs matching with " + urldelete);
prop.put("urldelete-active", 2); prop.put("urldelete-active", 2);
} }
@ -177,6 +178,7 @@ public class IndexDeletion_p {
try { try {
defaultConnector.deleteByQuery(regexquery); defaultConnector.deleteByQuery(regexquery);
defaultConnector.commit(false); defaultConnector.commit(false);
try {sb.index.loadTimeIndex().clear();} catch (IOException e) {}
sb.tables.recordAPICall(post, "IndexDeletion_p.html", WorkTables.TABLE_API_TYPE_DELETION, "deletion, regex match = " + urldelete); sb.tables.recordAPICall(post, "IndexDeletion_p.html", WorkTables.TABLE_API_TYPE_DELETION, "deletion, regex match = " + urldelete);
} catch (final IOException e) { } catch (final IOException e) {
} }
@ -187,9 +189,9 @@ public class IndexDeletion_p {
} }
if (post != null && (post.containsKey("simulate-timedelete") || post.containsKey("engage-timedelete"))) { if (post != null && (post.containsKey("simulate-timedelete") || post.containsKey("engage-timedelete"))) {
/* Check the transaction is valid */ /* Check the transaction is valid */
TransactionManager.checkPostTransaction(header, post); TransactionManager.checkPostTransaction(header, post);
boolean simulate = post.containsKey("simulate-timedelete"); boolean simulate = post.containsKey("simulate-timedelete");
Date deleteageDate = null; Date deleteageDate = null;
long t = timeParser(timedelete_number, timedelete_unit); // year, month, day, hour long t = timeParser(timedelete_number, timedelete_unit); // year, month, day, hour
@ -206,6 +208,7 @@ public class IndexDeletion_p {
try { try {
defaultConnector.deleteByQuery(collection1Query); defaultConnector.deleteByQuery(collection1Query);
defaultConnector.commit(false); defaultConnector.commit(false);
try {sb.index.loadTimeIndex().clear();} catch (IOException e) {}
if (webgraphConnector != null) webgraphConnector.deleteByQuery(webgraphQuery); if (webgraphConnector != null) webgraphConnector.deleteByQuery(webgraphQuery);
sb.tables.recordAPICall(post, "IndexDeletion_p.html", WorkTables.TABLE_API_TYPE_DELETION, "deletion, docs older than " + timedelete_number + " " + timedelete_unit); sb.tables.recordAPICall(post, "IndexDeletion_p.html", WorkTables.TABLE_API_TYPE_DELETION, "deletion, docs older than " + timedelete_number + " " + timedelete_unit);
} catch (final IOException e) { } catch (final IOException e) {
@ -214,11 +217,11 @@ public class IndexDeletion_p {
} }
prop.put("timedelete-active_count", count); prop.put("timedelete-active_count", count);
} }
if (post != null && (post.containsKey("simulate-collectiondelete") || post.containsKey("engage-collectiondelete"))) { if (post != null && (post.containsKey("simulate-collectiondelete") || post.containsKey("engage-collectiondelete"))) {
/* Check the transaction is valid */ /* Check the transaction is valid */
TransactionManager.checkPostTransaction(header, post); TransactionManager.checkPostTransaction(header, post);
boolean simulate = post.containsKey("simulate-collectiondelete"); boolean simulate = post.containsKey("simulate-collectiondelete");
collectiondelete = collectiondelete.replaceAll(" ","").replaceAll(",", "|"); collectiondelete = collectiondelete.replaceAll(" ","").replaceAll(",", "|");
String query = collectiondelete_mode_unassigned_checked ? "-" + CollectionSchema.collection_sxt + AbstractSolrConnector.CATCHALL_DTERM : collectiondelete.length() == 0 ? CollectionSchema.collection_sxt + ":\"\"" : QueryModifier.parseCollectionExpression(collectiondelete); String query = collectiondelete_mode_unassigned_checked ? "-" + CollectionSchema.collection_sxt + AbstractSolrConnector.CATCHALL_DTERM : collectiondelete.length() == 0 ? CollectionSchema.collection_sxt + ":\"\"" : QueryModifier.parseCollectionExpression(collectiondelete);
@ -232,6 +235,7 @@ public class IndexDeletion_p {
try { try {
defaultConnector.deleteByQuery(query); defaultConnector.deleteByQuery(query);
defaultConnector.commit(false); defaultConnector.commit(false);
try {sb.index.loadTimeIndex().clear();} catch (IOException e) {}
sb.tables.recordAPICall(post, "IndexDeletion_p.html", WorkTables.TABLE_API_TYPE_DELETION, "deletion, collection " + collectiondelete); sb.tables.recordAPICall(post, "IndexDeletion_p.html", WorkTables.TABLE_API_TYPE_DELETION, "deletion, collection " + collectiondelete);
} catch (final IOException e) { } catch (final IOException e) {
} }
@ -239,11 +243,11 @@ public class IndexDeletion_p {
} }
prop.put("collectiondelete-active_count", count); prop.put("collectiondelete-active_count", count);
} }
if (post != null && (post.containsKey("simulate-querydelete") || post.containsKey("engage-querydelete"))) { if (post != null && (post.containsKey("simulate-querydelete") || post.containsKey("engage-querydelete"))) {
/* Check the transaction is valid */ /* Check the transaction is valid */
TransactionManager.checkPostTransaction(header, post); TransactionManager.checkPostTransaction(header, post);
boolean simulate = post.containsKey("simulate-querydelete"); boolean simulate = post.containsKey("simulate-querydelete");
SolrConnector connector = schemaName.equals(CollectionSchema.CORE_NAME) ? defaultConnector : sb.index.fulltext().getWebgraphConnector(); SolrConnector connector = schemaName.equals(CollectionSchema.CORE_NAME) ? defaultConnector : sb.index.fulltext().getWebgraphConnector();
@ -258,6 +262,7 @@ public class IndexDeletion_p {
ConcurrentLog.info("IndexDeletion", "delete by query \"" + querydelete + "\", size before deletion = " + connector.getSize()); ConcurrentLog.info("IndexDeletion", "delete by query \"" + querydelete + "\", size before deletion = " + connector.getSize());
connector.deleteByQuery(querydelete); connector.deleteByQuery(querydelete);
connector.commit(false); connector.commit(false);
try {sb.index.loadTimeIndex().clear();} catch (IOException e) {}
ConcurrentLog.info("IndexDeletion", "delete by query \"" + querydelete + "\", size after commit = " + connector.getSize()); ConcurrentLog.info("IndexDeletion", "delete by query \"" + querydelete + "\", size after commit = " + connector.getSize());
sb.tables.recordAPICall(post, "IndexDeletion_p.html", WorkTables.TABLE_API_TYPE_DELETION, "deletion, solr query, q = " + querydelete); sb.tables.recordAPICall(post, "IndexDeletion_p.html", WorkTables.TABLE_API_TYPE_DELETION, "deletion, solr query, q = " + querydelete);
} catch (final IOException e) { } catch (final IOException e) {
@ -267,14 +272,14 @@ public class IndexDeletion_p {
prop.put("querydelete-active_count", count); prop.put("querydelete-active_count", count);
} }
prop.put("doccount", defaultConnector.getSize()); prop.put("doccount", defaultConnector.getSize());
prop.put("cores_" + 0 + "_name", CollectionSchema.CORE_NAME); prop.put("cores_" + 0 + "_name", CollectionSchema.CORE_NAME);
prop.put("cores_" + 0 + "_selected", CollectionSchema.CORE_NAME.equals(schemaName) ? 1 : 0); prop.put("cores_" + 0 + "_selected", CollectionSchema.CORE_NAME.equals(schemaName) ? 1 : 0);
prop.put("cores_" + 1 + "_name", WebgraphSchema.CORE_NAME); prop.put("cores_" + 1 + "_name", WebgraphSchema.CORE_NAME);
prop.put("cores_" + 1 + "_selected", WebgraphSchema.CORE_NAME.equals(schemaName) ? 1 : 0); prop.put("cores_" + 1 + "_selected", WebgraphSchema.CORE_NAME.equals(schemaName) ? 1 : 0);
prop.put("cores", 2); prop.put("cores", 2);
// return rewrite properties // return rewrite properties
return prop; return prop;
} }