mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
proper deletion of loadtime index
This commit is contained in:
parent
bd3f2483a1
commit
9c38b1254e
|
@ -416,6 +416,7 @@ public class Crawler_p {
|
||||||
if ("smb.ftp".indexOf(u.getProtocol()) >= 0 || "pdf".equals(MultiProtocolURL.getFileExtension(u.getFileName()))) anysmbftporpdf = true;
|
if ("smb.ftp".indexOf(u.getProtocol()) >= 0 || "pdf".equals(MultiProtocolURL.getFileExtension(u.getFileName()))) anysmbftporpdf = true;
|
||||||
}
|
}
|
||||||
sb.index.fulltext().remove(deleteIDs);
|
sb.index.fulltext().remove(deleteIDs);
|
||||||
|
deleteIDs.forEach(urlhash -> {try {sb.index.loadTimeIndex().remove(urlhash.getBytes());} catch (IOException e) {}});
|
||||||
sb.crawlQueues.removeHosts(hosthashes);
|
sb.crawlQueues.removeHosts(hosthashes);
|
||||||
sb.index.fulltext().commit(true);
|
sb.index.fulltext().commit(true);
|
||||||
|
|
||||||
|
@ -437,6 +438,7 @@ public class Crawler_p {
|
||||||
String basepath = u.toNormalform(true);
|
String basepath = u.toNormalform(true);
|
||||||
if (!basepath.endsWith("/")) {final int p = basepath.lastIndexOf("/"); if (p > 0) basepath = basepath.substring(0, p + 1);}
|
if (!basepath.endsWith("/")) {final int p = basepath.lastIndexOf("/"); if (p > 0) basepath = basepath.substring(0, p + 1);}
|
||||||
final int count = sb.index.fulltext().remove(basepath, deleteageDate);
|
final int count = sb.index.fulltext().remove(basepath, deleteageDate);
|
||||||
|
try {sb.index.loadTimeIndex().clear();} catch (IOException e) {}
|
||||||
if (count > 0) ConcurrentLog.info("Crawler_p", "deleted " + count + " documents for host " + u.getHost());
|
if (count > 0) ConcurrentLog.info("Crawler_p", "deleted " + count + " documents for host " + u.getHost());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -143,7 +143,6 @@ public class IndexControlURLs_p {
|
||||||
if ( post.get("deleteFirstSeen", "").equals("on")) {
|
if ( post.get("deleteFirstSeen", "").equals("on")) {
|
||||||
try {
|
try {
|
||||||
segment.firstSeenIndex().clear();
|
segment.firstSeenIndex().clear();
|
||||||
segment.loadTimeIndex().clear();
|
|
||||||
} catch (final IOException e) {}
|
} catch (final IOException e) {}
|
||||||
}
|
}
|
||||||
if ( post.get("deleteCrawlQueues", "").equals("on") ) {
|
if ( post.get("deleteCrawlQueues", "").equals("on") ) {
|
||||||
|
@ -166,6 +165,7 @@ public class IndexControlURLs_p {
|
||||||
|
|
||||||
ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
|
ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
|
||||||
int i = segment.removeAllUrlReferences(urlhash.getBytes(), sb.loader, agent, CacheStrategy.IFEXIST);
|
int i = segment.removeAllUrlReferences(urlhash.getBytes(), sb.loader, agent, CacheStrategy.IFEXIST);
|
||||||
|
try {segment.loadTimeIndex().remove(urlhash.getBytes());} catch (IOException e) {}
|
||||||
prop.put("result", "Deleted URL and " + i + " references from " + i + " word indexes.");
|
prop.put("result", "Deleted URL and " + i + " references from " + i + " word indexes.");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -183,6 +183,7 @@ public class IndexControlURLs_p {
|
||||||
sb.urlRemove(segment, urlhash.getBytes());
|
sb.urlRemove(segment, urlhash.getBytes());
|
||||||
prop.putHTML("result", "Removed URL " + url);
|
prop.putHTML("result", "Removed URL " + url);
|
||||||
}
|
}
|
||||||
|
segment.loadTimeIndex().remove(urlhash.getBytes());
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
prop.putHTML("result", "Error when querying the url hash " + urlhash + ":" + e.getMessage());
|
prop.putHTML("result", "Error when querying the url hash " + urlhash + ":" + e.getMessage());
|
||||||
}
|
}
|
||||||
|
@ -201,6 +202,7 @@ public class IndexControlURLs_p {
|
||||||
prop.put("result", "No input given; nothing deleted.");
|
prop.put("result", "No input given; nothing deleted.");
|
||||||
} else {
|
} else {
|
||||||
sb.urlRemove(segment, urlhash.getBytes());
|
sb.urlRemove(segment, urlhash.getBytes());
|
||||||
|
try {segment.loadTimeIndex().remove(urlhash.getBytes());} catch (IOException e) {}
|
||||||
prop.putHTML("result", "Removed URL " + urlstring);
|
prop.putHTML("result", "Removed URL " + urlstring);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -267,6 +269,7 @@ public class IndexControlURLs_p {
|
||||||
Set<String> hostnames = new HashSet<String>();
|
Set<String> hostnames = new HashSet<String>();
|
||||||
hostnames.add(domain);
|
hostnames.add(domain);
|
||||||
segment.fulltext().deleteStaleDomainNames(hostnames, null);
|
segment.fulltext().deleteStaleDomainNames(hostnames, null);
|
||||||
|
try {segment.loadTimeIndex().clear();} catch (IOException e) {} // clear the entire load-time index so that existing entries do not cause reloading to be rejected; an IOException from clear() is silently ignored
|
||||||
// trigger the loading of the table
|
// trigger the loading of the table
|
||||||
post.put("statistics", "");
|
post.put("statistics", "");
|
||||||
}
|
}
|
||||||
|
|
|
@ -52,7 +52,7 @@ public class IndexDeletion_p {
|
||||||
// return variable that accumulates replacements
|
// return variable that accumulates replacements
|
||||||
final Switchboard sb = (Switchboard) env;
|
final Switchboard sb = (Switchboard) env;
|
||||||
final serverObjects prop = new serverObjects();
|
final serverObjects prop = new serverObjects();
|
||||||
|
|
||||||
/* Acquire a transaction token for the next POST form submission */
|
/* Acquire a transaction token for the next POST form submission */
|
||||||
prop.put(TransactionManager.TRANSACTION_TOKEN_PARAM, TransactionManager.getTransactionToken(header));
|
prop.put(TransactionManager.TRANSACTION_TOKEN_PARAM, TransactionManager.getTransactionToken(header));
|
||||||
|
|
||||||
|
@ -61,8 +61,8 @@ public class IndexDeletion_p {
|
||||||
if (post == null || post.size() == 0) defaultConnector.commit(false); // we must do a commit here because the user cannot see a proper count.
|
if (post == null || post.size() == 0) defaultConnector.commit(false); // we must do a commit here because the user cannot see a proper count.
|
||||||
|
|
||||||
String schemaName = CollectionSchema.CORE_NAME;
|
String schemaName = CollectionSchema.CORE_NAME;
|
||||||
if (post != null) schemaName = post.get("core", schemaName);
|
if (post != null) schemaName = post.get("core", schemaName);
|
||||||
|
|
||||||
// Delete by URL Matching
|
// Delete by URL Matching
|
||||||
String urldelete = post == null ? "" : post.get("urldelete", "");
|
String urldelete = post == null ? "" : post.get("urldelete", "");
|
||||||
boolean urldelete_mm_subpath_checked = post == null ? true : post.get("urldelete-mm", "subpath").equals("subpath");
|
boolean urldelete_mm_subpath_checked = post == null ? true : post.get("urldelete-mm", "subpath").equals("subpath");
|
||||||
|
@ -70,7 +70,7 @@ public class IndexDeletion_p {
|
||||||
prop.put("urldelete-mm-subpath-checked", urldelete_mm_subpath_checked ? 1 : 0);
|
prop.put("urldelete-mm-subpath-checked", urldelete_mm_subpath_checked ? 1 : 0);
|
||||||
prop.put("urldelete-mm-regexp-checked", urldelete_mm_subpath_checked ? 0 : 1);
|
prop.put("urldelete-mm-regexp-checked", urldelete_mm_subpath_checked ? 0 : 1);
|
||||||
prop.put("urldelete-active", 0);
|
prop.put("urldelete-active", 0);
|
||||||
|
|
||||||
// Delete by Age
|
// Delete by Age
|
||||||
int timedelete_number = post == null ? 14 : post.getInt("timedelete-number", 14);
|
int timedelete_number = post == null ? 14 : post.getInt("timedelete-number", 14);
|
||||||
String timedelete_unit = post == null ? "day" : post.get("timedelete-unit", "day");
|
String timedelete_unit = post == null ? "day" : post.get("timedelete-unit", "day");
|
||||||
|
@ -84,7 +84,7 @@ public class IndexDeletion_p {
|
||||||
prop.put("timedelete-source-loaddate-checked", timedelete_source_loaddate_checked ? 1 : 0);
|
prop.put("timedelete-source-loaddate-checked", timedelete_source_loaddate_checked ? 1 : 0);
|
||||||
prop.put("timedelete-source-lastmodified-checked", timedelete_source_loaddate_checked ? 0 : 1);
|
prop.put("timedelete-source-lastmodified-checked", timedelete_source_loaddate_checked ? 0 : 1);
|
||||||
prop.put("timedelete-active", 0);
|
prop.put("timedelete-active", 0);
|
||||||
|
|
||||||
// Delete Collections
|
// Delete Collections
|
||||||
boolean collectiondelete_mode_unassigned_checked = post == null ? true : post.get("collectiondelete-mode", "unassigned").equals("unassigned");
|
boolean collectiondelete_mode_unassigned_checked = post == null ? true : post.get("collectiondelete-mode", "unassigned").equals("unassigned");
|
||||||
String collectiondelete = post == null ? "" : post.get("collectiondelete", "");
|
String collectiondelete = post == null ? "" : post.get("collectiondelete", "");
|
||||||
|
@ -112,7 +112,7 @@ public class IndexDeletion_p {
|
||||||
prop.put("collectiondelete-mode-assigned-checked", collectiondelete_mode_unassigned_checked ? 0 : 1);
|
prop.put("collectiondelete-mode-assigned-checked", collectiondelete_mode_unassigned_checked ? 0 : 1);
|
||||||
prop.putHTML("collectiondelete-select_collectiondelete", collectiondelete);
|
prop.putHTML("collectiondelete-select_collectiondelete", collectiondelete);
|
||||||
prop.put("collectiondelete-active", 0);
|
prop.put("collectiondelete-active", 0);
|
||||||
|
|
||||||
// Delete by Solr Query
|
// Delete by Solr Query
|
||||||
prop.put("querydelete", "");
|
prop.put("querydelete", "");
|
||||||
String querydelete = post == null ? "" : post.get("querydelete", "");
|
String querydelete = post == null ? "" : post.get("querydelete", "");
|
||||||
|
@ -121,16 +121,16 @@ public class IndexDeletion_p {
|
||||||
prop.putHTML("querydelete", querydelete);
|
prop.putHTML("querydelete", querydelete);
|
||||||
prop.put("querydelete-active", 0);
|
prop.put("querydelete-active", 0);
|
||||||
|
|
||||||
|
|
||||||
int count = post == null ? -1 : post.getInt("count", -1);
|
int count = post == null ? -1 : post.getInt("count", -1);
|
||||||
|
|
||||||
if (post != null && (post.containsKey("simulate-urldelete") || post.containsKey("engage-urldelete"))) {
|
if (post != null && (post.containsKey("simulate-urldelete") || post.containsKey("engage-urldelete"))) {
|
||||||
/* Check the transaction is valid */
|
/* Check the transaction is valid */
|
||||||
TransactionManager.checkPostTransaction(header, post);
|
TransactionManager.checkPostTransaction(header, post);
|
||||||
|
|
||||||
boolean simulate = post.containsKey("simulate-urldelete");
|
boolean simulate = post.containsKey("simulate-urldelete");
|
||||||
// parse the input
|
// parse the input
|
||||||
urldelete = urldelete.trim();
|
urldelete = urldelete.trim();
|
||||||
if (urldelete_mm_subpath_checked) {
|
if (urldelete_mm_subpath_checked) {
|
||||||
// collect using url stubs
|
// collect using url stubs
|
||||||
Set<String> ids = new HashSet<String>();
|
Set<String> ids = new HashSet<String>();
|
||||||
|
@ -154,13 +154,14 @@ public class IndexDeletion_p {
|
||||||
}
|
}
|
||||||
} catch (final MalformedURLException e) {}
|
} catch (final MalformedURLException e) {}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (simulate) {
|
if (simulate) {
|
||||||
count = ids.size();
|
count = ids.size();
|
||||||
prop.put("urldelete-active", count == 0 ? 2 : 1);
|
prop.put("urldelete-active", count == 0 ? 2 : 1);
|
||||||
} else {
|
} else {
|
||||||
sb.remove(ids);
|
sb.remove(ids);
|
||||||
defaultConnector.commit(false);
|
defaultConnector.commit(false);
|
||||||
|
ids.forEach(urlhash -> {try {sb.index.loadTimeIndex().remove(urlhash.getBytes());} catch (IOException e) {}});
|
||||||
sb.tables.recordAPICall(post, "IndexDeletion_p.html", WorkTables.TABLE_API_TYPE_DELETION, "deletion, docs matching with " + urldelete);
|
sb.tables.recordAPICall(post, "IndexDeletion_p.html", WorkTables.TABLE_API_TYPE_DELETION, "deletion, docs matching with " + urldelete);
|
||||||
prop.put("urldelete-active", 2);
|
prop.put("urldelete-active", 2);
|
||||||
}
|
}
|
||||||
|
@ -177,6 +178,7 @@ public class IndexDeletion_p {
|
||||||
try {
|
try {
|
||||||
defaultConnector.deleteByQuery(regexquery);
|
defaultConnector.deleteByQuery(regexquery);
|
||||||
defaultConnector.commit(false);
|
defaultConnector.commit(false);
|
||||||
|
try {sb.index.loadTimeIndex().clear();} catch (IOException e) {}
|
||||||
sb.tables.recordAPICall(post, "IndexDeletion_p.html", WorkTables.TABLE_API_TYPE_DELETION, "deletion, regex match = " + urldelete);
|
sb.tables.recordAPICall(post, "IndexDeletion_p.html", WorkTables.TABLE_API_TYPE_DELETION, "deletion, regex match = " + urldelete);
|
||||||
} catch (final IOException e) {
|
} catch (final IOException e) {
|
||||||
}
|
}
|
||||||
|
@ -187,9 +189,9 @@ public class IndexDeletion_p {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (post != null && (post.containsKey("simulate-timedelete") || post.containsKey("engage-timedelete"))) {
|
if (post != null && (post.containsKey("simulate-timedelete") || post.containsKey("engage-timedelete"))) {
|
||||||
/* Check the transaction is valid */
|
/* Check the transaction is valid */
|
||||||
TransactionManager.checkPostTransaction(header, post);
|
TransactionManager.checkPostTransaction(header, post);
|
||||||
|
|
||||||
boolean simulate = post.containsKey("simulate-timedelete");
|
boolean simulate = post.containsKey("simulate-timedelete");
|
||||||
Date deleteageDate = null;
|
Date deleteageDate = null;
|
||||||
long t = timeParser(timedelete_number, timedelete_unit); // year, month, day, hour
|
long t = timeParser(timedelete_number, timedelete_unit); // year, month, day, hour
|
||||||
|
@ -206,6 +208,7 @@ public class IndexDeletion_p {
|
||||||
try {
|
try {
|
||||||
defaultConnector.deleteByQuery(collection1Query);
|
defaultConnector.deleteByQuery(collection1Query);
|
||||||
defaultConnector.commit(false);
|
defaultConnector.commit(false);
|
||||||
|
try {sb.index.loadTimeIndex().clear();} catch (IOException e) {}
|
||||||
if (webgraphConnector != null) webgraphConnector.deleteByQuery(webgraphQuery);
|
if (webgraphConnector != null) webgraphConnector.deleteByQuery(webgraphQuery);
|
||||||
sb.tables.recordAPICall(post, "IndexDeletion_p.html", WorkTables.TABLE_API_TYPE_DELETION, "deletion, docs older than " + timedelete_number + " " + timedelete_unit);
|
sb.tables.recordAPICall(post, "IndexDeletion_p.html", WorkTables.TABLE_API_TYPE_DELETION, "deletion, docs older than " + timedelete_number + " " + timedelete_unit);
|
||||||
} catch (final IOException e) {
|
} catch (final IOException e) {
|
||||||
|
@ -214,11 +217,11 @@ public class IndexDeletion_p {
|
||||||
}
|
}
|
||||||
prop.put("timedelete-active_count", count);
|
prop.put("timedelete-active_count", count);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (post != null && (post.containsKey("simulate-collectiondelete") || post.containsKey("engage-collectiondelete"))) {
|
if (post != null && (post.containsKey("simulate-collectiondelete") || post.containsKey("engage-collectiondelete"))) {
|
||||||
/* Check the transaction is valid */
|
/* Check the transaction is valid */
|
||||||
TransactionManager.checkPostTransaction(header, post);
|
TransactionManager.checkPostTransaction(header, post);
|
||||||
|
|
||||||
boolean simulate = post.containsKey("simulate-collectiondelete");
|
boolean simulate = post.containsKey("simulate-collectiondelete");
|
||||||
collectiondelete = collectiondelete.replaceAll(" ","").replaceAll(",", "|");
|
collectiondelete = collectiondelete.replaceAll(" ","").replaceAll(",", "|");
|
||||||
String query = collectiondelete_mode_unassigned_checked ? "-" + CollectionSchema.collection_sxt + AbstractSolrConnector.CATCHALL_DTERM : collectiondelete.length() == 0 ? CollectionSchema.collection_sxt + ":\"\"" : QueryModifier.parseCollectionExpression(collectiondelete);
|
String query = collectiondelete_mode_unassigned_checked ? "-" + CollectionSchema.collection_sxt + AbstractSolrConnector.CATCHALL_DTERM : collectiondelete.length() == 0 ? CollectionSchema.collection_sxt + ":\"\"" : QueryModifier.parseCollectionExpression(collectiondelete);
|
||||||
|
@ -232,6 +235,7 @@ public class IndexDeletion_p {
|
||||||
try {
|
try {
|
||||||
defaultConnector.deleteByQuery(query);
|
defaultConnector.deleteByQuery(query);
|
||||||
defaultConnector.commit(false);
|
defaultConnector.commit(false);
|
||||||
|
try {sb.index.loadTimeIndex().clear();} catch (IOException e) {}
|
||||||
sb.tables.recordAPICall(post, "IndexDeletion_p.html", WorkTables.TABLE_API_TYPE_DELETION, "deletion, collection " + collectiondelete);
|
sb.tables.recordAPICall(post, "IndexDeletion_p.html", WorkTables.TABLE_API_TYPE_DELETION, "deletion, collection " + collectiondelete);
|
||||||
} catch (final IOException e) {
|
} catch (final IOException e) {
|
||||||
}
|
}
|
||||||
|
@ -239,11 +243,11 @@ public class IndexDeletion_p {
|
||||||
}
|
}
|
||||||
prop.put("collectiondelete-active_count", count);
|
prop.put("collectiondelete-active_count", count);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (post != null && (post.containsKey("simulate-querydelete") || post.containsKey("engage-querydelete"))) {
|
if (post != null && (post.containsKey("simulate-querydelete") || post.containsKey("engage-querydelete"))) {
|
||||||
/* Check the transaction is valid */
|
/* Check the transaction is valid */
|
||||||
TransactionManager.checkPostTransaction(header, post);
|
TransactionManager.checkPostTransaction(header, post);
|
||||||
|
|
||||||
boolean simulate = post.containsKey("simulate-querydelete");
|
boolean simulate = post.containsKey("simulate-querydelete");
|
||||||
|
|
||||||
SolrConnector connector = schemaName.equals(CollectionSchema.CORE_NAME) ? defaultConnector : sb.index.fulltext().getWebgraphConnector();
|
SolrConnector connector = schemaName.equals(CollectionSchema.CORE_NAME) ? defaultConnector : sb.index.fulltext().getWebgraphConnector();
|
||||||
|
@ -258,6 +262,7 @@ public class IndexDeletion_p {
|
||||||
ConcurrentLog.info("IndexDeletion", "delete by query \"" + querydelete + "\", size before deletion = " + connector.getSize());
|
ConcurrentLog.info("IndexDeletion", "delete by query \"" + querydelete + "\", size before deletion = " + connector.getSize());
|
||||||
connector.deleteByQuery(querydelete);
|
connector.deleteByQuery(querydelete);
|
||||||
connector.commit(false);
|
connector.commit(false);
|
||||||
|
try {sb.index.loadTimeIndex().clear();} catch (IOException e) {}
|
||||||
ConcurrentLog.info("IndexDeletion", "delete by query \"" + querydelete + "\", size after commit = " + connector.getSize());
|
ConcurrentLog.info("IndexDeletion", "delete by query \"" + querydelete + "\", size after commit = " + connector.getSize());
|
||||||
sb.tables.recordAPICall(post, "IndexDeletion_p.html", WorkTables.TABLE_API_TYPE_DELETION, "deletion, solr query, q = " + querydelete);
|
sb.tables.recordAPICall(post, "IndexDeletion_p.html", WorkTables.TABLE_API_TYPE_DELETION, "deletion, solr query, q = " + querydelete);
|
||||||
} catch (final IOException e) {
|
} catch (final IOException e) {
|
||||||
|
@ -267,14 +272,14 @@ public class IndexDeletion_p {
|
||||||
prop.put("querydelete-active_count", count);
|
prop.put("querydelete-active_count", count);
|
||||||
}
|
}
|
||||||
prop.put("doccount", defaultConnector.getSize());
|
prop.put("doccount", defaultConnector.getSize());
|
||||||
|
|
||||||
|
|
||||||
prop.put("cores_" + 0 + "_name", CollectionSchema.CORE_NAME);
|
prop.put("cores_" + 0 + "_name", CollectionSchema.CORE_NAME);
|
||||||
prop.put("cores_" + 0 + "_selected", CollectionSchema.CORE_NAME.equals(schemaName) ? 1 : 0);
|
prop.put("cores_" + 0 + "_selected", CollectionSchema.CORE_NAME.equals(schemaName) ? 1 : 0);
|
||||||
prop.put("cores_" + 1 + "_name", WebgraphSchema.CORE_NAME);
|
prop.put("cores_" + 1 + "_name", WebgraphSchema.CORE_NAME);
|
||||||
prop.put("cores_" + 1 + "_selected", WebgraphSchema.CORE_NAME.equals(schemaName) ? 1 : 0);
|
prop.put("cores_" + 1 + "_selected", WebgraphSchema.CORE_NAME.equals(schemaName) ? 1 : 0);
|
||||||
prop.put("cores", 2);
|
prop.put("cores", 2);
|
||||||
|
|
||||||
// return rewrite properties
|
// return rewrite properties
|
||||||
return prop;
|
return prop;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue
Block a user