mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
proper deletion of loadtime index
This commit is contained in:
parent
bd3f2483a1
commit
9c38b1254e
|
@ -416,6 +416,7 @@ public class Crawler_p {
|
|||
if ("smb.ftp".indexOf(u.getProtocol()) >= 0 || "pdf".equals(MultiProtocolURL.getFileExtension(u.getFileName()))) anysmbftporpdf = true;
|
||||
}
|
||||
sb.index.fulltext().remove(deleteIDs);
|
||||
deleteIDs.forEach(urlhash -> {try {sb.index.loadTimeIndex().remove(urlhash.getBytes());} catch (IOException e) {}});
|
||||
sb.crawlQueues.removeHosts(hosthashes);
|
||||
sb.index.fulltext().commit(true);
|
||||
|
||||
|
@ -437,6 +438,7 @@ public class Crawler_p {
|
|||
String basepath = u.toNormalform(true);
|
||||
if (!basepath.endsWith("/")) {final int p = basepath.lastIndexOf("/"); if (p > 0) basepath = basepath.substring(0, p + 1);}
|
||||
final int count = sb.index.fulltext().remove(basepath, deleteageDate);
|
||||
try {sb.index.loadTimeIndex().clear();} catch (IOException e) {}
|
||||
if (count > 0) ConcurrentLog.info("Crawler_p", "deleted " + count + " documents for host " + u.getHost());
|
||||
}
|
||||
}
|
||||
|
|
|
@ -143,7 +143,6 @@ public class IndexControlURLs_p {
|
|||
if ( post.get("deleteFirstSeen", "").equals("on")) {
|
||||
try {
|
||||
segment.firstSeenIndex().clear();
|
||||
segment.loadTimeIndex().clear();
|
||||
} catch (final IOException e) {}
|
||||
}
|
||||
if ( post.get("deleteCrawlQueues", "").equals("on") ) {
|
||||
|
@ -166,6 +165,7 @@ public class IndexControlURLs_p {
|
|||
|
||||
ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
|
||||
int i = segment.removeAllUrlReferences(urlhash.getBytes(), sb.loader, agent, CacheStrategy.IFEXIST);
|
||||
try {segment.loadTimeIndex().remove(urlhash.getBytes());} catch (IOException e) {}
|
||||
prop.put("result", "Deleted URL and " + i + " references from " + i + " word indexes.");
|
||||
}
|
||||
|
||||
|
@ -183,6 +183,7 @@ public class IndexControlURLs_p {
|
|||
sb.urlRemove(segment, urlhash.getBytes());
|
||||
prop.putHTML("result", "Removed URL " + url);
|
||||
}
|
||||
segment.loadTimeIndex().remove(urlhash.getBytes());
|
||||
} catch (IOException e) {
|
||||
prop.putHTML("result", "Error when querying the url hash " + urlhash + ":" + e.getMessage());
|
||||
}
|
||||
|
@ -201,6 +202,7 @@ public class IndexControlURLs_p {
|
|||
prop.put("result", "No input given; nothing deleted.");
|
||||
} else {
|
||||
sb.urlRemove(segment, urlhash.getBytes());
|
||||
try {segment.loadTimeIndex().remove(urlhash.getBytes());} catch (IOException e) {}
|
||||
prop.putHTML("result", "Removed URL " + urlstring);
|
||||
}
|
||||
}
|
||||
|
@ -267,6 +269,7 @@ public class IndexControlURLs_p {
|
|||
Set<String> hostnames = new HashSet<String>();
|
||||
hostnames.add(domain);
|
||||
segment.fulltext().deleteStaleDomainNames(hostnames, null);
|
||||
try {segment.loadTimeIndex().clear();} catch (IOException e) {} // delete all to prevent that existing entries reject reloading
|
||||
// trigger the loading of the table
|
||||
post.put("statistics", "");
|
||||
}
|
||||
|
|
|
@ -52,7 +52,7 @@ public class IndexDeletion_p {
|
|||
// return variable that accumulates replacements
|
||||
final Switchboard sb = (Switchboard) env;
|
||||
final serverObjects prop = new serverObjects();
|
||||
|
||||
|
||||
/* Acquire a transaction token for the next POST form submission */
|
||||
prop.put(TransactionManager.TRANSACTION_TOKEN_PARAM, TransactionManager.getTransactionToken(header));
|
||||
|
||||
|
@ -61,8 +61,8 @@ public class IndexDeletion_p {
|
|||
if (post == null || post.size() == 0) defaultConnector.commit(false); // we must do a commit here because the user cannot see a proper count.
|
||||
|
||||
String schemaName = CollectionSchema.CORE_NAME;
|
||||
if (post != null) schemaName = post.get("core", schemaName);
|
||||
|
||||
if (post != null) schemaName = post.get("core", schemaName);
|
||||
|
||||
// Delete by URL Matching
|
||||
String urldelete = post == null ? "" : post.get("urldelete", "");
|
||||
boolean urldelete_mm_subpath_checked = post == null ? true : post.get("urldelete-mm", "subpath").equals("subpath");
|
||||
|
@ -70,7 +70,7 @@ public class IndexDeletion_p {
|
|||
prop.put("urldelete-mm-subpath-checked", urldelete_mm_subpath_checked ? 1 : 0);
|
||||
prop.put("urldelete-mm-regexp-checked", urldelete_mm_subpath_checked ? 0 : 1);
|
||||
prop.put("urldelete-active", 0);
|
||||
|
||||
|
||||
// Delete by Age
|
||||
int timedelete_number = post == null ? 14 : post.getInt("timedelete-number", 14);
|
||||
String timedelete_unit = post == null ? "day" : post.get("timedelete-unit", "day");
|
||||
|
@ -84,7 +84,7 @@ public class IndexDeletion_p {
|
|||
prop.put("timedelete-source-loaddate-checked", timedelete_source_loaddate_checked ? 1 : 0);
|
||||
prop.put("timedelete-source-lastmodified-checked", timedelete_source_loaddate_checked ? 0 : 1);
|
||||
prop.put("timedelete-active", 0);
|
||||
|
||||
|
||||
// Delete Collections
|
||||
boolean collectiondelete_mode_unassigned_checked = post == null ? true : post.get("collectiondelete-mode", "unassigned").equals("unassigned");
|
||||
String collectiondelete = post == null ? "" : post.get("collectiondelete", "");
|
||||
|
@ -112,7 +112,7 @@ public class IndexDeletion_p {
|
|||
prop.put("collectiondelete-mode-assigned-checked", collectiondelete_mode_unassigned_checked ? 0 : 1);
|
||||
prop.putHTML("collectiondelete-select_collectiondelete", collectiondelete);
|
||||
prop.put("collectiondelete-active", 0);
|
||||
|
||||
|
||||
// Delete by Solr Query
|
||||
prop.put("querydelete", "");
|
||||
String querydelete = post == null ? "" : post.get("querydelete", "");
|
||||
|
@ -121,16 +121,16 @@ public class IndexDeletion_p {
|
|||
prop.putHTML("querydelete", querydelete);
|
||||
prop.put("querydelete-active", 0);
|
||||
|
||||
|
||||
|
||||
int count = post == null ? -1 : post.getInt("count", -1);
|
||||
|
||||
if (post != null && (post.containsKey("simulate-urldelete") || post.containsKey("engage-urldelete"))) {
|
||||
/* Check the transaction is valid */
|
||||
TransactionManager.checkPostTransaction(header, post);
|
||||
|
||||
/* Check the transaction is valid */
|
||||
TransactionManager.checkPostTransaction(header, post);
|
||||
|
||||
boolean simulate = post.containsKey("simulate-urldelete");
|
||||
// parse the input
|
||||
urldelete = urldelete.trim();
|
||||
urldelete = urldelete.trim();
|
||||
if (urldelete_mm_subpath_checked) {
|
||||
// collect using url stubs
|
||||
Set<String> ids = new HashSet<String>();
|
||||
|
@ -154,13 +154,14 @@ public class IndexDeletion_p {
|
|||
}
|
||||
} catch (final MalformedURLException e) {}
|
||||
}
|
||||
|
||||
|
||||
if (simulate) {
|
||||
count = ids.size();
|
||||
prop.put("urldelete-active", count == 0 ? 2 : 1);
|
||||
} else {
|
||||
sb.remove(ids);
|
||||
defaultConnector.commit(false);
|
||||
ids.forEach(urlhash -> {try {sb.index.loadTimeIndex().remove(urlhash.getBytes());} catch (IOException e) {}});
|
||||
sb.tables.recordAPICall(post, "IndexDeletion_p.html", WorkTables.TABLE_API_TYPE_DELETION, "deletion, docs matching with " + urldelete);
|
||||
prop.put("urldelete-active", 2);
|
||||
}
|
||||
|
@ -177,6 +178,7 @@ public class IndexDeletion_p {
|
|||
try {
|
||||
defaultConnector.deleteByQuery(regexquery);
|
||||
defaultConnector.commit(false);
|
||||
try {sb.index.loadTimeIndex().clear();} catch (IOException e) {}
|
||||
sb.tables.recordAPICall(post, "IndexDeletion_p.html", WorkTables.TABLE_API_TYPE_DELETION, "deletion, regex match = " + urldelete);
|
||||
} catch (final IOException e) {
|
||||
}
|
||||
|
@ -187,9 +189,9 @@ public class IndexDeletion_p {
|
|||
}
|
||||
|
||||
if (post != null && (post.containsKey("simulate-timedelete") || post.containsKey("engage-timedelete"))) {
|
||||
/* Check the transaction is valid */
|
||||
TransactionManager.checkPostTransaction(header, post);
|
||||
|
||||
/* Check the transaction is valid */
|
||||
TransactionManager.checkPostTransaction(header, post);
|
||||
|
||||
boolean simulate = post.containsKey("simulate-timedelete");
|
||||
Date deleteageDate = null;
|
||||
long t = timeParser(timedelete_number, timedelete_unit); // year, month, day, hour
|
||||
|
@ -206,6 +208,7 @@ public class IndexDeletion_p {
|
|||
try {
|
||||
defaultConnector.deleteByQuery(collection1Query);
|
||||
defaultConnector.commit(false);
|
||||
try {sb.index.loadTimeIndex().clear();} catch (IOException e) {}
|
||||
if (webgraphConnector != null) webgraphConnector.deleteByQuery(webgraphQuery);
|
||||
sb.tables.recordAPICall(post, "IndexDeletion_p.html", WorkTables.TABLE_API_TYPE_DELETION, "deletion, docs older than " + timedelete_number + " " + timedelete_unit);
|
||||
} catch (final IOException e) {
|
||||
|
@ -214,11 +217,11 @@ public class IndexDeletion_p {
|
|||
}
|
||||
prop.put("timedelete-active_count", count);
|
||||
}
|
||||
|
||||
|
||||
if (post != null && (post.containsKey("simulate-collectiondelete") || post.containsKey("engage-collectiondelete"))) {
|
||||
/* Check the transaction is valid */
|
||||
TransactionManager.checkPostTransaction(header, post);
|
||||
|
||||
/* Check the transaction is valid */
|
||||
TransactionManager.checkPostTransaction(header, post);
|
||||
|
||||
boolean simulate = post.containsKey("simulate-collectiondelete");
|
||||
collectiondelete = collectiondelete.replaceAll(" ","").replaceAll(",", "|");
|
||||
String query = collectiondelete_mode_unassigned_checked ? "-" + CollectionSchema.collection_sxt + AbstractSolrConnector.CATCHALL_DTERM : collectiondelete.length() == 0 ? CollectionSchema.collection_sxt + ":\"\"" : QueryModifier.parseCollectionExpression(collectiondelete);
|
||||
|
@ -232,6 +235,7 @@ public class IndexDeletion_p {
|
|||
try {
|
||||
defaultConnector.deleteByQuery(query);
|
||||
defaultConnector.commit(false);
|
||||
try {sb.index.loadTimeIndex().clear();} catch (IOException e) {}
|
||||
sb.tables.recordAPICall(post, "IndexDeletion_p.html", WorkTables.TABLE_API_TYPE_DELETION, "deletion, collection " + collectiondelete);
|
||||
} catch (final IOException e) {
|
||||
}
|
||||
|
@ -239,11 +243,11 @@ public class IndexDeletion_p {
|
|||
}
|
||||
prop.put("collectiondelete-active_count", count);
|
||||
}
|
||||
|
||||
|
||||
if (post != null && (post.containsKey("simulate-querydelete") || post.containsKey("engage-querydelete"))) {
|
||||
/* Check the transaction is valid */
|
||||
TransactionManager.checkPostTransaction(header, post);
|
||||
|
||||
/* Check the transaction is valid */
|
||||
TransactionManager.checkPostTransaction(header, post);
|
||||
|
||||
boolean simulate = post.containsKey("simulate-querydelete");
|
||||
|
||||
SolrConnector connector = schemaName.equals(CollectionSchema.CORE_NAME) ? defaultConnector : sb.index.fulltext().getWebgraphConnector();
|
||||
|
@ -258,6 +262,7 @@ public class IndexDeletion_p {
|
|||
ConcurrentLog.info("IndexDeletion", "delete by query \"" + querydelete + "\", size before deletion = " + connector.getSize());
|
||||
connector.deleteByQuery(querydelete);
|
||||
connector.commit(false);
|
||||
try {sb.index.loadTimeIndex().clear();} catch (IOException e) {}
|
||||
ConcurrentLog.info("IndexDeletion", "delete by query \"" + querydelete + "\", size after commit = " + connector.getSize());
|
||||
sb.tables.recordAPICall(post, "IndexDeletion_p.html", WorkTables.TABLE_API_TYPE_DELETION, "deletion, solr query, q = " + querydelete);
|
||||
} catch (final IOException e) {
|
||||
|
@ -267,14 +272,14 @@ public class IndexDeletion_p {
|
|||
prop.put("querydelete-active_count", count);
|
||||
}
|
||||
prop.put("doccount", defaultConnector.getSize());
|
||||
|
||||
|
||||
|
||||
prop.put("cores_" + 0 + "_name", CollectionSchema.CORE_NAME);
|
||||
prop.put("cores_" + 0 + "_selected", CollectionSchema.CORE_NAME.equals(schemaName) ? 1 : 0);
|
||||
prop.put("cores_" + 1 + "_name", WebgraphSchema.CORE_NAME);
|
||||
prop.put("cores_" + 1 + "_selected", WebgraphSchema.CORE_NAME.equals(schemaName) ? 1 : 0);
|
||||
prop.put("cores", 2);
|
||||
|
||||
|
||||
// return rewrite properties
|
||||
return prop;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue
Block a user