From 354ef8000d9a75802afb0440ec40a6330a3dbea2 Mon Sep 17 00:00:00 2001
From: orbiter
Date: Sun, 4 Nov 2012 02:58:26 +0100
Subject: [PATCH] - added a 'deleteold' option to the crawler which deletes
 documents that are selected by a crawl filter (host or subpath)
 - the site crawl now uses this option by default
 - made deleteDomain() optionally concurrent

---
 htroot/CrawlResults.java                    |  9 +--
 htroot/CrawlStartExpert_p.html              |  7 +-
 htroot/CrawlStartSite_p.html                |  1 +
 htroot/Crawler_p.java                       | 13 ++-
 htroot/IndexControlURLs_p.java              |  7 +-
 .../net/yacy/crawler/data/CrawlProfile.java | 24 +++---
 source/net/yacy/search/index/Fulltext.java  | 79 +++++++++++--------
 7 files changed, 76 insertions(+), 64 deletions(-)

diff --git a/htroot/CrawlResults.java b/htroot/CrawlResults.java
index b6b982072..19081f27c 100644
--- a/htroot/CrawlResults.java
+++ b/htroot/CrawlResults.java
@@ -127,13 +127,8 @@ public class CrawlResults {
             final String domain = post.get("domain", null);
             final String hashpart = domain == null ? null : DigestURI.hosthash6(domain);
             if (hashpart != null) {
-                // delete all urls for this domain from database
-                try {
-                    sb.index.fulltext().deleteDomain(hashpart);
-                    ResultURLs.deleteDomain(tabletype, domain, hashpart);
-                } catch (final IOException e) {
-                    Log.logException(e);
-                }
+                sb.index.fulltext().deleteDomain(hashpart, false);
+                ResultURLs.deleteDomain(tabletype, domain, hashpart);
             }
         }
diff --git a/htroot/CrawlStartExpert_p.html b/htroot/CrawlStartExpert_p.html
index 59a7dff61..5a703e168 100644
--- a/htroot/CrawlStartExpert_p.html
+++ b/htroot/CrawlStartExpert_p.html
@@ -153,10 +153,11 @@
             :
-            Use filter
+            Use filter
-            Restrict to start domain
-            Restrict to sub-path
+            Restrict to start domain
+            Restrict to sub-path
+            Delete all old documents in domain/subpath
             The filter is a regular expression
diff --git a/htroot/CrawlStartSite_p.html b/htroot/CrawlStartSite_p.html
index 15978b66b..76dc82f57 100644
--- a/htroot/CrawlStartSite_p.html
+++ b/htroot/CrawlStartSite_p.html
@@ -81,6 +81,7 @@
             load all files in domain
             load only files in a sub-path of given url
+
diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java
index c54b08494..454eafd72 100644
--- a/htroot/Crawler_p.java
+++ b/htroot/Crawler_p.java
@@ -150,7 +150,8 @@ public class Crawler_p {
         if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = CrawlProfile.MATCH_ALL_STRING; // avoid that all urls are filtered out if bad value was submitted
         final boolean fullDomain = "domain".equals(post.get("range", "wide")); // special property in simple crawl start
         final boolean subPath = "subpath".equals(post.get("range", "wide")); // special property in simple crawl start
-
+        final boolean deleteold = (fullDomain || subPath) && post.getBoolean("deleteold");
+
         String crawlingStart0 = post.get("crawlingURL","").trim(); // the crawljob start url
         String[] rootURLs0 = crawlingStart0.indexOf('\n') > 0 || crawlingStart0.indexOf('\r') > 0 ? crawlingStart0.split("[\\r\\n]+") : crawlingStart0.split(Pattern.quote("|"));
         Set<DigestURI> rootURLs = new HashSet<DigestURI>();
@@ -301,8 +302,18 @@ public class Crawler_p {
            String siteFilter = ".*";
            if (fullDomain) {
                siteFilter = CrawlProfile.siteFilter(rootURLs);
+               if (deleteold) {
+                   for (DigestURI u: rootURLs) sb.index.fulltext().deleteDomain(u.hosthash(), true);
+               }
            } else if (subPath) {
                siteFilter = CrawlProfile.subpathFilter(rootURLs);
+               if (deleteold) {
+                   for (DigestURI u: rootURLs) {
+                       String subpath = CrawlProfile.mustMatchSubpath(u);
+                       if (subpath.endsWith(".*")) subpath = subpath.substring(0, subpath.length() - 2);
+                       sb.index.fulltext().remove(subpath, true);
+                   }
+               }
            }
            if (CrawlProfile.MATCH_ALL_STRING.equals(newcrawlingMustMatch)) {
                newcrawlingMustMatch = siteFilter;
diff --git a/htroot/IndexControlURLs_p.java b/htroot/IndexControlURLs_p.java
index 1384d81c8..19b537348 100644
--- a/htroot/IndexControlURLs_p.java
+++ b/htroot/IndexControlURLs_p.java
@@ -297,12 +297,7 @@ public class IndexControlURLs_p {

         if (post.containsKey("deletedomain")) {
             final String hp = post.get("hashpart");
-            try {
-                segment.fulltext().deleteDomain(hp);
-            } catch (final IOException e) {
-                // TODO Auto-generated catch block
-                Log.logException(e);
-            }
+            segment.fulltext().deleteDomain(hp, false);
             // trigger the loading of the table
             post.put("statistics", "");
         }
diff --git a/source/net/yacy/crawler/data/CrawlProfile.java b/source/net/yacy/crawler/data/CrawlProfile.java
index 54699513a..17f1b2162 100644
--- a/source/net/yacy/crawler/data/CrawlProfile.java
+++ b/source/net/yacy/crawler/data/CrawlProfile.java
@@ -482,6 +482,12 @@ public class CrawlProfile extends ConcurrentHashMap implements M
         return System.currentTimeMillis() - (60000L * oldTimeMinutes);
     }

+    public static String siteFilter(final Set<? extends MultiProtocolURI> uris) {
+        final StringBuilder filter = new StringBuilder();
+        for (final MultiProtocolURI uri: uris) filter.append('|').append(mustMatchFilterFullDomain(uri));
+        return filter.length() > 0 ? filter.substring(1) : CrawlProfile.MATCH_ALL_STRING;
+    }
+
     public static String mustMatchFilterFullDomain(final MultiProtocolURI uri) {
         String host = uri.getHost();
         if (host.startsWith("www.")) host = host.substring(4);
@@ -490,24 +496,18 @@ public class CrawlProfile extends ConcurrentHashMap implements M
         return new StringBuilder(host.length() + 20).append(protocol).append("://(www.)?").append(Pattern.quote(host)).append(".*").toString();
     }

-    private static String mustMatchSubpath(final MultiProtocolURI uri) {
-        String u = uri.toNormalform(true);
-        if (!u.endsWith("/")) {int p = u.lastIndexOf("/"); if (p > 0) u = u.substring(0, p + 1);}
-        return new StringBuilder(u.length() + 5).append(Pattern.quote(u)).append(".*").toString();
-    }
-
-    public static String siteFilter(final Set<? extends MultiProtocolURI> uris) {
-        final StringBuilder filter = new StringBuilder();
-        for (final MultiProtocolURI uri: uris) filter.append('|').append(mustMatchFilterFullDomain(uri));
-        return filter.length() > 0 ? filter.substring(1) : CrawlProfile.MATCH_ALL_STRING;
-    }
-
     public static String subpathFilter(final Set<? extends MultiProtocolURI> uris) {
         final StringBuilder filter = new StringBuilder();
         for (final MultiProtocolURI uri: uris) filter.append('|').append(mustMatchSubpath(uri));
         return filter.length() > 0 ? filter.substring(1) : CrawlProfile.MATCH_ALL_STRING;
     }

+    public static String mustMatchSubpath(final MultiProtocolURI uri) {
+        String u = uri.toNormalform(true);
+        if (!u.endsWith("/")) {int p = u.lastIndexOf("/"); if (p > 0) u = u.substring(0, p + 1);}
+        return new StringBuilder(u.length() + 5).append(Pattern.quote(u)).append(".*").toString();
+    }
+
     public static final Set<String> ignoreNames = new HashSet<String>();
     static {
         ignoreNames.add(CrawlSwitchboard.CRAWL_PROFILE_PROXY);
diff --git a/source/net/yacy/search/index/Fulltext.java b/source/net/yacy/search/index/Fulltext.java
index 1042e432a..912250f97 100644
--- a/source/net/yacy/search/index/Fulltext.java
+++ b/source/net/yacy/search/index/Fulltext.java
@@ -795,44 +795,53 @@ public final class Fulltext implements Iterable {
      * @return number of deleted domains
      * @throws IOException
      */
-    public int deleteDomain(final String hosthash) throws IOException {
+    public void deleteDomain(final String hosthash, boolean concurrent) {
         // first collect all url hashes that belong to the domain
         assert hosthash.length() == 6;
-        // delete in solr
-        synchronized (this.solr) {
-            this.solr.deleteByQuery(YaCySchema.host_id_s.name() + ":\"" + hosthash + "\"");
-        }
-
-        // delete in old metadata structure
-        final ArrayList<String> l = new ArrayList<String>();
-        synchronized (this) {
-            final CloneableIterator<byte[]> i = this.urlIndexFile.keys(true, null);
-            String hash;
-            while (i != null && i.hasNext()) {
-                hash = ASCII.String(i.next());
-                if (hosthash.equals(hash.substring(6))) l.add(hash);
-            }
-        }
-
-        // then delete the urls using this list
-        int cnt = 0;
-        for (final String h: l) {
-            if (this.urlIndexFile.delete(ASCII.getBytes(h))) cnt++;
-        }
-
-        // finally remove the line with statistics
-        if (this.statsDump != null) {
-            final Iterator<HostStat> hsi = this.statsDump.iterator();
-            HostStat hs;
-            while (hsi.hasNext()) {
-                hs = hsi.next();
-                if (hs.hosthash.equals(hosthash)) {
-                    hsi.remove();
-                    break;
+
+        Thread t = new Thread() {
+            public void run() {
+                // delete in solr
+                synchronized (Fulltext.this.solr) {
+                    try {
+                        Fulltext.this.solr.deleteByQuery(YaCySchema.host_id_s.name() + ":\"" + hosthash + "\"");
+                        Fulltext.this.solr.commit();
+                    } catch (IOException e) {}
+                }
+
+                // delete in old metadata structure
+                if (Fulltext.this.urlIndexFile != null) {
+                    final ArrayList<String> l = new ArrayList<String>();
+                    synchronized (this) {
+                        CloneableIterator<byte[]> i;
+                        try {
+                            i = Fulltext.this.urlIndexFile.keys(true, null);
+                            String hash;
+                            while (i != null && i.hasNext()) {
+                                hash = ASCII.String(i.next());
+                                if (hosthash.equals(hash.substring(6))) l.add(hash);
+                            }
+
+                            // then delete the urls using this list
+                            for (final String h: l) Fulltext.this.urlIndexFile.delete(ASCII.getBytes(h));
+                        } catch (IOException e) {}
+                    }
+                }
+
+                // finally remove the line with statistics
+                if (Fulltext.this.statsDump != null) {
+                    final Iterator<HostStat> hsi = Fulltext.this.statsDump.iterator();
+                    HostStat hs;
+                    while (hsi.hasNext()) {
+                        hs = hsi.next();
+                        if (hs.hosthash.equals(hosthash)) {
+                            hsi.remove();
+                            break;
+                        }
+                    }
                 }
             }
-        }
-
-        return cnt;
+        };
+        if (concurrent) t.start(); else t.run();
     }
 }
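
The deleteDomain() change above boils down to one pattern: wrap the cleanup work in a Thread and either run it inline or start it in the background, depending on a boolean flag. The following minimal, self-contained Java sketch illustrates only that pattern; the class name, method body, and printed placeholder are illustrative and are not YaCy API or part of this patch.

public class OptionalConcurrencySketch {

    // Run the cleanup work either in the calling thread (blocking) or in a
    // freshly started background thread, depending on the 'concurrent' flag --
    // the same dispatch used in Fulltext.deleteDomain() above.
    static void deleteDomain(final String hosthash, final boolean concurrent) {
        final Thread t = new Thread() {
            @Override
            public void run() {
                // placeholder for the real work: Solr delete-by-query,
                // URL index cleanup, host statistics removal
                System.out.println("deleting all documents for host hash " + hosthash);
            }
        };
        if (concurrent) t.start(); else t.run();
    }

    public static void main(String[] args) {
        deleteDomain("abc123", false); // blocking, as the servlets call it
        deleteDomain("abc123", true);  // background, as the crawl start calls it
    }
}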