From 354ef8000d9a75802afb0440ec40a6330a3dbea2 Mon Sep 17 00:00:00 2001
From: orbiter
Date: Sun, 4 Nov 2012 02:58:26 +0100
Subject: [PATCH] - added a 'deleteold' option to the crawler which deletes
 documents that are selected by a crawl filter (host or subpath)
 - the site crawl now uses this option by default
 - made deleteDomain() optionally concurrent

---
 htroot/CrawlResults.java                    |  9 +--
 htroot/CrawlStartExpert_p.html              |  7 +-
 htroot/CrawlStartSite_p.html                |  1 +
 htroot/Crawler_p.java                       | 13 ++-
 htroot/IndexControlURLs_p.java              |  7 +-
 .../net/yacy/crawler/data/CrawlProfile.java | 24 +++---
 source/net/yacy/search/index/Fulltext.java  | 79 +++++++++++--------
 7 files changed, 76 insertions(+), 64 deletions(-)

diff --git a/htroot/CrawlResults.java b/htroot/CrawlResults.java
index b6b982072..19081f27c 100644
--- a/htroot/CrawlResults.java
+++ b/htroot/CrawlResults.java
@@ -127,13 +127,8 @@ public class CrawlResults {
             final String domain = post.get("domain", null);
             final String hashpart = domain == null ? null : DigestURI.hosthash6(domain);
             if (hashpart != null) {
-                // delete all urls for this domain from database
-                try {
-                    sb.index.fulltext().deleteDomain(hashpart);
-                    ResultURLs.deleteDomain(tabletype, domain, hashpart);
-                } catch (final IOException e) {
-                    Log.logException(e);
-                }
+                sb.index.fulltext().deleteDomain(hashpart, false);
+                ResultURLs.deleteDomain(tabletype, domain, hashpart);
             }
         }
diff --git a/htroot/CrawlStartExpert_p.html b/htroot/CrawlStartExpert_p.html
index 59a7dff61..5a703e168 100644
--- a/htroot/CrawlStartExpert_p.html
+++ b/htroot/CrawlStartExpert_p.html
@@ -153,10 +153,11 @@
             :
-            Use filter
+            Use filter
-            Restrict to start domain
-            Restrict to sub-path
+            Restrict to start domain
+            Restrict to sub-path
+            Delete all old documents in domain/subpath
             The filter is a regular expression
diff --git a/htroot/CrawlStartSite_p.html b/htroot/CrawlStartSite_p.html
index 15978b66b..76dc82f57 100644
--- a/htroot/CrawlStartSite_p.html
+++ b/htroot/CrawlStartSite_p.html
@@ -81,6 +81,7 @@
             load all files in domain
             load only files in a sub-path of given url
+
diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java
index c54b08494..454eafd72 100644
--- a/htroot/Crawler_p.java
+++ b/htroot/Crawler_p.java
@@ -150,7 +150,8 @@ public class Crawler_p {
         if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = CrawlProfile.MATCH_ALL_STRING; // avoid that all urls are filtered out if bad value was submitted
         final boolean fullDomain = "domain".equals(post.get("range", "wide")); // special property in simple crawl start
         final boolean subPath = "subpath".equals(post.get("range", "wide")); // special property in simple crawl start
-
+        final boolean deleteold = (fullDomain || subPath) && post.getBoolean("deleteold");
+
         String crawlingStart0 = post.get("crawlingURL","").trim(); // the crawljob start url
         String[] rootURLs0 = crawlingStart0.indexOf('\n') > 0 || crawlingStart0.indexOf('\r') > 0 ? crawlingStart0.split("[\\r\\n]+") : crawlingStart0.split(Pattern.quote("|"));
         Set<DigestURI> rootURLs = new HashSet<DigestURI>();
@@ -301,8 +302,18 @@ public class Crawler_p {
            String siteFilter = ".*";
            if (fullDomain) {
                siteFilter = CrawlProfile.siteFilter(rootURLs);
+               if (deleteold) {
+                   for (DigestURI u: rootURLs) sb.index.fulltext().deleteDomain(u.hosthash(), true);
+               }
            } else if (subPath) {
                siteFilter = CrawlProfile.subpathFilter(rootURLs);
+               if (deleteold) {
+                   for (DigestURI u: rootURLs) {
+                       String subpath = CrawlProfile.mustMatchSubpath(u);
+                       if (subpath.endsWith(".*")) subpath = subpath.substring(0, subpath.length() - 2);
+                       sb.index.fulltext().remove(subpath, true);
+                   }
+               }
            }
            if (CrawlProfile.MATCH_ALL_STRING.equals(newcrawlingMustMatch)) {
                newcrawlingMustMatch = siteFilter;
diff --git a/htroot/IndexControlURLs_p.java b/htroot/IndexControlURLs_p.java
index 1384d81c8..19b537348 100644
--- a/htroot/IndexControlURLs_p.java
+++ b/htroot/IndexControlURLs_p.java
@@ -297,12 +297,7 @@ public class IndexControlURLs_p {

         if (post.containsKey("deletedomain")) {
             final String hp = post.get("hashpart");
-            try {
-                segment.fulltext().deleteDomain(hp);
-            } catch (final IOException e) {
-                // TODO Auto-generated catch block
-                Log.logException(e);
-            }
+            segment.fulltext().deleteDomain(hp, false);
             // trigger the loading of the table
             post.put("statistics", "");
         }
diff --git a/source/net/yacy/crawler/data/CrawlProfile.java b/source/net/yacy/crawler/data/CrawlProfile.java
index 54699513a..17f1b2162 100644
--- a/source/net/yacy/crawler/data/CrawlProfile.java
+++ b/source/net/yacy/crawler/data/CrawlProfile.java
@@ -482,6 +482,12 @@ public class CrawlProfile extends ConcurrentHashMap implements M
         return System.currentTimeMillis() - (60000L * oldTimeMinutes);
     }

+    public static String siteFilter(final Set<? extends MultiProtocolURI> uris) {
+        final StringBuilder filter = new StringBuilder();
+        for (final MultiProtocolURI uri: uris) filter.append('|').append(mustMatchFilterFullDomain(uri));
+        return filter.length() > 0 ? filter.substring(1) : CrawlProfile.MATCH_ALL_STRING;
+    }
+
     public static String mustMatchFilterFullDomain(final MultiProtocolURI uri) {
         String host = uri.getHost();
         if (host.startsWith("www.")) host = host.substring(4);
@@ -490,24 +496,18 @@ public class CrawlProfile extends ConcurrentHashMap implements M
         return new StringBuilder(host.length() + 20).append(protocol).append("://(www.)?").append(Pattern.quote(host)).append(".*").toString();
     }

-    private static String mustMatchSubpath(final MultiProtocolURI uri) {
-        String u = uri.toNormalform(true);
-        if (!u.endsWith("/")) {int p = u.lastIndexOf("/"); if (p > 0) u = u.substring(0, p + 1);}
-        return new StringBuilder(u.length() + 5).append(Pattern.quote(u)).append(".*").toString();
-    }
-
-    public static String siteFilter(final Set<? extends MultiProtocolURI> uris) {
-        final StringBuilder filter = new StringBuilder();
-        for (final MultiProtocolURI uri: uris) filter.append('|').append(mustMatchFilterFullDomain(uri));
-        return filter.length() > 0 ? filter.substring(1) : CrawlProfile.MATCH_ALL_STRING;
-    }
-
     public static String subpathFilter(final Set<? extends MultiProtocolURI> uris) {
         final StringBuilder filter = new StringBuilder();
         for (final MultiProtocolURI uri: uris) filter.append('|').append(mustMatchSubpath(uri));
         return filter.length() > 0 ? filter.substring(1) : CrawlProfile.MATCH_ALL_STRING;
     }

+    public static String mustMatchSubpath(final MultiProtocolURI uri) {
+        String u = uri.toNormalform(true);
+        if (!u.endsWith("/")) {int p = u.lastIndexOf("/"); if (p > 0) u = u.substring(0, p + 1);}
+        return new StringBuilder(u.length() + 5).append(Pattern.quote(u)).append(".*").toString();
+    }
+
     public static final Set<String> ignoreNames = new HashSet<String>();
     static {
         ignoreNames.add(CrawlSwitchboard.CRAWL_PROFILE_PROXY);
diff --git a/source/net/yacy/search/index/Fulltext.java b/source/net/yacy/search/index/Fulltext.java
index 1042e432a..912250f97 100644
--- a/source/net/yacy/search/index/Fulltext.java
+++ b/source/net/yacy/search/index/Fulltext.java
@@ -795,44 +795,53 @@ public final class Fulltext implements Iterable {
      * @return number of deleted domains
      * @throws IOException
      */
-    public int deleteDomain(final String hosthash) throws IOException {
+    public void deleteDomain(final String hosthash, boolean concurrent) {
         // first collect all url hashes that belong to the domain
         assert hosthash.length() == 6;
-        // delete in solr
-        synchronized (this.solr) {
-            this.solr.deleteByQuery(YaCySchema.host_id_s.name() + ":\"" + hosthash + "\"");
-        }
-
-        // delete in old metadata structure
-        final ArrayList<String> l = new ArrayList<String>();
-        synchronized (this) {
-            final CloneableIterator<byte[]> i = this.urlIndexFile.keys(true, null);
-            String hash;
-            while (i != null && i.hasNext()) {
-                hash = ASCII.String(i.next());
-                if (hosthash.equals(hash.substring(6))) l.add(hash);
-            }
-        }
-
-        // then delete the urls using this list
-        int cnt = 0;
-        for (final String h: l) {
-            if (this.urlIndexFile.delete(ASCII.getBytes(h))) cnt++;
-        }
-
-        // finally remove the line with statistics
-        if (this.statsDump != null) {
-            final Iterator<HostStat> hsi = this.statsDump.iterator();
-            HostStat hs;
-            while (hsi.hasNext()) {
-                hs = hsi.next();
-                if (hs.hosthash.equals(hosthash)) {
-                    hsi.remove();
-                    break;
+
+        Thread t = new Thread() {
+            public void run() {
+                // delete in solr
+                synchronized (Fulltext.this.solr) {
+                    try {
+                        Fulltext.this.solr.deleteByQuery(YaCySchema.host_id_s.name() + ":\"" + hosthash + "\"");
+                        Fulltext.this.solr.commit();
+                    } catch (IOException e) {}
+                }
+
+                // delete in old metadata structure
+                if (Fulltext.this.urlIndexFile != null) {
+                    final ArrayList<String> l = new ArrayList<String>();
+                    synchronized (this) {
+                        CloneableIterator<byte[]> i;
+                        try {
+                            i = Fulltext.this.urlIndexFile.keys(true, null);
+                            String hash;
+                            while (i != null && i.hasNext()) {
+                                hash = ASCII.String(i.next());
+                                if (hosthash.equals(hash.substring(6))) l.add(hash);
+                            }
+
+                            // then delete the urls using this list
+                            for (final String h: l) Fulltext.this.urlIndexFile.delete(ASCII.getBytes(h));
+                        } catch (IOException e) {}
+                    }
+                }
+
+                // finally remove the line with statistics
+                if (Fulltext.this.statsDump != null) {
+                    final Iterator<HostStat> hsi = Fulltext.this.statsDump.iterator();
+                    HostStat hs;
+                    while (hsi.hasNext()) {
+                        hs = hsi.next();
+                        if (hs.hosthash.equals(hosthash)) {
+                            hsi.remove();
+                            break;
+                        }
+                    }
                 }
             }
-        }
-
-        return cnt;
+        };
+        if (concurrent) t.start(); else t.run();
     }
 }
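
The deleteDomain() change above boils down to one pattern: wrap the cleanup work in a Thread and either run it inline or start it in the background, depending on a boolean flag. The following minimal, self-contained Java sketch illustrates only that pattern; the class name, method body, and printed placeholder are illustrative and are not YaCy API or part of this patch.

public class OptionalConcurrencySketch {

    // Run the cleanup work either in the calling thread (blocking) or in a
    // freshly started background thread, depending on the 'concurrent' flag --
    // the same dispatch used in Fulltext.deleteDomain() above.
    static void deleteDomain(final String hosthash, final boolean concurrent) {
        final Thread t = new Thread() {
            @Override
            public void run() {
                // placeholder for the real work: Solr delete-by-query,
                // URL index cleanup, host statistics removal
                System.out.println("deleting all documents for host hash " + hosthash);
            }
        };
        if (concurrent) t.start(); else t.run();
    }

    public static void main(String[] args) {
        deleteDomain("abc123", false); // blocking, as the servlets call it
        deleteDomain("abc123", true);  // background, as the crawl start calls it
    }
}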