optimizations when starting large crawl requests with many start URLs in one request:
- allow larger match-fields in html interface
- delete all host hashes at once from zurl
- when deleting by host, do not count the deleted entries, since that counting was the reason deletion took so long (see the sketch below)
Michael Peter Christen 2013-01-31 13:15:28 +01:00
parent be5d3a1066
commit 0b6566a389
11 changed files with 69 additions and 110 deletions
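
The heart of the third point is the deleteByQuery contract: it used to return the number of deleted documents, and computing that number cost each connector two extra count queries plus a forced commit per call, as the SolrServerConnector hunk below shows. A minimal before/after sketch, with hypothetical helper names (getQueryCount, rawDelete, commit) standing in for the connector internals:

import java.io.IOException;

// Sketch only: getQueryCount/rawDelete/commit are hypothetical stand-ins
// for the connector internals visible in the diffs below.
abstract class DeleteByQuerySketch {
    abstract long getQueryCount(String q) throws IOException;
    abstract void rawDelete(String q) throws IOException;
    abstract void commit(boolean merge) throws IOException;

    // old contract: counting forces two count queries and a commit per call
    int deleteByQueryCounted(String q) throws IOException {
        long before = getQueryCount(q);  // extra round trip
        rawDelete(q);
        commit(true);                    // forced, so the second count is accurate
        long after = getQueryCount(q);   // extra round trip
        return (int) (before - after);
    }

    // new contract: fire and forget; the caller commits once at the end
    void deleteByQuery(String q) throws IOException {
        rawDelete(q);
    }
}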

View File

@@ -109,15 +109,13 @@
<td><label for="mustmatch">Must-Match Filter</label>:</td>
<td>
<table border="0">
<tr><td width="160">on URLs for Crawling:<br/>
<tr><td width="160">on URLs for Crawling:<br/>
<input type="radio" name="range" id="rangeDomain" value="domain" onclick="document.getElementById('mustmatch').disabled=true;document.getElementById('deleteoldon').disabled=false;document.getElementById('deleteoldage').disabled=false;document.getElementById('deleteoldon').checked=true;"/>Restrict to start domain(s)<br />
<input type="radio" name="range" id="rangeSubpath" value="subpath" onclick="document.getElementById('mustmatch').disabled=true;document.getElementById('deleteoldon').disabled=false;document.getElementById('deleteoldage').disabled=false;document.getElementById('deleteoldon').checked=true;" />Restrict to sub-path(s)<br />
<input type="radio" name="range" id="rangeWide" value="wide" checked="checked" onclick="document.getElementById('mustmatch').disabled=false;document.getElementById('deleteoldoff').checked=true;document.getElementById('deleteoldon').disabled=true;document.getElementById('deleteoldage').disabled=true;"/>Use filter</td>
<td valign="bottom"><input name="mustmatch" id="mustmatch" type="text" size="55" maxlength="1000" value="#[mustmatch]#" /></td></tr>
<tr><td>on IPs for Crawling:</td><td><input name="ipMustmatch" id="ipMustmatch" type="text" size="55" maxlength="100" value="#[ipMustmatch]#" /></td></tr>
<tr><td>on URLs for Indexing</td><td><input name="indexmustmatch" id="indexmustmatch" type="text" size="55" maxlength="100" value="#[indexmustmatch]#" /></td></tr>
<td valign="bottom"><input name="mustmatch" id="mustmatch" type="text" size="55" maxlength="100000" value="#[mustmatch]#" onclick="document.getElementById('deleteoldon').disabled=false;document.getElementById('deleteoldage').disabled=false"/></td></tr>
<tr><td>on IPs for Crawling:</td><td><input name="ipMustmatch" id="ipMustmatch" type="text" size="55" maxlength="100000" value="#[ipMustmatch]#" /></td></tr>
<tr><td>on URLs for Indexing</td><td><input name="indexmustmatch" id="indexmustmatch" type="text" size="55" maxlength="100000" value="#[indexmustmatch]#" /></td></tr>
</table>
</td>
<td>
@@ -131,9 +129,9 @@
<td><label for="mustnotmatch">Must-Not-Match Filter</label>:</td>
<td>
<table border="0">
<tr><td width="160">on URLs for Crawling:</td><td><input name="mustnotmatch" id="mustnotmatch" type="text" size="55" maxlength="1000" value="#[mustnotmatch]#" /></td></tr>
<tr><td>on IPs for Crawling:</td><td><input name="ipMustnotmatch" id="ipMustnotmatch" type="text" size="55" maxlength="1000" value="#[ipMustnotmatch]#" /></td></tr>
<tr><td>on URLs for Indexing:</td><td><input name="indexmustnotmatch" id="indexmustnotmatch" type="text" size="55" maxlength="1000" value="#[indexmustnotmatch]#" /></td></tr>
<tr><td width="160">on URLs for Crawling:</td><td><input name="mustnotmatch" id="mustnotmatch" type="text" size="55" maxlength="100000" value="#[mustnotmatch]#" /></td></tr>
<tr><td>on IPs for Crawling:</td><td><input name="ipMustnotmatch" id="ipMustnotmatch" type="text" size="55" maxlength="100000" value="#[ipMustnotmatch]#" /></td></tr>
<tr><td>on URLs for Indexing:</td><td><input name="indexmustnotmatch" id="indexmustnotmatch" type="text" size="55" maxlength="100000" value="#[indexmustnotmatch]#" /></td></tr>
</table>
</td>
<td>

View File

@@ -24,9 +24,11 @@ import java.io.FileInputStream;
import java.io.IOException;
import java.io.Writer;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
@@ -294,8 +296,7 @@ public class Crawler_p {
siteFilter = CrawlProfile.siteFilter(rootURLs);
if (deleteold) {
for (DigestURI u: rootURLs) {
int count = sb.index.fulltext().deleteDomainHashpart(u.hosthash(), deleteageDate, rootURLs.size() > 1);
if (count > 0) Log.logInfo("Crawler_p", "deleted " + count + " documents for host " + u.getHost());
sb.index.fulltext().deleteDomainHashpart(u.hosthash(), deleteageDate, rootURLs.size() > 1);
}
}
} else if (subPath) {
@@ -366,14 +367,17 @@ public class Crawler_p {
try {sb.crawlQueues.noticeURL.removeByProfileHandle(profile.handle(), 10000);} catch (SpaceExceededException e1) {}
// delete all error urls for that domain
List<byte[]> hosthashes = new ArrayList<byte[]>();
for (DigestURI u: rootURLs) {
String hosthash = u.hosthash();
hosthashes.add(ASCII.getBytes(u.hosthash()));
}
sb.crawlQueues.errorURL.removeHost(hosthashes, true);
for (byte[] hosthash: hosthashes) {
try {
sb.crawlQueues.errorURL.removeHost(ASCII.getBytes(hosthash));
sb.index.fulltext().getSolr().deleteByQuery(YaCySchema.host_id_s.getSolrFieldName() + ":\"" + hosthash + "\" AND " + YaCySchema.failreason_t.getSolrFieldName() + ":[* TO *]");
sb.index.fulltext().commit(true);
sb.index.fulltext().getSolr().deleteByQuery(YaCySchema.host_id_s.getSolrFieldName() + ":\"" + ASCII.String(hosthash) + "\" AND " + YaCySchema.failreason_t.getSolrFieldName() + ":[* TO *]");
} catch (IOException e) {Log.logException(e);}
}
sb.index.fulltext().commit(true);
// start the crawl
if ("url".equals(crawlingMode)) {

View File

@@ -255,12 +255,10 @@ public class MirrorSolrConnector extends AbstractSolrConnector implements SolrCo
}
@Override
public int deleteByQuery(final String querystring) throws IOException {
int count = 0;
if (this.solr0 != null) count += this.solr0.deleteByQuery(querystring);
if (this.solr1 != null) count += this.solr1.deleteByQuery(querystring);
public void deleteByQuery(final String querystring) throws IOException {
if (this.solr0 != null) this.solr0.deleteByQuery(querystring);
if (this.solr1 != null) this.solr1.deleteByQuery(querystring);
this.clearCache();
return count;
}
@Override

View File

@@ -146,8 +146,8 @@ public class MultipleSolrConnector extends AbstractSolrConnector implements Solr
}
@Override
public int deleteByQuery(final String querystring) throws IOException {
return this.solr.deleteByQuery(querystring);
public void deleteByQuery(final String querystring) throws IOException {
this.solr.deleteByQuery(querystring);
}
@Override

View File

@@ -122,18 +122,18 @@ public class RetrySolrConnector extends AbstractSolrConnector implements SolrCon
}
@Override
public int deleteByQuery(final String querystring) throws IOException {
public void deleteByQuery(final String querystring) throws IOException {
final long t = System.currentTimeMillis() + this.retryMaxTime;
Throwable ee = null;
while (System.currentTimeMillis() < t) try {
return this.solrConnector.deleteByQuery(querystring);
this.solrConnector.deleteByQuery(querystring);
return;
} catch (final Throwable e) {
ee = e;
try {Thread.sleep(10);} catch (final InterruptedException e1) {}
continue;
}
if (ee != null) throw (ee instanceof IOException) ? (IOException) ee : new IOException(ee.getMessage());
return 0;
}
@Override
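
Because deleteByQuery no longer returns a value, the successful path of the retry loop above needs its explicit return statement; without it the loop would keep retrying until the deadline. The retry-until-deadline pattern in isolation, as a generic sketch rather than the YaCy API:

import java.io.IOException;

final class RetrySketch {
    interface Action { void run() throws IOException; }

    // Retry until success or deadline; remember the last failure and
    // rethrow it as an IOException once time runs out.
    static void retry(Action action, long retryMaxTime) throws IOException {
        final long deadline = System.currentTimeMillis() + retryMaxTime;
        Throwable last = null;
        while (System.currentTimeMillis() < deadline) {
            try {
                action.run();
                return;                                  // success: stop retrying
            } catch (final Throwable e) {
                last = e;
                try { Thread.sleep(10); } catch (final InterruptedException ie) {}
            }
        }
        if (last != null) throw (last instanceof IOException) ? (IOException) last : new IOException(last.getMessage());
    }
}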

View File

@@ -120,10 +120,8 @@ public class ShardSolrConnector extends AbstractSolrConnector implements SolrCon
}
@Override
public int deleteByQuery(final String querystring) throws IOException {
int count = 0;
for (final SolrConnector connector: this.connectors) count += connector.deleteByQuery(querystring);
return count;
public void deleteByQuery(final String querystring) throws IOException {
for (final SolrConnector connector: this.connectors) connector.deleteByQuery(querystring);
}
/**

View File

@@ -90,7 +90,7 @@ public interface SolrConnector extends Iterable<String> /* Iterable of document
* @return the number of deletions
* @throws IOException
*/
public int deleteByQuery(final String querystring) throws IOException;
public void deleteByQuery(final String querystring) throws IOException;
/**
* check if a given key exists in solr at the field fieldName

View File

@@ -242,14 +242,10 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen
* @throws IOException
*/
@Override
public int deleteByQuery(final String querystring) throws IOException {
public void deleteByQuery(final String querystring) throws IOException {
try {
synchronized (this.server) {
long c0 = this.getQueryCount(querystring);
this.server.deleteByQuery(querystring, this.commitWithinMs);
this.commit(true);
long c1 = this.getQueryCount(querystring);
return (int) (c1 - c0);
}
} catch (final Throwable e) {
throw new IOException(e);

View File

@@ -605,7 +605,7 @@ public class HTTPClient {
} catch (final IOException e) {
ConnectionInfo.removeConnection(httpUriRequest.hashCode());
httpUriRequest.abort();
throw new IOException("Client can't execute: " + e.getMessage());
throw new IOException("Client can't execute: " + e.getCause().getMessage());
}
}

View File

@@ -147,20 +147,31 @@ public class ZURL implements Iterable<ZURL.Entry> {
}
}
public void removeHost(final byte[] hosthash) throws IOException {
if (hosthash == null) return;
Iterator<byte[]> i = this.urlIndex.keys(true, null);
List<byte[]> r = new ArrayList<byte[]>();
while (i.hasNext()) {
byte[] b = i.next();
if (NaturalOrder.naturalOrder.equal(hosthash, 0, b, 6, 6)) r.add(b);
}
for (byte[] b: r) this.urlIndex.remove(b);
i = this.stack.iterator();
while (i.hasNext()) {
byte[] b = i.next();
if (NaturalOrder.naturalOrder.equal(hosthash, 0, b, 6, 6)) i.remove();
}
public void removeHost(final Iterable<byte[]> hosthashes, final boolean concurrent) {
if (hosthashes == null) return;
Thread t = new Thread() {
public void run() {
try {
Iterator<byte[]> i = ZURL.this.urlIndex.keys(true, null);
List<byte[]> r = new ArrayList<byte[]>();
while (i.hasNext()) {
byte[] b = i.next();
for (byte[] hosthash: hosthashes) {
if (NaturalOrder.naturalOrder.equal(hosthash, 0, b, 6, 6)) r.add(b);
}
}
for (byte[] b: r) ZURL.this.urlIndex.remove(b);
i = ZURL.this.stack.iterator();
while (i.hasNext()) {
byte[] b = i.next();
for (byte[] hosthash: hosthashes) {
if (NaturalOrder.naturalOrder.equal(hosthash, 0, b, 6, 6)) i.remove();
}
}
} catch (IOException e) {}
}
};
if (concurrent) t.start(); else t.run();
}
public void push(
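
The rewritten removeHost turns N full scans of urlIndex, one per host, into a single scan matched against all host hashes, optionally on a background thread. A sketch of the one-pass matching idea over a generic key set (hypothetical types; in ZURL the host hash occupies bytes 6..11 of each URL hash):

import java.util.*;

final class BatchRemoveSketch {
    // One pass over the index, each key checked against every target
    // host hash; with N hosts this replaces N separate full scans.
    static int removeHosts(NavigableSet<String> index, Collection<String> hostHashes) {
        List<String> doomed = new ArrayList<>();
        for (String key : index) {
            if (key.length() >= 12 && hostHashes.contains(key.substring(6, 12))) doomed.add(key);
        }
        for (String key : doomed) index.remove(key);
        return doomed.size();
    }

    public static void main(String[] args) {
        NavigableSet<String> index = new TreeSet<>(
            List.of("aaaaaahost01key1", "bbbbbbhost02key1", "cccccchost01key2"));
        System.out.println(removeHosts(index, Set.of("host01")) + " entries removed");
    }
}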

View File

@@ -30,7 +30,6 @@ import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
@@ -48,7 +47,6 @@ import net.yacy.cora.federate.solr.connector.EmbeddedSolrConnector;
import net.yacy.cora.federate.solr.connector.MirrorSolrConnector;
import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.cora.order.CloneableIterator;
import net.yacy.cora.sorting.ConcurrentScoreMap;
import net.yacy.cora.sorting.ReversibleScoreMap;
import net.yacy.cora.sorting.ScoreMap;
import net.yacy.cora.storage.ZIPReader;
@@ -206,6 +204,7 @@ public final class Fulltext {
this.urlIndexFile.clear();
}
this.statsDump = null;
this.getSolr().commit(true);
}
public void clearLocalSolr() throws IOException {
@@ -356,22 +355,19 @@
* here such a fragment can be used to delete all these domains at once
* @param hosthash the hash of the host to be deleted
* @param freshdate either NULL or a date in the past which is the limit for deletion. Only documents older than this date are deleted
* @return number of deleted domains
* @throws IOException
*/
public int deleteDomainHashpart(final String hosthash, Date freshdate, boolean concurrent) {
public void deleteDomainHashpart(final String hosthash, Date freshdate, boolean concurrent) {
// first collect all url hashes that belong to the domain
assert hosthash.length() == 6;
final String q = YaCySchema.host_id_s.getSolrFieldName() + ":\"" + hosthash + "\"" +
((freshdate != null && freshdate.before(new Date())) ? (" AND " + YaCySchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]") : "");
final AtomicInteger count = new AtomicInteger(0);
Thread t = new Thread() {
public void run() {
// delete in solr
synchronized (Fulltext.this.solr) {
try {
count.addAndGet(Fulltext.this.solr.deleteByQuery(q));
if (count.get() > 0) Fulltext.this.solr.commit(true);
Fulltext.this.solr.deleteByQuery(q);
} catch (IOException e) {}
}
@@ -408,22 +404,22 @@
}
}
};
if (concurrent) t.start(); else t.run();
return count.get();
if (concurrent) t.start(); else {
t.run();
Fulltext.this.getSolr().commit(true);
}
}
public int deleteDomainHostname(final String hostname, Date freshdate, boolean concurrent) {
public void deleteDomainHostname(final String hostname, Date freshdate, boolean concurrent) {
// first collect all url hashes that belong to the domain
final String q = YaCySchema.host_s.getSolrFieldName() + ":\"" + hostname + "\"" +
((freshdate != null && freshdate.before(new Date())) ? (" AND " + YaCySchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]") : "");
final AtomicInteger count = new AtomicInteger(0);
Thread t = new Thread() {
public void run() {
// delete in solr
synchronized (Fulltext.this.solr) {
try {
count.addAndGet(Fulltext.this.solr.deleteByQuery(q));
if (count.get() > 0) Fulltext.this.solr.commit(true);
Fulltext.this.solr.deleteByQuery(q);
} catch (IOException e) {}
}
// finally remove the line with statistics
@@ -440,8 +436,10 @@
}
}
};
if (concurrent) t.start(); else t.run();
return count.get();
if (concurrent) t.start(); else {
t.run();
Fulltext.this.getSolr().commit(true);
}
}
/**
@@ -748,42 +746,7 @@
}
}
/**
* calculate a score map for url hash samples: each sample is a single url hash
* that stands for all entries for the corresponding domain. The map counts the number
* of occurrences of the domain
* @param domainSamples a map from domain hashes to hash statistics
* @return a map from url hash samples to counters
*/
public ScoreMap<String> urlSampleScores(final Map<String, URLHashCounter> domainSamples) {
final ScoreMap<String> urlSampleScore = new ConcurrentScoreMap<String>();
for (final Map.Entry<String, URLHashCounter> e: domainSamples.entrySet()) {
urlSampleScore.inc(ASCII.String(e.getValue().urlhashb), e.getValue().count);
}
return urlSampleScore;
}
/**
* calculate all domain names for all domain hashes
* @param domainSamples a map from domain hashes to hash statistics
* @return a map from domain hashes to host stats including domain names
*/
public Map<String, HostStat> domainHashResolver(final Map<String, URLHashCounter> domainSamples) {
final HashMap<String, HostStat> hostMap = new HashMap<String, HostStat>();
final ScoreMap<String> hosthashScore = new ConcurrentScoreMap<String>();
for (final Map.Entry<String, URLHashCounter> e: domainSamples.entrySet()) {
hosthashScore.inc(ASCII.String(e.getValue().urlhashb, 6, 6), e.getValue().count);
}
DigestURI url;
for (final Map.Entry<String, URLHashCounter> e: domainSamples.entrySet()) {
url = this.getURL(e.getValue().urlhashb);
hostMap.put(e.getKey(), new HostStat(url.getHost(), url.getPort(), e.getKey(), hosthashScore.get(e.getKey())));
}
return hostMap;
}
public Iterator<HostStat> statistics(int count, final ScoreMap<String> domainScore) {
// prevent too heavy IO.
if (this.statsDump != null && count <= this.statsDump.size()) return this.statsDump.iterator();
@@ -809,15 +772,6 @@
return (this.statsDump == null) ? new ArrayList<HostStat>().iterator() : this.statsDump.iterator();
}
private static class URLHashCounter {
public byte[] urlhashb;
public int count;
public URLHashCounter(final byte[] urlhashb) {
this.urlhashb = urlhashb;
this.count = 1;
}
}
public static class HostStat {
public String hostname, hosthash;
public int port;