optimizations when starting large crawl requests with many start URLs in one request:
- allow larger match-fields in html interface
- delete all host hashes at once from zurl
- when deleting by host, do not count the deleted entries, since that counting was the reason deletion took so long (see the sketch below)
Michael Peter Christen 2013-01-31 13:15:28 +01:00
parent be5d3a1066
commit 0b6566a389
11 changed files with 69 additions and 110 deletions
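
The heart of the third point is the deleteByQuery contract: it used to return the number of deleted documents, and computing that number cost each connector two extra count queries plus a forced commit per call, as the SolrServerConnector hunk below shows. A minimal before/after sketch, with hypothetical helper names (getQueryCount, rawDelete, commit) standing in for the connector internals:

import java.io.IOException;

// Sketch only: getQueryCount/rawDelete/commit are hypothetical stand-ins
// for the connector internals visible in the diffs below.
abstract class DeleteByQuerySketch {
    abstract long getQueryCount(String q) throws IOException;
    abstract void rawDelete(String q) throws IOException;
    abstract void commit(boolean merge) throws IOException;

    // old contract: counting forces two count queries and a commit per call
    int deleteByQueryCounted(String q) throws IOException {
        long before = getQueryCount(q);  // extra round trip
        rawDelete(q);
        commit(true);                    // forced, so the second count is accurate
        long after = getQueryCount(q);   // extra round trip
        return (int) (before - after);
    }

    // new contract: fire and forget; the caller commits once at the end
    void deleteByQuery(String q) throws IOException {
        rawDelete(q);
    }
}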

View File

@@ -109,15 +109,13 @@
<td><label for="mustmatch">Must-Match Filter</label>:</td>
<td>
<table border="0">
<tr><td width="160">on URLs for Crawling:<br/>
<tr><td width="160">on URLs for Crawling:<br/>
<input type="radio" name="range" id="rangeDomain" value="domain" onclick="document.getElementById('mustmatch').disabled=true;document.getElementById('deleteoldon').disabled=false;document.getElementById('deleteoldage').disabled=false;document.getElementById('deleteoldon').checked=true;"/>Restrict to start domain(s)<br />
<input type="radio" name="range" id="rangeSubpath" value="subpath" onclick="document.getElementById('mustmatch').disabled=true;document.getElementById('deleteoldon').disabled=false;document.getElementById('deleteoldage').disabled=false;document.getElementById('deleteoldon').checked=true;" />Restrict to sub-path(s)<br />
<input type="radio" name="range" id="rangeWide" value="wide" checked="checked" onclick="document.getElementById('mustmatch').disabled=false;document.getElementById('deleteoldoff').checked=true;document.getElementById('deleteoldon').disabled=true;document.getElementById('deleteoldage').disabled=true;"/>Use filter</td>
<td valign="bottom"><input name="mustmatch" id="mustmatch" type="text" size="55" maxlength="1000" value="#[mustmatch]#" /></td></tr>
<tr><td>on IPs for Crawling:</td><td><input name="ipMustmatch" id="ipMustmatch" type="text" size="55" maxlength="100" value="#[ipMustmatch]#" /></td></tr>
<tr><td>on URLs for Indexing</td><td><input name="indexmustmatch" id="indexmustmatch" type="text" size="55" maxlength="100" value="#[indexmustmatch]#" /></td></tr>
<td valign="bottom"><input name="mustmatch" id="mustmatch" type="text" size="55" maxlength="100000" value="#[mustmatch]#" onclick="document.getElementById('deleteoldon').disabled=false;document.getElementById('deleteoldage').disabled=false"/></td></tr>
<tr><td>on IPs for Crawling:</td><td><input name="ipMustmatch" id="ipMustmatch" type="text" size="55" maxlength="100000" value="#[ipMustmatch]#" /></td></tr>
<tr><td>on URLs for Indexing</td><td><input name="indexmustmatch" id="indexmustmatch" type="text" size="55" maxlength="100000" value="#[indexmustmatch]#" /></td></tr>
</table>
</td>
<td>
@@ -131,9 +129,9 @@
<td><label for="mustnotmatch">Must-Not-Match Filter</label>:</td>
<td>
<table border="0">
<tr><td width="160">on URLs for Crawling:</td><td><input name="mustnotmatch" id="mustnotmatch" type="text" size="55" maxlength="1000" value="#[mustnotmatch]#" /></td></tr>
<tr><td>on IPs for Crawling:</td><td><input name="ipMustnotmatch" id="ipMustnotmatch" type="text" size="55" maxlength="1000" value="#[ipMustnotmatch]#" /></td></tr>
<tr><td>on URLs for Indexing:</td><td><input name="indexmustnotmatch" id="indexmustnotmatch" type="text" size="55" maxlength="1000" value="#[indexmustnotmatch]#" /></td></tr>
<tr><td width="160">on URLs for Crawling:</td><td><input name="mustnotmatch" id="mustnotmatch" type="text" size="55" maxlength="100000" value="#[mustnotmatch]#" /></td></tr>
<tr><td>on IPs for Crawling:</td><td><input name="ipMustnotmatch" id="ipMustnotmatch" type="text" size="55" maxlength="100000" value="#[ipMustnotmatch]#" /></td></tr>
<tr><td>on URLs for Indexing:</td><td><input name="indexmustnotmatch" id="indexmustnotmatch" type="text" size="55" maxlength="100000" value="#[indexmustnotmatch]#" /></td></tr>
</table>
</td>
<td>

View File

@@ -24,9 +24,11 @@ import java.io.FileInputStream;
import java.io.IOException;
import java.io.Writer;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
@@ -294,8 +296,7 @@ public class Crawler_p {
siteFilter = CrawlProfile.siteFilter(rootURLs);
if (deleteold) {
for (DigestURI u: rootURLs) {
int count = sb.index.fulltext().deleteDomainHashpart(u.hosthash(), deleteageDate, rootURLs.size() > 1);
if (count > 0) Log.logInfo("Crawler_p", "deleted " + count + " documents for host " + u.getHost());
sb.index.fulltext().deleteDomainHashpart(u.hosthash(), deleteageDate, rootURLs.size() > 1);
}
}
} else if (subPath) {
@@ -366,14 +367,17 @@ public class Crawler_p {
try {sb.crawlQueues.noticeURL.removeByProfileHandle(profile.handle(), 10000);} catch (SpaceExceededException e1) {}
// delete all error urls for that domain
List<byte[]> hosthashes = new ArrayList<byte[]>();
for (DigestURI u: rootURLs) {
String hosthash = u.hosthash();
hosthashes.add(ASCII.getBytes(u.hosthash()));
}
sb.crawlQueues.errorURL.removeHost(hosthashes, true);
for (byte[] hosthash: hosthashes) {
try {
sb.crawlQueues.errorURL.removeHost(ASCII.getBytes(hosthash));
sb.index.fulltext().getSolr().deleteByQuery(YaCySchema.host_id_s.getSolrFieldName() + ":\"" + hosthash + "\" AND " + YaCySchema.failreason_t.getSolrFieldName() + ":[* TO *]");
sb.index.fulltext().commit(true);
sb.index.fulltext().getSolr().deleteByQuery(YaCySchema.host_id_s.getSolrFieldName() + ":\"" + ASCII.String(hosthash) + "\" AND " + YaCySchema.failreason_t.getSolrFieldName() + ":[* TO *]");
} catch (IOException e) {Log.logException(e);}
}
sb.index.fulltext().commit(true);
// start the crawl
if ("url".equals(crawlingMode)) {

View File

@@ -255,12 +255,10 @@ public class MirrorSolrConnector extends AbstractSolrConnector implements SolrCo
}
@Override
public int deleteByQuery(final String querystring) throws IOException {
int count = 0;
if (this.solr0 != null) count += this.solr0.deleteByQuery(querystring);
if (this.solr1 != null) count += this.solr1.deleteByQuery(querystring);
public void deleteByQuery(final String querystring) throws IOException {
if (this.solr0 != null) this.solr0.deleteByQuery(querystring);
if (this.solr1 != null) this.solr1.deleteByQuery(querystring);
this.clearCache();
return count;
}
@Override

View File

@@ -146,8 +146,8 @@ public class MultipleSolrConnector extends AbstractSolrConnector implements Solr
}
@Override
public int deleteByQuery(final String querystring) throws IOException {
return this.solr.deleteByQuery(querystring);
public void deleteByQuery(final String querystring) throws IOException {
this.solr.deleteByQuery(querystring);
}
@Override

View File

@@ -122,18 +122,18 @@ public class RetrySolrConnector extends AbstractSolrConnector implements SolrCon
}
@Override
public int deleteByQuery(final String querystring) throws IOException {
public void deleteByQuery(final String querystring) throws IOException {
final long t = System.currentTimeMillis() + this.retryMaxTime;
Throwable ee = null;
while (System.currentTimeMillis() < t) try {
return this.solrConnector.deleteByQuery(querystring);
this.solrConnector.deleteByQuery(querystring);
return;
} catch (final Throwable e) {
ee = e;
try {Thread.sleep(10);} catch (final InterruptedException e1) {}
continue;
}
if (ee != null) throw (ee instanceof IOException) ? (IOException) ee : new IOException(ee.getMessage());
return 0;
}
@Override
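
Because deleteByQuery no longer returns a value, the successful path of the retry loop above needs its explicit return statement; without it the loop would keep retrying until the deadline. The retry-until-deadline pattern in isolation, as a generic sketch rather than the YaCy API:

import java.io.IOException;

final class RetrySketch {
    interface Action { void run() throws IOException; }

    // Retry until success or deadline; remember the last failure and
    // rethrow it as an IOException once time runs out.
    static void retry(Action action, long retryMaxTime) throws IOException {
        final long deadline = System.currentTimeMillis() + retryMaxTime;
        Throwable last = null;
        while (System.currentTimeMillis() < deadline) {
            try {
                action.run();
                return;                                  // success: stop retrying
            } catch (final Throwable e) {
                last = e;
                try { Thread.sleep(10); } catch (final InterruptedException ie) {}
            }
        }
        if (last != null) throw (last instanceof IOException) ? (IOException) last : new IOException(last.getMessage());
    }
}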

View File

@@ -120,10 +120,8 @@ public class ShardSolrConnector extends AbstractSolrConnector implements SolrCon
}
@Override
public int deleteByQuery(final String querystring) throws IOException {
int count = 0;
for (final SolrConnector connector: this.connectors) count += connector.deleteByQuery(querystring);
return count;
public void deleteByQuery(final String querystring) throws IOException {
for (final SolrConnector connector: this.connectors) connector.deleteByQuery(querystring);
}
/**

View File

@@ -90,7 +90,7 @@ public interface SolrConnector extends Iterable<String> /* Iterable of document
* @return the number of deletions
* @throws IOException
*/
public int deleteByQuery(final String querystring) throws IOException;
public void deleteByQuery(final String querystring) throws IOException;
/**
* check if a given key exists in solr at the field fieldName

View File

@@ -242,14 +242,10 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen
* @throws IOException
*/
@Override
public int deleteByQuery(final String querystring) throws IOException {
public void deleteByQuery(final String querystring) throws IOException {
try {
synchronized (this.server) {
long c0 = this.getQueryCount(querystring);
this.server.deleteByQuery(querystring, this.commitWithinMs);
this.commit(true);
long c1 = this.getQueryCount(querystring);
return (int) (c1 - c0);
}
} catch (final Throwable e) {
throw new IOException(e);

View File

@@ -605,7 +605,7 @@ public class HTTPClient {
} catch (final IOException e) {
ConnectionInfo.removeConnection(httpUriRequest.hashCode());
httpUriRequest.abort();
throw new IOException("Client can't execute: " + e.getMessage());
throw new IOException("Client can't execute: " + e.getCause().getMessage());
}
}

View File

@@ -147,20 +147,31 @@ public class ZURL implements Iterable<ZURL.Entry> {
}
}
public void removeHost(final byte[] hosthash) throws IOException {
if (hosthash == null) return;
Iterator<byte[]> i = this.urlIndex.keys(true, null);
List<byte[]> r = new ArrayList<byte[]>();
while (i.hasNext()) {
byte[] b = i.next();
if (NaturalOrder.naturalOrder.equal(hosthash, 0, b, 6, 6)) r.add(b);
}
for (byte[] b: r) this.urlIndex.remove(b);
i = this.stack.iterator();
while (i.hasNext()) {
byte[] b = i.next();
if (NaturalOrder.naturalOrder.equal(hosthash, 0, b, 6, 6)) i.remove();
}
public void removeHost(final Iterable<byte[]> hosthashes, final boolean concurrent) {
if (hosthashes == null) return;
Thread t = new Thread() {
public void run() {
try {
Iterator<byte[]> i = ZURL.this.urlIndex.keys(true, null);
List<byte[]> r = new ArrayList<byte[]>();
while (i.hasNext()) {
byte[] b = i.next();
for (byte[] hosthash: hosthashes) {
if (NaturalOrder.naturalOrder.equal(hosthash, 0, b, 6, 6)) r.add(b);
}
}
for (byte[] b: r) ZURL.this.urlIndex.remove(b);
i = ZURL.this.stack.iterator();
while (i.hasNext()) {
byte[] b = i.next();
for (byte[] hosthash: hosthashes) {
if (NaturalOrder.naturalOrder.equal(hosthash, 0, b, 6, 6)) i.remove();
}
}
} catch (IOException e) {}
}
};
if (concurrent) t.start(); else t.run();
}
public void push(
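
The rewritten removeHost turns N full scans of urlIndex, one per host, into a single scan matched against all host hashes, optionally on a background thread. A sketch of the one-pass matching idea over a generic key set (hypothetical types; in ZURL the host hash occupies bytes 6..11 of each URL hash):

import java.util.*;

final class BatchRemoveSketch {
    // One pass over the index, each key checked against every target
    // host hash; with N hosts this replaces N separate full scans.
    static int removeHosts(NavigableSet<String> index, Collection<String> hostHashes) {
        List<String> doomed = new ArrayList<>();
        for (String key : index) {
            if (key.length() >= 12 && hostHashes.contains(key.substring(6, 12))) doomed.add(key);
        }
        for (String key : doomed) index.remove(key);
        return doomed.size();
    }

    public static void main(String[] args) {
        NavigableSet<String> index = new TreeSet<>(
            List.of("aaaaaahost01key1", "bbbbbbhost02key1", "cccccchost01key2"));
        System.out.println(removeHosts(index, Set.of("host01")) + " entries removed");
    }
}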

View File

@@ -30,7 +30,6 @@ import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
@@ -48,7 +47,6 @@ import net.yacy.cora.federate.solr.connector.EmbeddedSolrConnector;
import net.yacy.cora.federate.solr.connector.MirrorSolrConnector;
import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.cora.order.CloneableIterator;
import net.yacy.cora.sorting.ConcurrentScoreMap;
import net.yacy.cora.sorting.ReversibleScoreMap;
import net.yacy.cora.sorting.ScoreMap;
import net.yacy.cora.storage.ZIPReader;
@@ -206,6 +204,7 @@ public final class Fulltext {
this.urlIndexFile.clear();
}
this.statsDump = null;
this.getSolr().commit(true);
}
public void clearLocalSolr() throws IOException {
@@ -356,22 +355,19 @@
* here such a fragment can be used to delete all these domains at once
* @param hosthash the hash of the host to be deleted
* @param freshdate either NULL or a date in the past which is the limit for deletion. Only documents older than this date are deleted
* @return number of deleted domains
* @throws IOException
*/
public int deleteDomainHashpart(final String hosthash, Date freshdate, boolean concurrent) {
public void deleteDomainHashpart(final String hosthash, Date freshdate, boolean concurrent) {
// first collect all url hashes that belong to the domain
assert hosthash.length() == 6;
final String q = YaCySchema.host_id_s.getSolrFieldName() + ":\"" + hosthash + "\"" +
((freshdate != null && freshdate.before(new Date())) ? (" AND " + YaCySchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]") : "");
final AtomicInteger count = new AtomicInteger(0);
Thread t = new Thread() {
public void run() {
// delete in solr
synchronized (Fulltext.this.solr) {
try {
count.addAndGet(Fulltext.this.solr.deleteByQuery(q));
if (count.get() > 0) Fulltext.this.solr.commit(true);
Fulltext.this.solr.deleteByQuery(q);
} catch (IOException e) {}
}
@@ -408,22 +404,22 @@
}
}
};
if (concurrent) t.start(); else t.run();
return count.get();
if (concurrent) t.start(); else {
t.run();
Fulltext.this.getSolr().commit(true);
}
}
public int deleteDomainHostname(final String hostname, Date freshdate, boolean concurrent) {
public void deleteDomainHostname(final String hostname, Date freshdate, boolean concurrent) {
// first collect all url hashes that belong to the domain
final String q = YaCySchema.host_s.getSolrFieldName() + ":\"" + hostname + "\"" +
((freshdate != null && freshdate.before(new Date())) ? (" AND " + YaCySchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]") : "");
final AtomicInteger count = new AtomicInteger(0);
Thread t = new Thread() {
public void run() {
// delete in solr
synchronized (Fulltext.this.solr) {
try {
count.addAndGet(Fulltext.this.solr.deleteByQuery(q));
if (count.get() > 0) Fulltext.this.solr.commit(true);
Fulltext.this.solr.deleteByQuery(q);
} catch (IOException e) {}
}
// finally remove the line with statistics
@@ -440,8 +436,10 @@
}
}
};
if (concurrent) t.start(); else t.run();
return count.get();
if (concurrent) t.start(); else {
t.run();
Fulltext.this.getSolr().commit(true);
}
}
/**
@@ -748,42 +746,7 @@
}
}
/**
* calculate a score map for url hash samples: each sample is a single url hash
* that stands for all entries for the corresponding domain. The map counts the number
* of occurrences of the domain
* @param domainSamples a map from domain hashes to hash statistics
* @return a map from url hash samples to counters
*/
public ScoreMap<String> urlSampleScores(final Map<String, URLHashCounter> domainSamples) {
final ScoreMap<String> urlSampleScore = new ConcurrentScoreMap<String>();
for (final Map.Entry<String, URLHashCounter> e: domainSamples.entrySet()) {
urlSampleScore.inc(ASCII.String(e.getValue().urlhashb), e.getValue().count);
}
return urlSampleScore;
}
/**
* calculate all domain names for all domain hashes
* @param domainSamples a map from domain hashes to hash statistics
* @return a map from domain hashes to host stats including domain names
*/
public Map<String, HostStat> domainHashResolver(final Map<String, URLHashCounter> domainSamples) {
final HashMap<String, HostStat> hostMap = new HashMap<String, HostStat>();
final ScoreMap<String> hosthashScore = new ConcurrentScoreMap<String>();
for (final Map.Entry<String, URLHashCounter> e: domainSamples.entrySet()) {
hosthashScore.inc(ASCII.String(e.getValue().urlhashb, 6, 6), e.getValue().count);
}
DigestURI url;
for (final Map.Entry<String, URLHashCounter> e: domainSamples.entrySet()) {
url = this.getURL(e.getValue().urlhashb);
hostMap.put(e.getKey(), new HostStat(url.getHost(), url.getPort(), e.getKey(), hosthashScore.get(e.getKey())));
}
return hostMap;
}
public Iterator<HostStat> statistics(int count, final ScoreMap<String> domainScore) {
// prevent too heavy IO.
if (this.statsDump != null && count <= this.statsDump.size()) return this.statsDump.iterator();
@@ -809,15 +772,6 @@
return (this.statsDump == null) ? new ArrayList<HostStat>().iterator() : this.statsDump.iterator();
}
private static class URLHashCounter {
public byte[] urlhashb;
public int count;
public URLHashCounter(final byte[] urlhashb) {
this.urlhashb = urlhashb;
this.count = 1;
}
}
public static class HostStat {
public String hostname, hosthash;
public int port;