When a new crawl is started, delete all error-URL entries for the

crawl-start domains.
This commit is contained in:
Michael Peter Christen 2012-11-05 22:14:27 +01:00
parent c6a6f4c4e6
commit 791e1dcfdf
3 changed files with 38 additions and 21 deletions

View File

@ -35,6 +35,7 @@ import java.util.regex.PatternSyntaxException;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.federate.solr.YaCySchema;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.SpaceExceededException;
@ -371,6 +372,16 @@ public class Crawler_p {
sb.crawler.removePassive(handle);
try {sb.crawlQueues.noticeURL.removeByProfileHandle(profile.handle(), 10000);} catch (SpaceExceededException e1) {}
// delete all error urls for that domain
for (DigestURI u: rootURLs) {
String hosthash = u.hosthash();
try {
sb.crawlQueues.errorURL.removeHost(ASCII.getBytes(hosthash));
sb.index.fulltext().getSolr().deleteByQuery(YaCySchema.host_id_s.name() + ":\"" + hosthash + "\" AND " + YaCySchema.failreason_t.name() + ":[* TO *]");
sb.index.fulltext().commit();
} catch (IOException e) {Log.logException(e);}
}
// start the crawl
if ("url".equals(crawlingMode)) {
if (rootURLs.size() == 0) {

View File

@ -31,6 +31,7 @@ import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.Queue;
import java.util.concurrent.LinkedBlockingQueue;
@ -38,16 +39,15 @@ import org.apache.solr.common.SolrInputDocument;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.federate.solr.connector.ShardSolrConnector;
import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.order.NaturalOrder;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.retrieval.Request;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.index.Index;
import net.yacy.kelondro.index.Row;
import net.yacy.kelondro.index.RowSet;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.table.SplitTable;
import net.yacy.kelondro.table.Table;
@ -56,7 +56,7 @@ import net.yacy.search.index.SolrConfiguration;
public class ZURL implements Iterable<ZURL.Entry> {
public static Log log = new Log("REJECTED");
private static Log log = new Log("REJECTED");
private static final int EcoFSBufferSize = 2000;
private static final int maxStackSize = 1000;
@ -93,7 +93,7 @@ public class ZURL implements Iterable<ZURL.Entry> {
private final SolrConnector solrConnector;
private final SolrConfiguration solrConfiguration;
public ZURL(
protected ZURL(
final SolrConnector solrConnector,
final SolrConfiguration solrConfiguration,
final File cachePath,
@ -124,21 +124,12 @@ public class ZURL implements Iterable<ZURL.Entry> {
this.stack = new LinkedBlockingQueue<byte[]>();
}
public ZURL(final ShardSolrConnector solrConnector,
final SolrConfiguration solrConfiguration) {
this.solrConnector = solrConnector;
this.solrConfiguration = solrConfiguration;
// creates a new ZURL in RAM
this.urlIndex = new RowSet(rowdef);
this.stack = new LinkedBlockingQueue<byte[]>();
}
public void clear() throws IOException {
protected void clear() throws IOException {
if (this.urlIndex != null) this.urlIndex.clear();
if (this.stack != null) this.stack.clear();
}
public void close() {
protected void close() {
try {clear();} catch (final IOException e) {}
if (this.urlIndex != null) this.urlIndex.close();
}
@ -153,6 +144,22 @@ public class ZURL implements Iterable<ZURL.Entry> {
return false;
}
}
/**
 * Removes all entries of one host from this ZURL: matching keys are deleted
 * from the url index and from the in-memory stack.
 *
 * A key counts as a match when the 6 bytes of the key starting at offset 6
 * equal the first 6 bytes of the given host hash, as decided by
 * NaturalOrder.naturalOrder.equal.
 *
 * @param hosthash the host hash to purge; null or fewer than 6 bytes is a no-op
 * @throws IOException propagated from the underlying url index operations
 */
public void removeHost(final byte[] hosthash) throws IOException {
    // NOTE(review): offset/length mirror the original call equal(hosthash, 0, key, 6, 6);
    // presumably the host part of a url hash lives in bytes 6..11 - confirm against DigestURI.
    final int hostOffset = 6;
    final int hostLength = 6;
    if (hosthash == null || hosthash.length < hostLength) {
        return;
    }

    // same null guard as clear() and close() use for the index
    if (this.urlIndex != null) {
        // collect the matching keys first and delete afterwards, so the index
        // is not modified while its key iterator is still in use
        final List<byte[]> matches = new ArrayList<byte[]>();
        final Iterator<byte[]> keys = this.urlIndex.keys(true, null);
        while (keys.hasNext()) {
            final byte[] key = keys.next();
            if (key != null
                    && key.length >= hostOffset + hostLength
                    && NaturalOrder.naturalOrder.equal(hosthash, 0, key, hostOffset, hostLength)) {
                matches.add(key);
            }
        }
        for (final byte[] key : matches) {
            this.urlIndex.remove(key);
        }
    }

    // same null guard as clear() uses for the stack
    if (this.stack != null) {
        final Iterator<byte[]> stacked = this.stack.iterator();
        while (stacked.hasNext()) {
            final byte[] key = stacked.next();
            if (key != null
                    && key.length >= hostOffset + hostLength
                    && NaturalOrder.naturalOrder.equal(hosthash, 0, key, hostOffset, hostLength)) {
                // removal goes through the iterator, not the queue
                stacked.remove();
            }
        }
    }
}
public void push(
final Request bentry,
@ -259,7 +266,7 @@ public class ZURL implements Iterable<ZURL.Entry> {
}
}
public boolean exists(final byte[] urlHash) {
boolean exists(final byte[] urlHash) {
return this.urlIndex.has(urlHash);
}
@ -273,14 +280,14 @@ public class ZURL implements Iterable<ZURL.Entry> {
public class Entry {
Request bentry; // the balancer entry
private Request bentry; // the balancer entry
private final byte[] executor; // the crawling executor
private final Date workdate; // the time when the url was last time tried to load
private final int workcount; // number of tryings
private final String anycause; // string describing reason for load fail
private boolean stored;
protected Entry(
private Entry(
final Request bentry,
final byte[] executor,
final Date workdate,
@ -297,7 +304,7 @@ public class ZURL implements Iterable<ZURL.Entry> {
this.stored = false;
}
protected Entry(final Row.Entry entry) throws IOException {
private Entry(final Row.Entry entry) throws IOException {
assert (entry != null);
this.executor = entry.getColBytes(1, true);
this.workdate = new Date(entry.getColLong(2));
@ -317,7 +324,7 @@ public class ZURL implements Iterable<ZURL.Entry> {
return this.bentry.initiator();
}
public byte[] hash() {
private byte[] hash() {
// return a url-hash, based on the md5 algorithm
// the result is a String of 12 bytes within a 72-bit space
// (each byte has a 6-bit range)

View File

@ -183,7 +183,6 @@ import net.yacy.repository.LoaderDispatcher;
import net.yacy.search.index.Segment;
import net.yacy.search.index.SolrConfiguration;
import net.yacy.search.query.AccessTracker;
import net.yacy.search.query.QueryParams;
import net.yacy.search.query.SearchEvent;
import net.yacy.search.query.SearchEventCache;
import net.yacy.search.ranking.BlockRank;