mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
when a new crawl is started, delete all entries about error-urls for
crawl-start domains
This commit is contained in:
parent
c6a6f4c4e6
commit
791e1dcfdf
|
@ -35,6 +35,7 @@ import java.util.regex.PatternSyntaxException;
|
|||
|
||||
import net.yacy.cora.document.ASCII;
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.cora.federate.solr.YaCySchema;
|
||||
import net.yacy.cora.federate.yacy.CacheStrategy;
|
||||
import net.yacy.cora.protocol.RequestHeader;
|
||||
import net.yacy.cora.util.SpaceExceededException;
|
||||
|
@ -371,6 +372,16 @@ public class Crawler_p {
|
|||
sb.crawler.removePassive(handle);
|
||||
try {sb.crawlQueues.noticeURL.removeByProfileHandle(profile.handle(), 10000);} catch (SpaceExceededException e1) {}
|
||||
|
||||
// delete all error urls for that domain
|
||||
for (DigestURI u: rootURLs) {
|
||||
String hosthash = u.hosthash();
|
||||
try {
|
||||
sb.crawlQueues.errorURL.removeHost(ASCII.getBytes(hosthash));
|
||||
sb.index.fulltext().getSolr().deleteByQuery(YaCySchema.host_id_s.name() + ":\"" + hosthash + "\" AND " + YaCySchema.failreason_t.name() + ":[* TO *]");
|
||||
sb.index.fulltext().commit();
|
||||
} catch (IOException e) {Log.logException(e);}
|
||||
}
|
||||
|
||||
// start the crawl
|
||||
if ("url".equals(crawlingMode)) {
|
||||
if (rootURLs.size() == 0) {
|
||||
|
|
|
@ -31,6 +31,7 @@ import java.io.IOException;
|
|||
import java.util.ArrayList;
|
||||
import java.util.Date;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Queue;
|
||||
import java.util.concurrent.LinkedBlockingQueue;
|
||||
|
||||
|
@ -38,16 +39,15 @@ import org.apache.solr.common.SolrInputDocument;
|
|||
|
||||
import net.yacy.cora.document.ASCII;
|
||||
import net.yacy.cora.document.UTF8;
|
||||
import net.yacy.cora.federate.solr.connector.ShardSolrConnector;
|
||||
import net.yacy.cora.federate.solr.connector.SolrConnector;
|
||||
import net.yacy.cora.order.Base64Order;
|
||||
import net.yacy.cora.order.NaturalOrder;
|
||||
import net.yacy.cora.util.SpaceExceededException;
|
||||
import net.yacy.crawler.retrieval.Request;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.kelondro.data.word.Word;
|
||||
import net.yacy.kelondro.index.Index;
|
||||
import net.yacy.kelondro.index.Row;
|
||||
import net.yacy.kelondro.index.RowSet;
|
||||
import net.yacy.kelondro.logging.Log;
|
||||
import net.yacy.kelondro.table.SplitTable;
|
||||
import net.yacy.kelondro.table.Table;
|
||||
|
@ -56,7 +56,7 @@ import net.yacy.search.index.SolrConfiguration;
|
|||
|
||||
public class ZURL implements Iterable<ZURL.Entry> {
|
||||
|
||||
public static Log log = new Log("REJECTED");
|
||||
private static Log log = new Log("REJECTED");
|
||||
|
||||
private static final int EcoFSBufferSize = 2000;
|
||||
private static final int maxStackSize = 1000;
|
||||
|
@ -93,7 +93,7 @@ public class ZURL implements Iterable<ZURL.Entry> {
|
|||
private final SolrConnector solrConnector;
|
||||
private final SolrConfiguration solrConfiguration;
|
||||
|
||||
public ZURL(
|
||||
protected ZURL(
|
||||
final SolrConnector solrConnector,
|
||||
final SolrConfiguration solrConfiguration,
|
||||
final File cachePath,
|
||||
|
@ -124,21 +124,12 @@ public class ZURL implements Iterable<ZURL.Entry> {
|
|||
this.stack = new LinkedBlockingQueue<byte[]>();
|
||||
}
|
||||
|
||||
public ZURL(final ShardSolrConnector solrConnector,
|
||||
final SolrConfiguration solrConfiguration) {
|
||||
this.solrConnector = solrConnector;
|
||||
this.solrConfiguration = solrConfiguration;
|
||||
// creates a new ZUR in RAM
|
||||
this.urlIndex = new RowSet(rowdef);
|
||||
this.stack = new LinkedBlockingQueue<byte[]>();
|
||||
}
|
||||
|
||||
public void clear() throws IOException {
|
||||
protected void clear() throws IOException {
|
||||
if (this.urlIndex != null) this.urlIndex.clear();
|
||||
if (this.stack != null) this.stack.clear();
|
||||
}
|
||||
|
||||
public void close() {
|
||||
protected void close() {
|
||||
try {clear();} catch (final IOException e) {}
|
||||
if (this.urlIndex != null) this.urlIndex.close();
|
||||
}
|
||||
|
@ -153,6 +144,22 @@ public class ZURL implements Iterable<ZURL.Entry> {
|
|||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
public void removeHost(final byte[] hosthash) throws IOException {
|
||||
if (hosthash == null) return;
|
||||
Iterator<byte[]> i = this.urlIndex.keys(true, null);
|
||||
List<byte[]> r = new ArrayList<byte[]>();
|
||||
while (i.hasNext()) {
|
||||
byte[] b = i.next();
|
||||
if (NaturalOrder.naturalOrder.equal(hosthash, 0, b, 6, 6)) r.add(b);
|
||||
}
|
||||
for (byte[] b: r) this.urlIndex.remove(b);
|
||||
i = this.stack.iterator();
|
||||
while (i.hasNext()) {
|
||||
byte[] b = i.next();
|
||||
if (NaturalOrder.naturalOrder.equal(hosthash, 0, b, 6, 6)) i.remove();
|
||||
}
|
||||
}
|
||||
|
||||
public void push(
|
||||
final Request bentry,
|
||||
|
@ -259,7 +266,7 @@ public class ZURL implements Iterable<ZURL.Entry> {
|
|||
}
|
||||
}
|
||||
|
||||
public boolean exists(final byte[] urlHash) {
|
||||
boolean exists(final byte[] urlHash) {
|
||||
return this.urlIndex.has(urlHash);
|
||||
}
|
||||
|
||||
|
@ -273,14 +280,14 @@ public class ZURL implements Iterable<ZURL.Entry> {
|
|||
|
||||
public class Entry {
|
||||
|
||||
Request bentry; // the balancer entry
|
||||
private Request bentry; // the balancer entry
|
||||
private final byte[] executor; // the crawling executor
|
||||
private final Date workdate; // the time when the url was last time tried to load
|
||||
private final int workcount; // number of tryings
|
||||
private final String anycause; // string describing reason for load fail
|
||||
private boolean stored;
|
||||
|
||||
protected Entry(
|
||||
private Entry(
|
||||
final Request bentry,
|
||||
final byte[] executor,
|
||||
final Date workdate,
|
||||
|
@ -297,7 +304,7 @@ public class ZURL implements Iterable<ZURL.Entry> {
|
|||
this.stored = false;
|
||||
}
|
||||
|
||||
protected Entry(final Row.Entry entry) throws IOException {
|
||||
private Entry(final Row.Entry entry) throws IOException {
|
||||
assert (entry != null);
|
||||
this.executor = entry.getColBytes(1, true);
|
||||
this.workdate = new Date(entry.getColLong(2));
|
||||
|
@ -317,7 +324,7 @@ public class ZURL implements Iterable<ZURL.Entry> {
|
|||
return this.bentry.initiator();
|
||||
}
|
||||
|
||||
public byte[] hash() {
|
||||
private byte[] hash() {
|
||||
// return a url-hash, based on the md5 algorithm
|
||||
// the result is a String of 12 bytes within a 72-bit space
|
||||
// (each byte has an 6-bit range)
|
||||
|
|
|
@ -183,7 +183,6 @@ import net.yacy.repository.LoaderDispatcher;
|
|||
import net.yacy.search.index.Segment;
|
||||
import net.yacy.search.index.SolrConfiguration;
|
||||
import net.yacy.search.query.AccessTracker;
|
||||
import net.yacy.search.query.QueryParams;
|
||||
import net.yacy.search.query.SearchEvent;
|
||||
import net.yacy.search.query.SearchEventCache;
|
||||
import net.yacy.search.ranking.BlockRank;
|
||||
|
|
Loading…
Reference in New Issue
Block a user