mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
disabled writing new entries to crawl stacks to prevent a domain
with many documents from blocking the refresh of the crawl queue
This commit is contained in:
parent
6905182d41
commit
c326aa8f67
|
@ -273,8 +273,9 @@ public class Balancer {
|
|||
assert s < this.urlFileIndex.size() : "hash = " + ASCII.String(hash) + ", s = " + s + ", size = " + this.urlFileIndex.size();
|
||||
assert this.urlFileIndex.has(hash) : "hash = " + ASCII.String(hash);
|
||||
|
||||
// add the hash to a queue
|
||||
pushHashToDomainStacks(entry.url().getHost(), entry.url().hash());
|
||||
// add the hash to a queue if the host is unknown to get this fast into the balancer
|
||||
// now disabled to prevent a crawl from 'freezing' on a specific domain which hosts a lot of pages; the queues are filled anyway
|
||||
//if (!this.domainStacks.containsKey(entry.url().getHost())) pushHashToDomainStacks(entry.url().getHost(), entry.url().hash());
|
||||
}
|
||||
robots.ensureExist(entry.url(), Balancer.this.myAgentIDs, true); // concurrently load all robots.txt
|
||||
return null;
|
||||
|
|
|
@ -376,7 +376,6 @@ public class Segment {
|
|||
int outlinksSame = document.inboundLinks().size();
|
||||
int outlinksOther = document.outboundLinks().size();
|
||||
final RWIProcess rankingProcess = (searchEvent == null) ? null : searchEvent.getRankingResult();
|
||||
int wordCount = 0;
|
||||
final int urlLength = urlNormalform.length();
|
||||
final int urlComps = MultiProtocolURI.urlComps(url.toString()).length;
|
||||
|
||||
|
@ -409,7 +408,6 @@ public class Segment {
|
|||
} catch (final Exception e) {
|
||||
Log.logException(e);
|
||||
}
|
||||
wordCount++;
|
||||
|
||||
// during a search event it is possible that a heuristic is used which acquires index
|
||||
// data during search-time. To transfer indexed data directly to the search process
|
||||
|
|
Loading…
Reference in New Issue
Block a user