disabled writing new entries to crawl stacks to prevent a domain with many documents from blocking the refresh of the crawl queue
Michael Peter Christen 2012-10-29 22:26:52 +01:00
parent 6905182d41
commit c326aa8f67
2 changed files with 3 additions and 4 deletions


@@ -273,8 +273,9 @@ public class Balancer {
 assert s < this.urlFileIndex.size() : "hash = " + ASCII.String(hash) + ", s = " + s + ", size = " + this.urlFileIndex.size();
 assert this.urlFileIndex.has(hash) : "hash = " + ASCII.String(hash);
-// add the hash to a queue
-pushHashToDomainStacks(entry.url().getHost(), entry.url().hash());
+// add the hash to a queue if the host is unknown to get this fast into the balancer
+// now disabled to prevent that a crawl 'freezes' to a specific domain which hosts a lot of pages; the queues are filled anyway
+//if (!this.domainStacks.containsKey(entry.url().getHost())) pushHashToDomainStacks(entry.url().getHost(), entry.url().hash());
 }
 robots.ensureExist(entry.url(), Balancer.this.myAgentIDs, true); // concurrently load all robots.txt
 return null;
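The hunk above removes the unconditional push of every newly stacked URL into its host's in-memory stack. A minimal sketch of the pattern involved (hypothetical names, not the actual Balancer code): if every URL hash lands directly on its host stack, a single host with very many pages keeps its stack permanently full and the crawl 'freezes' to that domain, while the periodic queue refresh that refills the stacks from the persistent URL index never gets a chance to rebalance.

// Hypothetical sketch of per-host crawl stacks; names are illustrative,
// not the real YaCy Balancer API.
import java.util.Deque;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentLinkedDeque;

public class DomainStackSketch {

    private final Map<String, Deque<byte[]>> domainStacks = new ConcurrentHashMap<>();

    /** Unconditional push: every stacked URL goes straight onto its host stack. */
    void pushAlways(final String host, final byte[] urlHash) {
        domainStacks.computeIfAbsent(host, h -> new ConcurrentLinkedDeque<>()).addLast(urlHash);
    }

    /**
     * The commented-out variant from the diff: push only when the host has no
     * stack yet, so a new host becomes visible to the balancer quickly, but a
     * host that already has entries cannot grow its stack without bound.
     */
    void pushIfUnknownHost(final String host, final byte[] urlHash) {
        if (!domainStacks.containsKey(host)) {
            pushAlways(host, urlHash);
        }
        // Otherwise rely on the periodic queue refresh, which refills the
        // domain stacks from the on-disk URL index for all hosts anyway.
    }
}

The commit goes further than the guarded variant and drops the push entirely, accepting that new hosts enter the balancer slightly later in exchange for queues that are always rebuilt from the full index.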


@@ -376,7 +376,6 @@ public class Segment {
 int outlinksSame = document.inboundLinks().size();
 int outlinksOther = document.outboundLinks().size();
 final RWIProcess rankingProcess = (searchEvent == null) ? null : searchEvent.getRankingResult();
-int wordCount = 0;
 final int urlLength = urlNormalform.length();
 final int urlComps = MultiProtocolURI.urlComps(url.toString()).length;
@@ -409,7 +408,6 @@
 } catch (final Exception e) {
 Log.logException(e);
 }
-wordCount++;
 // during a search event it is possible that a heuristic is used which aquires index
 // data during search-time. To transfer indexed data directly to the search process