disabled writing new entries to crawl stacks to prevent a domain with many documents from blocking the refresh of the crawl queue
Michael Peter Christen 2012-10-29 22:26:52 +01:00
parent 6905182d41
commit c326aa8f67
2 changed files with 3 additions and 4 deletions


@@ -273,8 +273,9 @@ public class Balancer {
 assert s < this.urlFileIndex.size() : "hash = " + ASCII.String(hash) + ", s = " + s + ", size = " + this.urlFileIndex.size();
 assert this.urlFileIndex.has(hash) : "hash = " + ASCII.String(hash);
-// add the hash to a queue
-pushHashToDomainStacks(entry.url().getHost(), entry.url().hash());
+// add the hash to a queue if the host is unknown to get this fast into the balancer
+// now disabled to prevent that a crawl 'freezes' to a specific domain which hosts a lot of pages; the queues are filled anyway
+//if (!this.domainStacks.containsKey(entry.url().getHost())) pushHashToDomainStacks(entry.url().getHost(), entry.url().hash());
 }
 robots.ensureExist(entry.url(), Balancer.this.myAgentIDs, true); // concurrently load all robots.txt
 return null;
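The hunk above removes the unconditional push of every newly stacked URL into its host's in-memory stack. A minimal sketch of the pattern involved (hypothetical names, not the actual Balancer code): if every URL hash lands directly on its host stack, a single host with very many pages keeps its stack permanently full and the crawl 'freezes' to that domain, while the periodic queue refresh that refills the stacks from the persistent URL index never gets a chance to rebalance.

// Hypothetical sketch of per-host crawl stacks; names are illustrative,
// not the real YaCy Balancer API.
import java.util.Deque;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentLinkedDeque;

public class DomainStackSketch {

    private final Map<String, Deque<byte[]>> domainStacks = new ConcurrentHashMap<>();

    /** Unconditional push: every stacked URL goes straight onto its host stack. */
    void pushAlways(final String host, final byte[] urlHash) {
        domainStacks.computeIfAbsent(host, h -> new ConcurrentLinkedDeque<>()).addLast(urlHash);
    }

    /**
     * The commented-out variant from the diff: push only when the host has no
     * stack yet, so a new host becomes visible to the balancer quickly, but a
     * host that already has entries cannot grow its stack without bound.
     */
    void pushIfUnknownHost(final String host, final byte[] urlHash) {
        if (!domainStacks.containsKey(host)) {
            pushAlways(host, urlHash);
        }
        // Otherwise rely on the periodic queue refresh, which refills the
        // domain stacks from the on-disk URL index for all hosts anyway.
    }
}

The commit goes further than the guarded variant and drops the push entirely, accepting that new hosts enter the balancer slightly later in exchange for queues that are always rebuilt from the full index.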


@@ -376,7 +376,6 @@ public class Segment {
 int outlinksSame = document.inboundLinks().size();
 int outlinksOther = document.outboundLinks().size();
 final RWIProcess rankingProcess = (searchEvent == null) ? null : searchEvent.getRankingResult();
-int wordCount = 0;
 final int urlLength = urlNormalform.length();
 final int urlComps = MultiProtocolURI.urlComps(url.toString()).length;
@@ -409,7 +408,6 @@
 } catch (final Exception e) {
 Log.logException(e);
 }
-wordCount++;
 // during a search event it is possible that a heuristic is used which aquires index
 // data during search-time. To transfer indexed data directly to the search process