From a83772c71be8cfddec636e87f79cce3901db9579 Mon Sep 17 00:00:00 2001 From: orbiter Date: Tue, 1 Jun 2010 09:30:23 +0000 Subject: [PATCH] fixes and enhancements for balancer: - crawl lists for each domain now uses a HandleSet which should use less memory than LinkedLists - but: fill more entries into the domain lists (all available entries) - fixes to selection criteria (best domain selection) git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6909 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- source/de/anomic/crawler/Balancer.java | 114 +++++++++--------- .../net/yacy/cora/protocol/ProxySettings.java | 19 +-- source/net/yacy/kelondro/index/HandleSet.java | 16 ++- 3 files changed, 81 insertions(+), 68 deletions(-) diff --git a/source/de/anomic/crawler/Balancer.java b/source/de/anomic/crawler/Balancer.java index 40e4ec100..1a134886a 100644 --- a/source/de/anomic/crawler/Balancer.java +++ b/source/de/anomic/crawler/Balancer.java @@ -26,7 +26,6 @@ import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.Iterator; -import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.TreeMap; @@ -54,7 +53,7 @@ public class Balancer { private static final String localhost = "localhost"; // class variables - private final ConcurrentHashMap> domainStacks; // a map from host name to lists with url hashs + private final ConcurrentHashMap domainStacks; // a map from host name to lists with url hashs private final ConcurrentLinkedQueue top; private final TreeMap delayed; private BufferedObjectIndex urlFileIndex; @@ -72,7 +71,7 @@ public class Balancer { final boolean useTailCache, final boolean exceed134217727) { this.cacheStacksPath = cachePath; - this.domainStacks = new ConcurrentHashMap>(); + this.domainStacks = new ConcurrentHashMap(); this.top = new ConcurrentLinkedQueue(); this.delayed = new TreeMap(); this.minimumLocalDelta = minimumLocalDelta; @@ -198,16 +197,11 @@ public class Balancer { } // iterate through the domain stacks - final Iterator>> q = domainStacks.entrySet().iterator(); - Map.Entry> se; - LinkedList stack; + final Iterator> q = domainStacks.entrySet().iterator(); + HandleSet stack; while (q.hasNext()) { - se = q.next(); - stack = se.getValue(); - final Iterator i = stack.iterator(); - while (i.hasNext()) { - if (urlHashes.has(i.next())) i.remove(); - } + stack = q.next().getValue(); + for (byte[] handle: urlHashes) stack.remove(handle); if (stack.isEmpty()) q.remove(); } @@ -235,7 +229,7 @@ public class Balancer { private boolean domainStacksNotEmpty() { if (domainStacks == null) return false; synchronized (domainStacks) { - for (LinkedList l: domainStacks.values()) { + for (HandleSet l: domainStacks.values()) { if (!l.isEmpty()) return true; } } @@ -257,37 +251,35 @@ public class Balancer { assert urlFileIndex.has(hash) : "hash = " + new String(hash); // add the hash to a queue - pushHashToDomainStacks(entry.url().getHost(), entry.url().hash(), 50); + pushHashToDomainStacks(entry.url().getHost(), entry.url().hash()); } } - private void pushHashToDomainStacks(String host, final byte[] urlhash, final int maxstacksize) { + private void pushHashToDomainStacks(String host, final byte[] urlhash) throws RowSpaceExceededException { // extend domain stack if (host == null) host = localhost; - LinkedList domainList = domainStacks.get(host); + HandleSet domainList = domainStacks.get(host); if (domainList == null) { // create new list - domainList = new LinkedList(); - domainList.add(urlhash); + domainList = new HandleSet(12, Base64Order.enhancedCoder, 1); + domainList.put(urlhash); domainStacks.put(host, domainList); } else { // extend existent domain list - if (domainList.size() < maxstacksize) domainList.addLast(urlhash); + domainList.put(urlhash); } } private void removeHashFromDomainStacks(String host, final byte[] urlhash) { - // extend domain stack + // reduce domain stack if (host == null) host = localhost; - final LinkedList domainList = domainStacks.get(host); - if (domainList == null) return; - final Iterator i = domainList.iterator(); - while (i.hasNext()) { - if (Base64Order.enhancedCoder.equal(i.next(), urlhash)) { - i.remove(); - return; - } + final HandleSet domainList = domainStacks.get(host); + if (domainList == null) { + domainStacks.remove(host); + return; } + domainList.remove(urlhash); + if (domainList.size() == 0) domainStacks.remove(host); } private byte[] nextFromDelayed() { @@ -320,23 +312,25 @@ public class Balancer { public Request pop(final boolean delay, final CrawlProfile profile) throws IOException { // returns a crawl entry from the stack and ensures minimum delta times - filltop(delay, -600000, false); - filltop(delay, -60000, false); - filltop(delay, -10000, false); - filltop(delay, -6000, false); - filltop(delay, -4000, false); - filltop(delay, -3000, false); - filltop(delay, -2000, false); - filltop(delay, -1000, false); - filltop(delay, -500, false); - filltop(delay, 0, true); - filltop(delay, 500, true); - filltop(delay, 1000, true); - filltop(delay, 2000, true); - filltop(delay, 3000, true); - filltop(delay, 4000, true); - filltop(delay, 6000, true); - filltop(delay, Long.MAX_VALUE, true); + try { + filltop(delay, -600000, false); + filltop(delay, -60000, false); + filltop(delay, -10000, false); + filltop(delay, -6000, false); + filltop(delay, -4000, false); + filltop(delay, -3000, false); + filltop(delay, -2000, false); + filltop(delay, -1000, false); + filltop(delay, -500, false); + filltop(delay, 0, true); + filltop(delay, 500, true); + filltop(delay, 1000, true); + filltop(delay, 2000, true); + filltop(delay, 3000, true); + filltop(delay, 4000, true); + filltop(delay, 6000, true); + filltop(delay, Long.MAX_VALUE, true); + } catch (RowSpaceExceededException e) {} long sleeptime = 0; Request crawlEntry = null; @@ -440,21 +434,21 @@ public class Balancer { return crawlEntry; } - private void filltop(final boolean delay, final long maximumwaiting, final boolean acceptonebest) { + private void filltop(final boolean delay, final long maximumwaiting, final boolean acceptonebest) throws RowSpaceExceededException { if (!this.top.isEmpty()) return; //System.out.println("*** DEBUG started filltop delay=" + ((delay) ? "true":"false") + ", maximumwaiting=" + maximumwaiting + ", acceptonebest=" + ((acceptonebest) ? "true":"false")); // check if we need to get entries from the file index try { - fillDomainStacks(200); + fillDomainStacks(); } catch (IOException e) { Log.logException(e); } // iterate over the domain stacks - final Iterator>> i = this.domainStacks.entrySet().iterator(); - Map.Entry> entry; + final Iterator> i = this.domainStacks.entrySet().iterator(); + Map.Entry entry; long smallestWaiting = Long.MAX_VALUE; byte[] besturlhash = null; String besthost = null; @@ -467,22 +461,21 @@ public class Balancer { continue; } - byte[] n = entry.getValue().getFirst(); + byte[] n = entry.getValue().removeOne(); if (n == null) continue; - besthost = entry.getKey(); if (delay) { - final long w = Latency.waitingRemainingGuessed(besthost, minimumLocalDelta, minimumGlobalDelta); + final long w = Latency.waitingRemainingGuessed(entry.getKey(), minimumLocalDelta, minimumGlobalDelta); if (w > maximumwaiting) { if (w < smallestWaiting) { smallestWaiting = w; besturlhash = n; besthost = entry.getKey(); } + entry.getValue().put(n); // put entry back continue; } } - n = entry.getValue().removeFirst(); this.top.add(n); if (entry.getValue().isEmpty()) i.remove(); } @@ -494,10 +487,9 @@ public class Balancer { } } - private void fillDomainStacks(final int maxdomstacksize) throws IOException { + private void fillDomainStacks() throws IOException { if (!this.domainStacks.isEmpty() && System.currentTimeMillis() - lastDomainStackFill < 120000L) return; this.domainStacks.clear(); - //synchronized (this.delayed) { delayed.clear(); } this.lastDomainStackFill = System.currentTimeMillis(); final HandleSet handles = this.urlFileIndex.keysFromBuffer(objectIndexBufferSize / 2); final CloneableIterator i = handles.keys(true, null); @@ -508,8 +500,11 @@ public class Balancer { handle = i.next(); request = new Request(this.urlFileIndex.get(handle)); host = request.url().getHost(); - pushHashToDomainStacks(host, handle, 1000); - if (this.domainStacks.size() > maxdomstacksize) break; + try { + pushHashToDomainStacks(host, handle); + } catch (RowSpaceExceededException e) { + break; + } } Log.logInfo("BALANCER", "re-fill of domain stacks; fileIndex.size() = " + this.urlFileIndex.size() + ", domainStacks.size = " + domainStacks.size() + ", collection time = " + (System.currentTimeMillis() - this.lastDomainStackFill) + " ms"); this.domStackInitSize = this.domainStacks.size(); @@ -536,9 +531,10 @@ public class Balancer { loop: while (count > 0) { // iterate over the domain stacks int celsize = cel.size(); - ll: for (LinkedList list: this.domainStacks.values()) { + ll: for (HandleSet list: this.domainStacks.values()) { if (list.size() <= depth) continue ll; - byte[] n = list.get(depth); + byte[] n = list.getOne(depth); + if (n == null) continue ll; try { Row.Entry rowEntry = urlFileIndex.get(n); if (rowEntry == null) continue; diff --git a/source/net/yacy/cora/protocol/ProxySettings.java b/source/net/yacy/cora/protocol/ProxySettings.java index 3b6fbc631..060358dbc 100644 --- a/source/net/yacy/cora/protocol/ProxySettings.java +++ b/source/net/yacy/cora/protocol/ProxySettings.java @@ -20,8 +20,8 @@ package net.yacy.cora.protocol; -import java.util.HashSet; -import java.util.Set; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; import org.apache.commons.httpclient.HostConfiguration; import org.apache.commons.httpclient.HttpClient; @@ -32,12 +32,15 @@ import org.apache.commons.httpclient.HttpClient; */ public final class ProxySettings { + // Dummy value to associate with an Object in the backing Map + private static final Object PRESENT = new Object(); + public static boolean use = false, use4YaCy = false, use4ssl = false; public static String host = null, user = "", password = ""; public static int port = 0; public static String[] noProxy = null; - public static final Set allowProxy = new HashSet(); - public static final Set disallowProxy = new HashSet(); + public static final Map allowProxy = new ConcurrentHashMap(); + public static final Map disallowProxy = new ConcurrentHashMap(); /** * produce a HostConfiguration (apache object) with the proxy access information included @@ -59,15 +62,15 @@ public final class ProxySettings { */ public static boolean useForHost(final String host) { if (!use) return false; - if (allowProxy.contains(host)) return true; - if (disallowProxy.contains(host)) return false; + if (allowProxy.containsKey(host)) return true; + if (disallowProxy.containsKey(host)) return false; for (String pattern: noProxy) { if (host.matches(pattern)) { - disallowProxy.add(host); + disallowProxy.put(host, PRESENT); return false; } } - allowProxy.add(host); + allowProxy.put(host, PRESENT); return true; } diff --git a/source/net/yacy/kelondro/index/HandleSet.java b/source/net/yacy/kelondro/index/HandleSet.java index 2046a1693..e29becdbf 100644 --- a/source/net/yacy/kelondro/index/HandleSet.java +++ b/source/net/yacy/kelondro/index/HandleSet.java @@ -164,13 +164,27 @@ public final class HandleSet implements Iterable, Cloneable { return indexentry != null; } - public final synchronized byte[] removeone() { + public final synchronized byte[] removeOne() { Row.Entry indexentry; indexentry = index.removeOne(); if (indexentry == null) return null; return indexentry.getColBytes(0, true); } + /** + * get one entry; objects are taken from the end of the list + * a getOne(0) would return the same object as removeOne() would remove + * @param idx + * @return entry from the end of the list + */ + public final synchronized byte[] getOne(int idx) { + if (idx >= this.size()) return null; + Row.Entry indexentry; + indexentry = index.get(this.size() - 1 - idx, true); + if (indexentry == null) return null; + return indexentry.getColBytes(0, true); + } + public final synchronized boolean isEmpty() { return index.isEmpty(); }