mirror of https://github.com/yacy/yacy_search_server.git
synced 2024-09-21 00:00:13 +02:00
commit 3e742d1e34: If the remote crawl option is not activated, skip initialization of the remoteCrawlJob to save the resources of its queue and idling thread. Deployment of the remoteCrawlJob is deferred to activation of the option.
646 lines | 30 KiB | Java
// LegacyBalancer.java
|
|
// -----------------------
|
|
// part of YaCy
|
|
// (C) by Michael Peter Christen; mc@yacy.net
|
|
// first published on http://www.anomic.de
|
|
// Frankfurt, Germany, 2005
|
|
// created: 24.09.2005
|
|
//
|
|
// This program is free software; you can redistribute it and/or modify
|
|
// it under the terms of the GNU General Public License as published by
|
|
// the Free Software Foundation; either version 2 of the License, or
|
|
// (at your option) any later version.
|
|
//
|
|
// This program is distributed in the hope that it will be useful,
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
// GNU General Public License for more details.
|
|
//
|
|
// You should have received a copy of the GNU General Public License
|
|
// along with this program; if not, write to the Free Software
|
|
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
|
|
package net.yacy.crawler;
|
|
|
|
import java.io.File;
|
|
import java.io.IOException;
|
|
import java.util.AbstractMap;
|
|
import java.util.ArrayList;
|
|
import java.util.Iterator;
|
|
import java.util.List;
|
|
import java.util.Map;
|
|
import java.util.Random;
|
|
import java.util.Set;
|
|
import java.util.TreeMap;
|
|
import java.util.concurrent.ConcurrentHashMap;
|
|
import java.util.concurrent.ConcurrentMap;
|
|
|
|
import net.yacy.cora.document.encoding.ASCII;
|
|
import net.yacy.cora.document.encoding.UTF8;
|
|
import net.yacy.cora.order.Base64Order;
|
|
import net.yacy.cora.protocol.ClientIdentification;
|
|
import net.yacy.cora.protocol.Domains;
|
|
import net.yacy.cora.sorting.OrderedScoreMap;
|
|
import net.yacy.cora.storage.HandleSet;
|
|
import net.yacy.cora.util.ConcurrentLog;
|
|
import net.yacy.cora.util.SpaceExceededException;
|
|
import net.yacy.crawler.data.CrawlProfile;
|
|
import net.yacy.crawler.data.Latency;
|
|
import net.yacy.crawler.retrieval.Request;
|
|
import net.yacy.crawler.robots.RobotsTxt;
|
|
import net.yacy.kelondro.data.word.Word;
|
|
import net.yacy.kelondro.index.BufferedObjectIndex;
|
|
import net.yacy.kelondro.index.Row;
|
|
import net.yacy.kelondro.index.RowHandleSet;
|
|
import net.yacy.kelondro.table.Table;
|
|
import net.yacy.kelondro.util.MemoryControl;
|
|
import net.yacy.repository.Blacklist.BlacklistType;
|
|
import net.yacy.search.Switchboard;
|
|
|
|
public class LegacyBalancer implements Balancer {
|
|
|
|
private static final String indexSuffix = "A.db";
|
|
private static final int EcoFSBufferSize = 1000;
|
|
private static final int objectIndexBufferSize = 1000;
|
|
private static final int MAX_DOUBLE_PUSH_CHECK = 100000;
|
|
|
|
// class variables filled with external values
|
|
private final File cacheStacksPath;
|
|
private BufferedObjectIndex urlFileIndex;
|
|
|
|
// class variables computed during operation
|
|
private final ConcurrentMap<String, HostHandles> domainStacks; // a map from host name to lists with url hashs
|
|
private final HandleSet double_push_check; // for debugging
|
|
private long lastDomainStackFill;
|
|
private int domStackInitSize;
|
|
private final List<Map.Entry<String, byte[]>> zeroWaitingCandidates;
|
|
private final Random random; // used to alternate between choose-from-maxstack or choose from any zero-waiting
|
|
|
|
@Override
|
|
public int getOnDemandLimit() {
|
|
throw new UnsupportedOperationException("Not supported yet."); //To change body of generated methods, choose Tools | Templates.
|
|
}
|
|
|
|
@Override
|
|
public boolean getExceed134217727() {
|
|
throw new UnsupportedOperationException("Not supported yet."); //To change body of generated methods, choose Tools | Templates.
|
|
}
|
|
|
|
private static class HostHandles {
|
|
public String hosthash;
|
|
public HandleSet handleSet;
|
|
public HostHandles(final String hosthash, final HandleSet handleSet) {
|
|
this.hosthash = hosthash;
|
|
this.handleSet = handleSet;
|
|
}
|
|
}
|
|
|
|
/**
 * Open (or create) the file-backed crawl balancer queue.
 * @param cachePath directory where the stack file is kept; created if missing
 * @param stackname base name of the stack file (suffix "A.db" is appended)
 * @param useTailCache passed through to the backing Table
 * @param exceed134217727 passed through to the backing Table (large-file mode)
 */
public LegacyBalancer(
        final File cachePath,
        final String stackname,
        final boolean useTailCache,
        final boolean exceed134217727) {
    this.cacheStacksPath = cachePath;
    this.domainStacks = new ConcurrentHashMap<String, HostHandles>();
    this.domStackInitSize = Integer.MAX_VALUE;
    this.double_push_check = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 0);
    this.zeroWaitingCandidates = new ArrayList<Map.Entry<String, byte[]>>();
    this.random = new Random(System.currentTimeMillis());

    // create a stack for newly entered entries; mkdirs() creates the whole
    // path including missing parents (the former extra exists()/mkdir() call
    // was redundant and could not create nested directories anyway)
    this.cacheStacksPath.mkdirs();
    final File f = new File(this.cacheStacksPath, stackname + indexSuffix);
    try {
        this.urlFileIndex = new BufferedObjectIndex(new Table(f, Request.rowdef, EcoFSBufferSize, 0, useTailCache, exceed134217727, true), objectIndexBufferSize);
    } catch (final SpaceExceededException e) {
        // first attempt ran out of space: retry without the row buffer and tail cache
        try {
            this.urlFileIndex = new BufferedObjectIndex(new Table(f, Request.rowdef, 0, 0, false, exceed134217727, true), objectIndexBufferSize);
        } catch (final SpaceExceededException e1) {
            ConcurrentLog.logException(e1);
        }
    }
    this.lastDomainStackFill = 0;
    if (this.urlFileIndex == null) {
        // both attempts failed; log instead of throwing an NPE on size() below
        ConcurrentLog.severe("Balancer", "could not open balancer file " + f.toString());
    } else {
        ConcurrentLog.info("Balancer", "opened balancer file with " + this.urlFileIndex.size() + " entries from " + f.toString());
    }
}
|
|
|
|
@Override
|
|
public synchronized void close() {
|
|
if (this.urlFileIndex != null) {
|
|
this.urlFileIndex.close();
|
|
this.urlFileIndex = null;
|
|
}
|
|
}
|
|
|
|
@Override
|
|
public void clear() {
|
|
ConcurrentLog.info("Balancer", "cleaning balancer with " + this.urlFileIndex.size() + " entries from " + this.urlFileIndex.filename());
|
|
try {
|
|
this.urlFileIndex.clear();
|
|
} catch (final IOException e) {
|
|
ConcurrentLog.logException(e);
|
|
}
|
|
this.domainStacks.clear();
|
|
this.double_push_check.clear();
|
|
}
|
|
|
|
@Override
|
|
public Request get(final byte[] urlhash) throws IOException {
|
|
assert urlhash != null;
|
|
if (this.urlFileIndex == null) return null; // case occurs during shutdown
|
|
final Row.Entry entry = this.urlFileIndex.get(urlhash, false);
|
|
if (entry == null) return null;
|
|
return new Request(entry);
|
|
}
|
|
|
|
@Override
|
|
public int removeAllByProfileHandle(final String profileHandle, final long timeout) throws IOException, SpaceExceededException {
|
|
// removes all entries with a specific profile hash.
|
|
// this may last some time
|
|
// returns number of deletions
|
|
|
|
// first find a list of url hashes that shall be deleted
|
|
final HandleSet urlHashes = new RowHandleSet(this.urlFileIndex.row().primaryKeyLength, Base64Order.enhancedCoder, 100);
|
|
final long terminate = timeout == Long.MAX_VALUE ? Long.MAX_VALUE : (timeout > 0) ? System.currentTimeMillis() + timeout : Long.MAX_VALUE;
|
|
synchronized (this) {
|
|
final Iterator<Row.Entry> i = this.urlFileIndex.rows();
|
|
Row.Entry rowEntry;
|
|
Request crawlEntry;
|
|
while (i.hasNext() && (System.currentTimeMillis() < terminate)) {
|
|
rowEntry = i.next();
|
|
crawlEntry = new Request(rowEntry);
|
|
if (crawlEntry.profileHandle().equals(profileHandle)) {
|
|
urlHashes.put(crawlEntry.url().hash());
|
|
}
|
|
}
|
|
}
|
|
|
|
// then delete all these urls from the queues and the file index
|
|
return remove(urlHashes);
|
|
}
|
|
|
|
/**
 * this method is only here, because so many import/export methods need it
 * and it was implemented in the previous architecture;
 * however, usage is not recommended
 * @param urlHashes a list of hashes that shall be removed
 * @return number of entries that had been removed
 * @throws IOException
 */
@Override
public synchronized int remove(final HandleSet urlHashes) throws IOException {
    final int s = this.urlFileIndex.size();
    int removedCounter = 0;
    for (final byte[] urlhash: urlHashes) {
        final Row.Entry entry = this.urlFileIndex.remove(urlhash);
        if (entry != null) removedCounter++;

        // remove from double-check caches
        this.double_push_check.remove(urlhash);
    }
    // nothing was in the file index: the secondary structures cannot hold the hashes either
    if (removedCounter == 0) return 0;
    assert this.urlFileIndex.size() + removedCounter == s : "urlFileIndex.size() = " + this.urlFileIndex.size() + ", s = " + s;

    // iterate through the domain stacks and purge the removed hashes;
    // empty stacks are dropped via the iterator to avoid ConcurrentModificationException
    final Iterator<Map.Entry<String, HostHandles>> q = this.domainStacks.entrySet().iterator();
    HandleSet stack;
    while (q.hasNext()) {
        stack = q.next().getValue().handleSet;
        for (final byte[] handle: urlHashes) stack.remove(handle);
        if (stack.isEmpty()) q.remove();
    }

    // iterate through the zero-waiting candidate list and drop stale entries
    final Iterator<Map.Entry<String, byte[]>> i = this.zeroWaitingCandidates.iterator();
    while (i.hasNext()) {
        if (urlHashes.has(i.next().getValue())) i.remove();
    }

    return removedCounter;
}
|
|
|
|
@Override
|
|
public boolean has(final byte[] urlhashb) {
|
|
return this.urlFileIndex.has(urlhashb) || this.double_push_check.has(urlhashb);
|
|
}
|
|
|
|
/**
 * @return the number of crawl requests currently held in the file-backed queue
 */
@Override
public int size() {
    return this.urlFileIndex.size();
}
|
|
|
|
/**
 * @return true if no crawl request is queued in the file index
 */
@Override
public boolean isEmpty() {
    return this.urlFileIndex.isEmpty();
}
|
|
|
|
/**
|
|
* push a crawl request on the balancer stack
|
|
* @param entry
|
|
* @return null if this was successful or a String explaining what went wrong in case of an error
|
|
* @throws IOException
|
|
* @throws SpaceExceededException
|
|
*/
|
|
@Override
|
|
public String push(final Request entry, CrawlProfile profile, final RobotsTxt robots) throws IOException, SpaceExceededException {
|
|
assert entry != null;
|
|
final byte[] hash = entry.url().hash();
|
|
synchronized (this) {
|
|
// double-check
|
|
if (this.double_push_check.has(hash)) return "double occurrence in double_push_check";
|
|
if (this.urlFileIndex.has(hash)) return "double occurrence in urlFileIndex";
|
|
|
|
if (this.double_push_check.size() > MAX_DOUBLE_PUSH_CHECK || MemoryControl.shortStatus()) this.double_push_check.clear();
|
|
this.double_push_check.put(hash);
|
|
|
|
// increase dom counter
|
|
if (profile != null && profile.domMaxPages() != Integer.MAX_VALUE && profile.domMaxPages() > 0) {
|
|
profile.domInc(entry.url().getHost());
|
|
}
|
|
|
|
// add to index
|
|
final int s = this.urlFileIndex.size();
|
|
this.urlFileIndex.put(entry.toRow());
|
|
assert s < this.urlFileIndex.size() : "hash = " + ASCII.String(hash) + ", s = " + s + ", size = " + this.urlFileIndex.size();
|
|
assert this.urlFileIndex.has(hash) : "hash = " + ASCII.String(hash);
|
|
|
|
// add the hash to a queue if the host is unknown to get this fast into the balancer
|
|
// now disabled to prevent that a crawl 'freezes' to a specific domain which hosts a lot of pages; the queues are filled anyway
|
|
//if (!this.domainStacks.containsKey(entry.url().getHost())) pushHashToDomainStacks(entry.url().getHost(), entry.url().hash());
|
|
}
|
|
robots.ensureExist(entry.url(), profile.getAgent(), true); // concurrently load all robots.txt
|
|
return null;
|
|
}
|
|
|
|
/**
|
|
* get a list of domains that are currently maintained as domain stacks
|
|
* @return a map of clear text strings of host names to an integer array: {the size of the domain stack, guessed delta waiting time}
|
|
*/
|
|
@Override
|
|
public Map<String, Integer[]> getDomainStackHosts(RobotsTxt robots) {
|
|
Map<String, Integer[]> map = new TreeMap<String, Integer[]>(); // we use a tree map to get a stable ordering
|
|
for (Map.Entry<String, HostHandles> entry: this.domainStacks.entrySet()) {
|
|
final String hostname = entry.getKey();
|
|
final HostHandles hosthandles = entry.getValue();
|
|
int size = hosthandles.handleSet.size();
|
|
int delta = Latency.waitingRemainingGuessed(hostname, hosthandles.hosthash, robots, ClientIdentification.yacyInternetCrawlerAgent);
|
|
map.put(hostname, new Integer[]{size, delta});
|
|
}
|
|
return map;
|
|
}
|
|
|
|
/**
 * get lists of crawl request entries for a specific host
 * @param host clear-text host name whose stacked requests shall be listed
 * @param maxcount maximum number of requests to return
 * @param maxtime maximum time in milliseconds to spend collecting (Long.MAX_VALUE: unbounded)
 * @return a list of crawl loader requests
 */
@Override
public List<Request> getDomainStackReferences(final String host, int maxcount, final long maxtime) {
    final HostHandles hh = this.domainStacks.get(host);
    if (hh == null) return new ArrayList<Request>(0);
    final HandleSet domainList = hh.handleSet;
    if (domainList.isEmpty()) return new ArrayList<Request>(0);
    // the handle set may shrink concurrently, so cap maxcount at the current size
    maxcount = Math.min(maxcount, domainList.size());
    final ArrayList<Request> cel = new ArrayList<Request>(maxcount);
    long timeout = maxtime == Long.MAX_VALUE ? Long.MAX_VALUE : System.currentTimeMillis() + maxtime;
    for (int i = 0; i < maxcount; i++) {
        // getOne(i) may return null if entries were removed meanwhile
        final byte[] urlhash = domainList.getOne(i);
        if (urlhash == null) continue;
        Row.Entry rowEntry;
        try {
            rowEntry = this.urlFileIndex.get(urlhash, true);
        } catch (final IOException e) {
            // unreadable entry: skip it, best-effort listing
            continue;
        }
        if (rowEntry == null) continue;
        Request crawlEntry;
        try {
            crawlEntry = new Request(rowEntry);
        } catch (final IOException e) {
            continue;
        }
        cel.add(crawlEntry);
        // timeout is checked only after adding, so at least one entry per pass is attempted
        if (System.currentTimeMillis() > timeout) break;
    }
    return cel;
}
|
|
|
|
private void pushHashToDomainStacks(String host, String hosthash, final byte[] urlhash) throws SpaceExceededException {
|
|
// extend domain stack
|
|
if (host == null) host = Domains.LOCALHOST;
|
|
HostHandles hh = this.domainStacks.get(host);
|
|
if (hh == null) {
|
|
// create new list
|
|
HandleSet domainList = new RowHandleSet(Word.commonHashLength, Base64Order.enhancedCoder, 1);
|
|
domainList.put(urlhash);
|
|
this.domainStacks.put(host, new HostHandles(hosthash, domainList));
|
|
} else {
|
|
HandleSet domainList = hh.handleSet;
|
|
// extend existent domain list
|
|
domainList.put(urlhash);
|
|
}
|
|
}
|
|
|
|
private void removeHashFromDomainStacks(String host, final byte[] urlhash) {
|
|
// reduce domain stack
|
|
if (host == null) host = Domains.LOCALHOST;
|
|
HostHandles hh = this.domainStacks.get(host);
|
|
if (hh == null) {
|
|
this.domainStacks.remove(host);
|
|
return;
|
|
}
|
|
HandleSet domainList = hh.handleSet;
|
|
domainList.remove(urlhash);
|
|
if (domainList.isEmpty()) this.domainStacks.remove(host);
|
|
}
|
|
|
|
/**
 * get the next entry in this crawl queue in such a way that the domain access time delta is maximized
 * and always above the given minimum delay time. An additional delay time is computed using the robots.txt
 * crawl-delay time which is always respected. In case the minimum time cannot ensured, this method pauses
 * the necessary time until the url is released and returned as CrawlEntry object. In case that a profile
 * for the computed Entry does not exist, null is returned
 * @param delay true if the requester demands forced delays using explicit thread sleep
 * @param cs crawl switchboard, used to resolve the crawl profile of the popped entry
 * @param robots robots.txt cache, used to compute host-specific wait times
 * @return a url in a CrawlEntry object, or null if nothing eligible was found
 * @throws IOException
 * @throws SpaceExceededException
 */
@Override
public Request pop(final boolean delay, final CrawlSwitchboard cs, final RobotsTxt robots) throws IOException {
    // returns a crawl entry from the stack and ensures minimum delta times

    long sleeptime = 0;
    Request crawlEntry = null;
    CrawlProfile profileEntry = null;
    byte[] failhash = null;
    while (!this.urlFileIndex.isEmpty()) {
        byte[] nexthash = getbest(robots, cs);
        if (nexthash == null) return null;

        synchronized (this) {
            // getbest() ran outside this monitor, so the chosen entry may have
            // been removed meanwhile; the redundant null check is kept defensively
            Row.Entry rowEntry = (nexthash == null) ? null : this.urlFileIndex.remove(nexthash);
            if (rowEntry == null) continue;

            crawlEntry = new Request(rowEntry);

            // check blacklist (again) because the user may have created blacklist entries after the queue has been filled
            if (Switchboard.urlBlacklist.isListed(BlacklistType.CRAWLER, crawlEntry.url())) {
                ConcurrentLog.fine("CRAWLER", "URL '" + crawlEntry.url() + "' is in blacklist.");
                continue;
            }

            // at this point we must check if the crawlEntry has relevance because the crawl profile still exists
            // if not: return null. A calling method must handle the null value and try again
            profileEntry = cs.get(UTF8.getBytes(crawlEntry.profileHandle()));
            if (profileEntry == null) {
                ConcurrentLog.fine("Balancer", "no profile entry for handle " + crawlEntry.profileHandle());
                continue;
            }
            // depending on the caching policy we need sleep time to avoid DoS-like situations
            sleeptime = Latency.getDomainSleepTime(robots, profileEntry, crawlEntry.url());

            assert Base64Order.enhancedCoder.equal(nexthash, rowEntry.getPrimaryKeyBytes()) : "result = " + ASCII.String(nexthash) + ", rowEntry.getPrimaryKeyBytes() = " + ASCII.String(rowEntry.getPrimaryKeyBytes());
            assert Base64Order.enhancedCoder.equal(nexthash, crawlEntry.url().hash()) : "result = " + ASCII.String(nexthash) + ", crawlEntry.url().hash() = " + ASCII.String(crawlEntry.url().hash());

            // NOTE(review): failhash is never assigned in this method, so this
            // guard looks like a leftover from an earlier loop-protection scheme
            if (failhash != null && Base64Order.enhancedCoder.equal(failhash, nexthash)) break; // prevent endless loops
            break;
        }
    }
    if (crawlEntry == null) return null;
    // profileEntry can still be null here if the last loop iteration bailed out
    // after assigning crawlEntry; fall back to the default internet crawler agent
    ClientIdentification.Agent agent = profileEntry == null ? ClientIdentification.yacyInternetCrawlerAgent : profileEntry.getAgent();
    long robotsTime = Latency.getRobotsTime(robots, crawlEntry.url(), agent);
    Latency.updateAfterSelection(crawlEntry.url(), profileEntry == null ? 0 : robotsTime);
    if (delay && sleeptime > 0) {
        // force a busy waiting here
        // in best case, this should never happen if the balancer works properly
        // this is only to protection against the worst case, where the crawler could
        // behave in a DoS-manner
        ConcurrentLog.info("BALANCER", "forcing crawl-delay of " + sleeptime + " milliseconds for " + crawlEntry.url().getHost() + ": " + Latency.waitingRemainingExplain(crawlEntry.url(), robots, agent) + ", domainStacks.size() = " + this.domainStacks.size() + ", domainStacksInitSize = " + this.domStackInitSize);
        // split the sleep time into whole seconds plus a remainder so progress can be logged
        long loops = sleeptime / 1000;
        long rest = sleeptime % 1000;
        if (loops < 3) {
            // short waits are done in one piece without per-second log spam
            rest = rest + 1000 * loops;
            loops = 0;
        }
        Thread.currentThread().setName("Balancer waiting for " +crawlEntry.url().getHost() + ": " + sleeptime + " milliseconds");
        synchronized(this) {
            // must be synchronized here to avoid 'takeover' moves from other threads which then idle the same time which would not be enough
            if (rest > 0) {try {this.wait(rest);} catch (final InterruptedException e) {}}
            for (int i = 0; i < loops; i++) {
                ConcurrentLog.info("BALANCER", "waiting for " + crawlEntry.url().getHost() + ": " + (loops - i) + " seconds remaining...");
                try {this.wait(1000); } catch (final InterruptedException e) {}
            }
        }
        // record the selection again after the enforced delay so the latency
        // bookkeeping reflects the actual access time
        Latency.updateAfterSelection(crawlEntry.url(), robotsTime);
    }
    return crawlEntry;
}
|
|
|
|
/**
 * Select the url hash of the best next crawl target across all domain stacks.
 * Preference order: (1) a previously cached zero-waiting candidate,
 * (2) a freshly scored candidate whose host requires no more waiting,
 * (3) as failover, the candidate with the least remaining waiting time.
 * The chosen hash is removed from its domain stack before being returned.
 * @param robots robots.txt cache used to compute the remaining wait per host
 * @param cs crawl switchboard used to resolve crawl profiles
 * @return a url hash, or null if no candidate could be found
 */
private byte[] getbest(final RobotsTxt robots, final CrawlSwitchboard cs) {

    synchronized (this.zeroWaitingCandidates) {
        // fast path: serve from the cached zero-waiting candidates if possible
        if (this.zeroWaitingCandidates.size() > 0) {
            byte[] urlhash = pickFromZeroWaiting();
            if (urlhash != null) return urlhash;
        }
        this.zeroWaitingCandidates.clear();

        // check if we need to get entries from the file index
        try {
            fillDomainStacks();
        } catch (final IOException e) {
            ConcurrentLog.logException(e);
        }

        // iterate over the domain stacks and score one representative hash per host
        final Iterator<Map.Entry<String, HostHandles>> i = this.domainStacks.entrySet().iterator();
        Map.Entry<String, HostHandles> entry;
        OrderedScoreMap<Map.Entry<String, byte[]>> nextZeroCandidates = new OrderedScoreMap<Map.Entry<String, byte[]>>(null);
        OrderedScoreMap<Map.Entry<String, byte[]>> failoverCandidates = new OrderedScoreMap<Map.Entry<String, byte[]>>(null);
        // only the first never-before-seen host goes into the zero-waiting set with top priority
        int newCandidatesForward = 1;
        while (i.hasNext() && nextZeroCandidates.size() < 1000) {
            entry = i.next();
            final String hostname = entry.getKey();
            final HostHandles hosthandles = entry.getValue();

            // clean up empty entries
            if (hosthandles.handleSet.isEmpty()) {
                i.remove();
                continue;
            }

            final byte[] urlhash = hosthandles.handleSet.getOne(0);
            if (urlhash == null) continue;

            int w;
            Row.Entry rowEntry;
            try {
                rowEntry = this.urlFileIndex.get(urlhash, false);
                if (rowEntry == null) continue; // may have been deleted there meanwhile
                Request crawlEntry = new Request(rowEntry);
                CrawlProfile profileEntry = cs.get(UTF8.getBytes(crawlEntry.profileHandle()));
                if (profileEntry == null) {
                    ConcurrentLog.warn("Balancer", "no profile entry for handle " + crawlEntry.profileHandle());
                    continue;
                }
                // w <= 0 means the host may be accessed now; Integer.MIN_VALUE marks a
                // host that has never been accessed before (see Latency.waitingRemaining)
                w = Latency.waitingRemaining(crawlEntry.url(), robots, profileEntry.getAgent());
            } catch (final IOException e1) {
                ConcurrentLog.warn("Balancer", e1.getMessage(), e1);
                continue;
            }

            if (w <= 0) {
                if (w == Integer.MIN_VALUE) {
                    if (newCandidatesForward-- > 0) {
                        // brand-new host: push it to the front with a high score
                        nextZeroCandidates.set(new AbstractMap.SimpleEntry<String, byte[]>(hostname, urlhash), 10000);
                    } else {
                        failoverCandidates.set(new AbstractMap.SimpleEntry<String, byte[]>(hostname, urlhash), 0);
                    }
                } else {
                    // ready host: score by stack size so large stacks drain first
                    nextZeroCandidates.set(new AbstractMap.SimpleEntry<String, byte[]>(hostname, urlhash), hosthandles.handleSet.size());
                }
            } else {
                // host still needs waiting: keep as failover, scored by wait time
                failoverCandidates.set(new AbstractMap.SimpleEntry<String, byte[]>(hostname, urlhash), w);
            }
        }

        if (!nextZeroCandidates.isEmpty()) {
            // take some of the nextZeroCandidates and put the best into the zeroWaitingCandidates
            int pick = nextZeroCandidates.size() <= 10 ? nextZeroCandidates.size() : Math.max(1, nextZeroCandidates.size() / 3);
            Iterator<Map.Entry<String, byte[]>> k = nextZeroCandidates.keys(false);
            while (k.hasNext() && pick-- > 0) {
                this.zeroWaitingCandidates.add(k.next());
            }
            return pickFromZeroWaiting();
        }

        if (!failoverCandidates.isEmpty()) {
            // bad luck: just take the one with the least waiting time
            Iterator<Map.Entry<String, byte[]>> k = failoverCandidates.keys(true);
            String besthost;
            byte[] besturlhash;
            Map.Entry<String, byte[]> hosthash;
            while (k.hasNext()) {
                hosthash = k.next();
                besthost = hosthash.getKey();
                besturlhash = hosthash.getValue();
                removeHashFromDomainStacks(besthost, besturlhash);
                return besturlhash;
            }
        }

        return null; // this should never happen
    }
}
|
|
|
|
/**
 * Pick one candidate at random from the zero-waiting list, remove its hash
 * from the domain stacks and return it. Candidates whose host no longer has
 * a domain stack are silently skipped; if nothing usable remains, the list
 * is cleared and null is returned.
 * @return a url hash ready to be crawled, or null if the list is exhausted
 */
private byte[] pickFromZeroWaiting() {
    // by random we choose now either from the largest stack or from any of the other stacks
    String host = null;
    byte[] hash = null;
    while (this.zeroWaitingCandidates.size() > 0) {
        Map.Entry<String, byte[]> z = this.zeroWaitingCandidates.remove(this.random.nextInt(this.zeroWaitingCandidates.size()));
        // skip candidates whose domain stack has disappeared meanwhile
        HostHandles hh = this.domainStacks.get(z.getKey());
        if (hh == null) continue;
        host = z.getKey(); if (host == null) continue;
        hash = z.getValue(); if (hash == null) continue;
        removeHashFromDomainStacks(host, hash);
        ConcurrentLog.info("Balancer", "// getbest: picked a random from the zero-waiting stack: " + host + ", zeroWaitingCandidates.size = " + this.zeroWaitingCandidates.size());
        return hash;
    }

    // all candidates were stale: reset the list so getbest() refills it
    this.zeroWaitingCandidates.clear();
    return null;
}
|
|
|
|
/**
 * Rebuild the per-host domain stacks from a random sample of the file index.
 * The refill is throttled: it only happens when the stacks are empty or the
 * last fill is more than one minute old. Blacklisted entries encountered
 * during the scan are deleted from the file index.
 * @throws IOException if reading or cleaning the file index fails
 */
private void fillDomainStacks() throws IOException {
    // throttle: skip the refill if stacks exist and the last fill is recent
    if (!this.domainStacks.isEmpty() && System.currentTimeMillis() - this.lastDomainStackFill < 60000L) return;
    this.domainStacks.clear();
    this.lastDomainStackFill = System.currentTimeMillis();
    // collect hashes of blacklisted entries for deletion after the scan
    final HandleSet blackhandles = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 10);
    String host;
    Request request;
    int count = 0;
    // hard time budget of 5 seconds for the whole sampling loop
    long timeout = System.currentTimeMillis() + 5000;
    for (Row.Entry entry: this.urlFileIndex.random(10000)) {
        if (entry == null) continue;
        request = new Request(entry);

        // check blacklist (again) because the user may have created blacklist entries after the queue has been filled
        if (Switchboard.urlBlacklist.isListed(BlacklistType.CRAWLER, request.url())) {
            ConcurrentLog.fine("CRAWLER", "URL '" + request.url() + "' is in blacklist.");
            // best-effort: if the blackhandle set overflows, the entry simply stays queued
            try {blackhandles.put(entry.getPrimaryKeyBytes());} catch (final SpaceExceededException e) {}
            continue;
        }

        host = request.url().getHost();
        try {
            pushHashToDomainStacks(host, request.url().hosthash(), entry.getPrimaryKeyBytes());
        } catch (final SpaceExceededException e) {
            // out of space: stop filling, work with what we have
            break;
        }
        count++;
        // stop at 1000 distinct hosts, 100000 sampled entries, or on timeout
        if (this.domainStacks.size() >= 1000 || count >= 100000 || System.currentTimeMillis() > timeout) break;
    }

    // if we collected blacklist entries then delete them now
    for (byte[] blackhandle: blackhandles) this.urlFileIndex.remove(blackhandle);

    ConcurrentLog.info("BALANCER", "re-fill of domain stacks; fileIndex.size() = " + this.urlFileIndex.size() + ", domainStacks.size = " + this.domainStacks.size() + ", blackhandles = " + blackhandles.size() + ", collection time = " + (System.currentTimeMillis() - this.lastDomainStackFill) + " ms");
    this.domStackInitSize = this.domainStacks.size();
}
|
|
|
|
/**
 * Iterate over all crawl requests currently held in the file index.
 * @return an iterator over Request objects; next() may return null after a read error
 * @throws IOException if the underlying row iterator cannot be created
 */
@Override
public Iterator<Request> iterator() throws IOException {
    return new EntryIterator();
}
|
|
|
|
private class EntryIterator implements Iterator<Request> {
|
|
|
|
private Iterator<Row.Entry> rowIterator;
|
|
|
|
public EntryIterator() throws IOException {
|
|
this.rowIterator = LegacyBalancer.this.urlFileIndex.rows();
|
|
}
|
|
|
|
@Override
|
|
public boolean hasNext() {
|
|
return (this.rowIterator == null) ? false : this.rowIterator.hasNext();
|
|
}
|
|
|
|
@Override
|
|
public Request next() {
|
|
final Row.Entry entry = this.rowIterator.next();
|
|
try {
|
|
return (entry == null) ? null : new Request(entry);
|
|
} catch (final IOException e) {
|
|
ConcurrentLog.logException(e);
|
|
this.rowIterator = null;
|
|
return null;
|
|
}
|
|
}
|
|
|
|
@Override
|
|
public void remove() {
|
|
if (this.rowIterator != null) this.rowIterator.remove();
|
|
}
|
|
|
|
}
|
|
|
|
/**
 * Remove all entries that belong to the given set of host hashes.
 * NOTE(review): not implemented in this legacy balancer — it always reports
 * zero deletions and leaves the queue untouched.
 * @param hosthashes set of host hashes whose entries should be removed
 * @return always 0 (nothing is removed)
 */
@Override
public int removeAllByHostHashes(Set<String> hosthashes) {
    return 0;
}
|
|
|
|
}
|