yacy_search_server/source/net/yacy/crawler/Balancer.java
Michael Peter Christen da86f150ab - added a new Crawler Balancer: HostBalancer and HostQueues:
This organizes all urls to be loaded in separate queues for each host.
Each host separates the crawl depth into it's own queue. The primary
rule for urls taken from any queue is, that the crawl depth is minimal.
This produces a crawl depth which is identical to the clickdepth.
Furthermorem the crawl is able to create a much better balancing over
all hosts which is fair to all hosts that are in the queue.
This process will create a very large number of files for wide crawls in
the QUEUES folder: for each host a directory, for each crawl depth a
file inside the directory. A crawl with maxdepth = 4 will be able to
create 10.000s of files. To be able to use that many file readers, it
was necessary to implement a new index data structure which opens the
file only if an access is wanted (OnDemandOpenFileIndex). The usage of
such on-demand file reader shall prevent that the number of file
pointers is over the system limit, which is usually about 10.000 open
files. Some parts of YaCy had to be adopted to handle the crawl depth
number correctly. The logging and the IndexCreateQueues servlet had to
be adopted to show the crawl queues differently, because the host name
is attached to the port on the host to differentiate between http,
https, and ftp services.
2014-04-16 21:34:28 +02:00

145 lines
5.0 KiB
Java

/**
* Balancer
* Copyright 2014 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
* First released 14.04.2014 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.crawler;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.retrieval.Request;
import net.yacy.crawler.robots.RobotsTxt;
public interface Balancer {
/**
* close the balancer object
*/
public void close();
/**
* delete all urls from the stack
*/
public void clear();
/**
* get one url from the crawl stack
* @param urlhash
* @return the request for an url by given url hash
* @throws IOException
*/
public Request get(final byte[] urlhash) throws IOException;
/**
* delete all urls from the stack by given profile handle
* @param profileHandle
* @param timeout
* @return the number of removed urls
* @throws IOException
* @throws SpaceExceededException
*/
public int removeAllByProfileHandle(final String profileHandle, final long timeout) throws IOException, SpaceExceededException;
/**
* delete all urls which are stored for given host hashes
* @param hosthashes
* @return number of deleted urls
*/
public int removeAllByHostHashes(final Set<String> hosthashes);
/**
* @param urlHashes, a list of hashes that shall be removed
* @return number of entries that had been removed
* @throws IOException
*/
public int remove(final HandleSet urlHashes) throws IOException;
/**
* check if given url hash is contained in the balancer stack
* @param urlhashb
* @return true if the url is queued here, false otherwise
*/
public boolean has(final byte[] urlhashb);
/**
* get the size of the stack
* @return the number of urls waiting to be loaded
*/
public int size();
/**
* check if stack is empty
* @return true iff size() == 0
*/
public boolean isEmpty();
/**
* push a crawl request on the balancer stack
* @param entry
* @return null if this was successful or a String explaining what went wrong in case of an error
* @throws IOException
* @throws SpaceExceededException
*/
public String push(final Request entry, CrawlProfile profile, final RobotsTxt robots) throws IOException, SpaceExceededException;
/**
* get a list of domains that are currently maintained as domain stacks
* @return a map of clear text strings of host names to an integer array: {the size of the domain stack, guessed delta waiting time}
*/
public Map<String, Integer[]> getDomainStackHosts(RobotsTxt robots);
/**
* get lists of crawl request entries for a specific host
* @param host
* @param maxcount
* @param maxtime
* @return a list of crawl loader requests
*/
public List<Request> getDomainStackReferences(final String host, int maxcount, final long maxtime);
/**
* get the next entry in this crawl queue in such a way that the domain access time delta is maximized
* and always above the given minimum delay time. An additional delay time is computed using the robots.txt
* crawl-delay time which is always respected. In case the minimum time cannot ensured, this method pauses
* the necessary time until the url is released and returned as CrawlEntry object. In case that a profile
* for the computed Entry does not exist, null is returned
* @param delay true if the requester demands forced delays using explicit thread sleep
* @param profile
* @return a url in a CrawlEntry object
* @throws IOException
* @throws SpaceExceededException
*/
public Request pop(final boolean delay, final CrawlSwitchboard cs, final RobotsTxt robots) throws IOException;
/**
* iterate through all requests in the queue
* @return
* @throws IOException
*/
public Iterator<Request> iterator() throws IOException;
}