/**
* Balancer
* SPDX-FileCopyrightText: 2014 Michael Peter Christen <mc@yacy.net>
* SPDX-License-Identifier: GPL-2.0-or-later
* Frankfurt am Main, Germany
* First released 14.04.2014 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.crawler;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.retrieval.Request;
import net.yacy.crawler.robots.RobotsTxt;
/**
 * A Balancer holds the crawler's queue of pending load requests, organized as
 * per-host ("domain") stacks so that requests can be popped in an order that
 * respects per-host crawl delays.
 */
public interface Balancer {

    /**
     * close the balancer object and release its resources
     */
    public void close();

    /**
     * delete all urls from the stack
     */
    public void clear();

    /**
     * get one url from the crawl stack
     * @param urlhash the hash of the url to look up
     * @return the request for an url by given url hash
     * @throws IOException
     */
    public Request get(final byte[] urlhash) throws IOException;

    /**
     * delete all urls from the stack by given profile handle
     * @param profileHandle the crawl profile handle whose entries shall be removed
     * @param timeout maximum time allowed for the removal operation
     * @return the number of removed urls
     * @throws IOException
     * @throws SpaceExceededException
     */
    public int removeAllByProfileHandle(final String profileHandle, final long timeout) throws IOException, SpaceExceededException;

    /**
     * delete all urls which are stored for given host hashes
     * @param hosthashes the set of host hashes whose entries shall be removed
     * @return number of deleted urls
     */
    public int removeAllByHostHashes(final Set<String> hosthashes);

    /**
     * remove a set of urls from the stack
     * @param urlHashes a set of url hashes that shall be removed
     * @return number of entries that had been removed
     * @throws IOException
     */
    public int remove(final HandleSet urlHashes) throws IOException;

    /**
     * check if given url hash is contained in the balancer stack
     * @param urlhashb the url hash to check
     * @return true if the url is queued here, false otherwise
     */
    public boolean has(final byte[] urlhashb);

    /**
     * get the size of the stack
     * @return the number of urls waiting to be loaded
     */
    public int size();

    /**
     * NOTE(review): semantics not visible from this interface — presumably the
     * maximum number of hosts handled with on-demand domain stacks; confirm
     * against the implementation.
     * @return the on-demand limit
     */
    public int getOnDemandLimit();

    /**
     * NOTE(review): 134217727 == 2^27 - 1; presumably a flag telling whether a
     * storage size limit may be exceeded — confirm against the implementation.
     * @return the exceed flag
     */
    public boolean getExceed134217727();

    /**
     * check if stack is empty
     * @return true iff size() == 0
     */
    public boolean isEmpty();

    /**
     * push a crawl request on the balancer stack
     * @param entry the crawl request to be queued
     * @param profile the crawl profile associated with the request
     * @param robots the robots.txt handler
     * @return null if this was successful or a String explaining what went wrong in case of an error
     * @throws IOException
     * @throws SpaceExceededException
     */
    public String push(final Request entry, CrawlProfile profile, final RobotsTxt robots) throws IOException, SpaceExceededException;

    /**
     * get a list of domains that are currently maintained as domain stacks
     * @param robots the robots.txt handler used to guess waiting times
     * @return a map of clear text strings of host names (each host name eventually
     *         concatenated with a port, depending on the implementation) to an
     *         integer array: {the size of the domain stack, guessed delta waiting time}
     */
    public Map<String, Integer[]> getDomainStackHosts(RobotsTxt robots);

    /**
     * get lists of crawl request entries for a specific host
     * @param host the host name whose queued requests shall be returned
     * @param maxcount maximum number of entries to return
     * @param maxtime maximum time allowed for collecting the entries
     * @return a list of crawl loader requests
     */
    public List<Request> getDomainStackReferences(final String host, int maxcount, final long maxtime);

    /**
     * get the next entry in this crawl queue in such a way that the domain access
     * time delta is maximized and always above the given minimum delay time. In
     * case the minimum time cannot be ensured, this method pauses the necessary
     * time until the url is released and returned as CrawlEntry object. In case
     * that a profile for the computed Entry does not exist, null is returned
     * @param delay true if the requester demands forced delays using explicit thread sleep
     * @param cs the crawl switchboard holding the crawl profiles
     * @param robots the robots.txt handler used to compute crawl delays
     * @return a url in a CrawlEntry object
     * @throws IOException
     */
    public Request pop(final boolean delay, final CrawlSwitchboard cs, final RobotsTxt robots) throws IOException;

    /**
     * iterate through all requests in the queue
     * @return an iterator over all queued crawl requests
     * @throws IOException
     */
    public Iterator<Request> iterator() throws IOException;
}