mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-21 00:00:13 +02:00
765943a4b7
in intranets and the internet can now choose to appear as Googlebot. This is an essential necessity to be able to compete in the field of commercial search appliances, since most web pages are these days optimized only for Google and no other search platform any more. All commercial search engine providers have a built-in fake-Google User Agent to be able to get the same search index as Google can do. Without the resistance against obeying to robots.txt in this case, no competition is possible any more. YaCy will always obey the robots.txt when it is used for crawling the web in a peer-to-peer network, but to establish a Search Appliance (like a Google Search Appliance, GSA) it is necessary to be able to behave exactly like a Google crawler. With this change, you will be able to switch the user agent when portal or intranet mode is selected on per-crawl-start basis. Every crawl start can have a different user agent.
309 lines
13 KiB
Java
// NoticedURL.java
// -----------------------
// part of YaCy
// (C) by Michael Peter Christen; mc@yacy.net
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

// NURL - noticed (known but not loaded) URL's
|
package net.yacy.crawler.data;
|
|
|
|
import java.io.File;
|
|
import java.io.IOException;
|
|
import java.util.HashSet;
|
|
import java.util.Iterator;
|
|
import java.util.List;
|
|
import java.util.Map;
|
|
|
|
import net.yacy.cora.order.Base64Order;
|
|
import net.yacy.cora.storage.HandleSet;
|
|
import net.yacy.cora.util.ConcurrentLog;
|
|
import net.yacy.cora.util.SpaceExceededException;
|
|
import net.yacy.crawler.Balancer;
|
|
import net.yacy.crawler.CrawlSwitchboard;
|
|
import net.yacy.crawler.retrieval.Request;
|
|
import net.yacy.crawler.robots.RobotsTxt;
|
|
import net.yacy.kelondro.index.RowHandleSet;
|
|
|
|
public class NoticedURL {
|
|
|
|
public enum StackType {
|
|
LOCAL, GLOBAL, REMOTE, NOLOAD;
|
|
}
|
|
|
|
private Balancer coreStack; // links found by crawling to depth-1
|
|
private Balancer limitStack; // links found by crawling at target depth
|
|
private Balancer remoteStack; // links from remote crawl orders
|
|
private Balancer noloadStack; // links that are not passed to a loader; the index will be generated from the Request entry
|
|
|
|
protected NoticedURL(
|
|
final File cachePath,
|
|
final boolean useTailCache,
|
|
final boolean exceed134217727) {
|
|
ConcurrentLog.info("NoticedURL", "CREATING STACKS at " + cachePath.toString());
|
|
this.coreStack = new Balancer(cachePath, "urlNoticeCoreStack", useTailCache, exceed134217727);
|
|
this.limitStack = new Balancer(cachePath, "urlNoticeLimitStack", useTailCache, exceed134217727);
|
|
//overhangStack = new plasmaCrawlBalancer(overhangStackFile);
|
|
this.remoteStack = new Balancer(cachePath, "urlNoticeRemoteStack", useTailCache, exceed134217727);
|
|
this.noloadStack = new Balancer(cachePath, "urlNoticeNoLoadStack", useTailCache, exceed134217727);
|
|
}
|
|
|
|
public void clear() {
|
|
ConcurrentLog.info("NoticedURL", "CLEARING ALL STACKS");
|
|
if (this.coreStack != null) this.coreStack.clear();
|
|
if (this.limitStack != null) this.limitStack.clear();
|
|
if (this.remoteStack != null) this.remoteStack.clear();
|
|
if (this.noloadStack != null) this.noloadStack.clear();
|
|
}
|
|
|
|
protected void close() {
|
|
ConcurrentLog.info("NoticedURL", "CLOSING ALL STACKS");
|
|
if (this.coreStack != null) {
|
|
this.coreStack.close();
|
|
this.coreStack = null;
|
|
}
|
|
if (this.limitStack != null) {
|
|
this.limitStack.close();
|
|
this.limitStack = null;
|
|
}
|
|
//overhangStack.close();
|
|
if (this.remoteStack != null) {
|
|
this.remoteStack.close();
|
|
this.remoteStack = null;
|
|
}
|
|
if (this.noloadStack != null) {
|
|
this.noloadStack.close();
|
|
this.noloadStack = null;
|
|
}
|
|
}
|
|
|
|
@Override
|
|
protected void finalize() throws Throwable {
|
|
if ((this.coreStack != null) || (this.limitStack != null) || (this.remoteStack != null)) {
|
|
ConcurrentLog.warn("plasmaCrawlNURL", "NURL stack closed by finalizer");
|
|
close();
|
|
}
|
|
super.finalize();
|
|
}
|
|
|
|
public boolean notEmpty() {
|
|
return this.coreStack.notEmpty() || this.limitStack.notEmpty() || this.remoteStack.notEmpty() || this.noloadStack.notEmpty();
|
|
}
|
|
|
|
public boolean notEmptyLocal() {
|
|
return this.coreStack.notEmpty() || this.limitStack.notEmpty() || this.noloadStack.notEmpty();
|
|
}
|
|
|
|
public int size() {
|
|
// this does not count the overhang stack size
|
|
return ((this.coreStack == null) ? 0 : this.coreStack.size()) + ((this.limitStack == null) ? 0 : this.limitStack.size()) + ((this.remoteStack == null) ? 0 : this.remoteStack.size());
|
|
}
|
|
|
|
public boolean isEmpty() {
|
|
if (this.coreStack == null) return true;
|
|
if (!this.coreStack.isEmpty()) return false;
|
|
if (!this.limitStack.isEmpty()) return false;
|
|
if (!this.remoteStack.isEmpty()) return false;
|
|
if (!this.noloadStack.isEmpty()) return false;
|
|
return true;
|
|
}
|
|
|
|
public int stackSize(final StackType stackType) {
|
|
switch (stackType) {
|
|
case NOLOAD: return (this.noloadStack == null) ? 0 : this.noloadStack.size();
|
|
case LOCAL: return (this.coreStack == null) ? 0 : this.coreStack.size();
|
|
case GLOBAL: return (this.limitStack == null) ? 0 : this.limitStack.size();
|
|
case REMOTE: return (this.remoteStack == null) ? 0 : this.remoteStack.size();
|
|
default: return -1;
|
|
}
|
|
}
|
|
|
|
protected boolean existsInStack(final byte[] urlhashb) {
|
|
return
|
|
this.coreStack.has(urlhashb) ||
|
|
this.limitStack.has(urlhashb) ||
|
|
//overhangStack.has(urlhashb) ||
|
|
this.remoteStack.has(urlhashb) ||
|
|
this.noloadStack.has(urlhashb);
|
|
}
|
|
|
|
/**
|
|
* push a crawl request on one of the different crawl stacks
|
|
* @param stackType
|
|
* @param entry
|
|
* @return null if this was successful or a String explaining what went wrong in case of an error
|
|
*/
|
|
public String push(final StackType stackType, final Request entry, CrawlProfile profile, final RobotsTxt robots) {
|
|
try {
|
|
switch (stackType) {
|
|
case LOCAL: return this.coreStack.push(entry, profile, robots);
|
|
case GLOBAL: return this.limitStack.push(entry, profile, robots);
|
|
case REMOTE: return this.remoteStack.push(entry, profile, robots);
|
|
case NOLOAD: return this.noloadStack.push(entry, profile, robots);
|
|
default: return "stack type unknown";
|
|
}
|
|
} catch (final Exception er) {
|
|
ConcurrentLog.logException(er);
|
|
return "error pushing onto the crawl stack: " + er.getMessage();
|
|
}
|
|
}
|
|
|
|
protected Request get(final byte[] urlhash) {
|
|
Request entry = null;
|
|
try {if ((entry = this.noloadStack.get(urlhash)) != null) return entry;} catch (final IOException e) {}
|
|
try {if ((entry = this.coreStack.get(urlhash)) != null) return entry;} catch (final IOException e) {}
|
|
try {if ((entry = this.limitStack.get(urlhash)) != null) return entry;} catch (final IOException e) {}
|
|
try {if ((entry = this.remoteStack.get(urlhash)) != null) return entry;} catch (final IOException e) {}
|
|
return null;
|
|
}
|
|
|
|
/**
|
|
* remove a CrawlEntry by a given hash. Usage of this method is not encouraged,
|
|
* because the underlying data structure (crawl stacks) cannot handle removals very good.
|
|
* @param urlhash
|
|
* @return true, if the entry was removed; false if not
|
|
*/
|
|
public boolean removeByURLHash(final byte[] urlhashBytes) {
|
|
try {
|
|
final HandleSet urlHashes = new RowHandleSet(12, Base64Order.enhancedCoder, 1);
|
|
urlHashes.put(urlhashBytes);
|
|
boolean ret = false;
|
|
try {ret |= this.noloadStack.remove(urlHashes) > 0;} catch (final IOException e) {}
|
|
try {ret |= this.coreStack.remove(urlHashes) > 0;} catch (final IOException e) {}
|
|
try {ret |= this.limitStack.remove(urlHashes) > 0;} catch (final IOException e) {}
|
|
try {ret |= this.remoteStack.remove(urlHashes) > 0;} catch (final IOException e) {}
|
|
return ret;
|
|
} catch (final SpaceExceededException e) {
|
|
ConcurrentLog.logException(e);
|
|
return false;
|
|
}
|
|
}
|
|
|
|
public int removeByProfileHandle(final String handle, final long timeout) throws SpaceExceededException {
|
|
int removed = 0;
|
|
try {removed += this.noloadStack.removeAllByProfileHandle(handle, timeout);} catch (final IOException e) {}
|
|
try {removed += this.coreStack.removeAllByProfileHandle(handle, timeout);} catch (final IOException e) {}
|
|
try {removed += this.limitStack.removeAllByProfileHandle(handle, timeout);} catch (final IOException e) {}
|
|
try {removed += this.remoteStack.removeAllByProfileHandle(handle, timeout);} catch (final IOException e) {}
|
|
return removed;
|
|
}
|
|
|
|
/**
|
|
* get a list of domains that are currently maintained as domain stacks
|
|
* @return a map of clear text strings of host names to two integers: the size of the domain stacks and the access delta time
|
|
*/
|
|
public Map<String, Integer[]> getDomainStackHosts(final StackType stackType, RobotsTxt robots) {
|
|
switch (stackType) {
|
|
case LOCAL: return this.coreStack.getDomainStackHosts(robots);
|
|
case GLOBAL: return this.limitStack.getDomainStackHosts(robots);
|
|
case REMOTE: return this.remoteStack.getDomainStackHosts(robots);
|
|
case NOLOAD: return this.noloadStack.getDomainStackHosts(robots);
|
|
default: return null;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* get lists of crawl request entries for a specific host
|
|
* @param host
|
|
* @param maxcount
|
|
* @return a list of crawl loader requests
|
|
*/
|
|
public List<Request> getDomainStackReferences(final StackType stackType, String host, int maxcount, final long maxtime) {
|
|
switch (stackType) {
|
|
case LOCAL: return this.coreStack.getDomainStackReferences(host, maxcount, maxtime);
|
|
case GLOBAL: return this.limitStack.getDomainStackReferences(host, maxcount, maxtime);
|
|
case REMOTE: return this.remoteStack.getDomainStackReferences(host, maxcount, maxtime);
|
|
case NOLOAD: return this.noloadStack.getDomainStackReferences(host, maxcount, maxtime);
|
|
default: return null;
|
|
}
|
|
}
|
|
|
|
public Request pop(final StackType stackType, final boolean delay, final CrawlSwitchboard cs, final RobotsTxt robots) throws IOException {
|
|
switch (stackType) {
|
|
case LOCAL: return pop(this.coreStack, delay, cs, robots);
|
|
case GLOBAL: return pop(this.limitStack, delay, cs, robots);
|
|
case REMOTE: return pop(this.remoteStack, delay, cs, robots);
|
|
case NOLOAD: return pop(this.noloadStack, false, cs, robots);
|
|
default: return null;
|
|
}
|
|
}
|
|
|
|
protected void shift(final StackType fromStack, final StackType toStack, final CrawlSwitchboard cs, final RobotsTxt robots) {
|
|
try {
|
|
final Request entry = pop(fromStack, false, cs, robots);
|
|
if (entry != null) {
|
|
final String warning = push(toStack, entry, null, robots);
|
|
if (warning != null) {
|
|
ConcurrentLog.warn("NoticedURL", "shift from " + fromStack + " to " + toStack + ": " + warning);
|
|
}
|
|
}
|
|
} catch (final IOException e) {
|
|
return;
|
|
}
|
|
}
|
|
|
|
public void clear(final StackType stackType) {
|
|
ConcurrentLog.info("NoticedURL", "CLEARING STACK " + stackType);
|
|
switch (stackType) {
|
|
case LOCAL: this.coreStack.clear(); break;
|
|
case GLOBAL: this.limitStack.clear(); break;
|
|
case REMOTE: this.remoteStack.clear(); break;
|
|
case NOLOAD: this.noloadStack.clear(); break;
|
|
default: return;
|
|
}
|
|
}
|
|
|
|
private static Request pop(final Balancer balancer, final boolean delay, final CrawlSwitchboard cs, final RobotsTxt robots) throws IOException {
|
|
// this is a filo - pop
|
|
int s;
|
|
Request entry;
|
|
int errors = 0;
|
|
while ((s = balancer.size()) > 0) {
|
|
entry = balancer.pop(delay, cs, robots);
|
|
if (entry == null) {
|
|
if (s > balancer.size()) continue;
|
|
errors++;
|
|
if (errors < 100) continue;
|
|
final int aftersize = balancer.size();
|
|
balancer.clear(); // the balancer is broken and cannot shrink
|
|
ConcurrentLog.warn("BALANCER", "entry is null, balancer cannot shrink (bevore pop = " + s + ", after pop = " + aftersize + "); reset of balancer");
|
|
}
|
|
return entry;
|
|
}
|
|
return null;
|
|
}
|
|
|
|
public Iterator<Request> iterator(final StackType stackType) {
|
|
// returns an iterator of plasmaCrawlBalancerEntry Objects
|
|
try {switch (stackType) {
|
|
case LOCAL: return this.coreStack.iterator();
|
|
case GLOBAL: return this.limitStack.iterator();
|
|
case REMOTE: return this.remoteStack.iterator();
|
|
case NOLOAD: return this.noloadStack.iterator();
|
|
default: return null;
|
|
}} catch (final IOException e) {
|
|
return new HashSet<Request>().iterator();
|
|
}
|
|
}
|
|
|
|
}
|