refactoring of load_delay: this is a matter of client identification

Michael Peter Christen 2013-07-12 16:24:56 +02:00
parent 0d0b3a30f5
commit bcc623a843
15 changed files with 27 additions and 35 deletions
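In short: the crawler's minimum load delay moves out of CrawlQueues and into ClientIdentification, reachable through an accessor instead of a bare constant. A condensed before/after sketch of the move, with both class bodies trimmed to just the members this commit touches:

    // before this commit: the delay was a crawler-queue concern
    public class CrawlQueues {
        public static final long queuedMinLoadDelay = 500;
    }

    // after this commit: the delay belongs to the client identification
    public class ClientIdentification {
        public static final long MIN_LOAD_DELAY = 500;

        public static long minLoadDelay() {
            return MIN_LOAD_DELAY;
        }
    }

All call sites then change mechanically from CrawlQueues.queuedMinLoadDelay to ClientIdentification.minLoadDelay(), and the CrawlQueues import is dropped wherever it is no longer needed.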

View File

@@ -28,7 +28,6 @@ import net.yacy.cora.federate.yacy.CacheStrategy;
 import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.util.ConcurrentLog;
-import net.yacy.crawler.data.CrawlQueues;
 import net.yacy.crawler.retrieval.Request;
 import net.yacy.crawler.retrieval.Response;
 import net.yacy.crawler.robots.RobotsTxtEntry;
@@ -88,19 +87,19 @@ public class CrawlCheck_p {
 robotsEntry = sb.robots.getEntry(u, sb.peers.myBotIDs());
 if (robotsEntry == null) {
 prop.put("table_list_" + row + "_robots", "no robots");
-prop.put("table_list_" + row + "_crawldelay", CrawlQueues.queuedMinLoadDelay + " ms");
+prop.put("table_list_" + row + "_crawldelay", ClientIdentification.minLoadDelay() + " ms");
 prop.put("table_list_" + row + "_sitemap", "");
 } else {
 robotsAllowed = !robotsEntry.isDisallowed(u);
 prop.put("table_list_" + row + "_robots", "robots exist: " + (robotsAllowed ? "crawl allowed" : "url disallowed"));
-prop.put("table_list_" + row + "_crawldelay", Math.max(CrawlQueues.queuedMinLoadDelay, robotsEntry.getCrawlDelayMillis()) + " ms");
+prop.put("table_list_" + row + "_crawldelay", Math.max(ClientIdentification.minLoadDelay(), robotsEntry.getCrawlDelayMillis()) + " ms");
 prop.put("table_list_" + row + "_sitemap", robotsEntry.getSitemap() == null ? "-" : robotsEntry.getSitemap().toNormalform(true));
 }
 // try to load the url
 if (robotsAllowed) try {
 Request request = sb.loader.request(u, true, false);
-final Response response = sb.loader.load(request, CacheStrategy.NOCACHE, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
+final Response response = sb.loader.load(request, CacheStrategy.NOCACHE, BlacklistType.CRAWLER, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
 if (response == null) {
 prop.put("table_list_" + row + "_access", "no response");
 } else {

View File

@@ -43,7 +43,6 @@ import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.cora.util.SpaceExceededException;
 import net.yacy.crawler.CrawlSwitchboard;
 import net.yacy.crawler.data.CrawlProfile;
-import net.yacy.crawler.data.CrawlQueues;
 import net.yacy.crawler.data.ZURL.FailCategory;
 import net.yacy.crawler.retrieval.Request;
 import net.yacy.crawler.retrieval.SitemapImporter;
@@ -288,7 +287,7 @@ public class Crawler_p {
 // download document
 Document scraper;
 try {
-scraper = sb.loader.loadDocument(sitelistURL, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
+scraper = sb.loader.loadDocument(sitelistURL, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
 // get links and generate filter
 for (DigestURI u: scraper.getAnchors().keySet()) {
 newRootURLs.add(u);

View File

@@ -27,7 +27,6 @@ import net.yacy.cora.geo.OpenGeoDBLocation;
 import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.util.ConcurrentLog;
-import net.yacy.crawler.data.CrawlQueues;
 import net.yacy.crawler.retrieval.Response;
 import net.yacy.document.LibraryProvider;
 import net.yacy.kelondro.data.meta.DigestURI;
@@ -67,7 +66,7 @@ public class DictionaryLoader_p {
 if (post.containsKey("geon0Load")) {
 // load from the net
 try {
-final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
+final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
 final byte[] b = response.getContent();
 FileUtils.copy(b, LibraryProvider.Dictionary.GEON0.file());
 LibraryProvider.geoLoc.activateLocation(LibraryProvider.Dictionary.GEON0.nickname, new GeonamesLocation(LibraryProvider.Dictionary.GEON0.file(), null, -1));
@@ -109,7 +108,7 @@ public class DictionaryLoader_p {
 if (post.containsKey("geon1Load")) {
 // load from the net
 try {
-final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON1.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
+final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON1.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
 final byte[] b = response.getContent();
 FileUtils.copy(b, LibraryProvider.Dictionary.GEON1.file());
 LibraryProvider.geoLoc.activateLocation(LibraryProvider.Dictionary.GEON1.nickname, new GeonamesLocation(LibraryProvider.Dictionary.GEON1.file(), null, -1));
@@ -151,7 +150,7 @@ public class DictionaryLoader_p {
 if (post.containsKey("geon2Load")) {
 // load from the net
 try {
-final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON2.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
+final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON2.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
 final byte[] b = response.getContent();
 FileUtils.copy(b, LibraryProvider.Dictionary.GEON2.file());
 LibraryProvider.geoLoc.activateLocation(LibraryProvider.Dictionary.GEON2.nickname, new GeonamesLocation(LibraryProvider.Dictionary.GEON2.file(), null, 100000));
@@ -193,7 +192,7 @@ public class DictionaryLoader_p {
 if (post.containsKey("geo1Load")) {
 // load from the net
 try {
-final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEODB1.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
+final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEODB1.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
 final byte[] b = response.getContent();
 FileUtils.copy(b, LibraryProvider.Dictionary.GEODB1.file());
 LibraryProvider.geoLoc.deactivateLocalization(LibraryProvider.Dictionary.GEODB1.nickname);
@@ -236,7 +235,7 @@ public class DictionaryLoader_p {
 if (post.containsKey("drw0Load")) {
 // load from the net
 try {
-final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.DRW0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
+final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.DRW0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
 final byte[] b = response.getContent();
 FileUtils.copy(b, LibraryProvider.Dictionary.DRW0.file());
 LibraryProvider.activateDeReWo();
@@ -280,7 +279,7 @@ public class DictionaryLoader_p {
 if (post.containsKey("pnd0Load")) {
 // load from the net
 try {
-final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.PND0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
+final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.PND0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
 final byte[] b = response.getContent();
 FileUtils.copy(b, LibraryProvider.Dictionary.PND0.file());
 LibraryProvider.activatePND();

View File

@@ -42,7 +42,6 @@ import net.yacy.cora.util.CommonPattern;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.cora.util.SpaceExceededException;
 import net.yacy.crawler.HarvestProcess;
-import net.yacy.crawler.data.CrawlQueues;
 import net.yacy.crawler.retrieval.RSSLoader;
 import net.yacy.crawler.retrieval.Response;
 import net.yacy.data.WorkTables;
@@ -267,7 +266,7 @@ public class Load_RSS_p {
 RSSReader rss = null;
 if (url != null) try {
 prop.put("url", url.toNormalform(true));
-final Response response = sb.loader.load(sb.loader.request(url, true, false), CacheStrategy.NOCACHE, Integer.MAX_VALUE, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
+final Response response = sb.loader.load(sb.loader.request(url, true, false), CacheStrategy.NOCACHE, Integer.MAX_VALUE, BlacklistType.CRAWLER, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
 final byte[] resource = response == null ? null : response.getContent();
 rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource);
 } catch (final IOException e) {

View File

@@ -45,7 +45,6 @@ import net.yacy.cora.lod.vocabulary.YaCyMetadata;
 import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.crawler.data.Cache;
-import net.yacy.crawler.data.CrawlQueues;
 import net.yacy.crawler.retrieval.Response;
 import net.yacy.document.Condenser;
 import net.yacy.document.Document;
@@ -169,7 +168,7 @@ public class ViewFile {
 Response response = null;
 try {
-response = sb.loader.load(sb.loader.request(url, true, false), authorized ? CacheStrategy.IFEXIST : CacheStrategy.CACHEONLY, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
+response = sb.loader.load(sb.loader.request(url, true, false), authorized ? CacheStrategy.IFEXIST : CacheStrategy.CACHEONLY, Integer.MAX_VALUE, null, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
 } catch (final IOException e) {
 prop.put("error", "4");
 prop.put("error_errorText", "error loading resource: " + e.getMessage());

View File

@@ -39,7 +39,6 @@ import net.yacy.cora.protocol.HeaderFramework;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.storage.ConcurrentARC;
 import net.yacy.cora.util.ConcurrentLog;
-import net.yacy.crawler.data.CrawlQueues;
 import net.yacy.data.URLLicense;
 import net.yacy.document.ImageParser;
 import net.yacy.kelondro.data.meta.DigestURI;
@@ -105,7 +104,7 @@ public class ViewImage {
 if (image == null) {
 byte[] resourceb = null;
 if (url != null) try {
-resourceb = sb.loader.loadContent(sb.loader.request(url, false, true), CacheStrategy.IFEXIST, BlacklistType.SEARCH, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
+resourceb = sb.loader.loadContent(sb.loader.request(url, false, true), CacheStrategy.IFEXIST, BlacklistType.SEARCH, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
 } catch (final IOException e) {
 ConcurrentLog.fine("ViewImage", "cannot load: " + e.getMessage());
 }

View File

@@ -37,7 +37,6 @@ import net.yacy.cora.federate.yacy.CacheStrategy;
 import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.util.ConcurrentLog;
-import net.yacy.crawler.data.CrawlQueues;
 import net.yacy.crawler.robots.RobotsTxtEntry;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.repository.Blacklist.BlacklistType;
@@ -97,7 +96,7 @@ public class getpageinfo {
 }
 net.yacy.document.Document scraper = null;
 if (u != null) try {
-scraper = sb.loader.loadDocument(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
+scraper = sb.loader.loadDocument(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
 } catch (final IOException e) {
 ConcurrentLog.logException(e);
 // bad things are possible, i.e. that the Server responds with "403 Bad Behavior"

View File

@@ -37,7 +37,6 @@ import net.yacy.cora.federate.yacy.CacheStrategy;
 import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.util.ConcurrentLog;
-import net.yacy.crawler.data.CrawlQueues;
 import net.yacy.crawler.robots.RobotsTxtEntry;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.repository.Blacklist.BlacklistType;
@@ -97,7 +96,7 @@ public class getpageinfo_p {
 }
 net.yacy.document.Document scraper = null;
 if (u != null) try {
-scraper = sb.loader.loadDocument(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
+scraper = sb.loader.loadDocument(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
 } catch (final IOException e) {
 ConcurrentLog.logException(e);
 // bad things are possible, i.e. that the Server responds with "403 Bad Behavior"

View File

@@ -35,7 +35,6 @@ import net.yacy.cora.order.Base64Order;
 import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.util.ConcurrentLog;
-import net.yacy.crawler.data.CrawlQueues;
 import net.yacy.kelondro.data.citation.CitationReference;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.rwi.IndexCell;
@@ -98,7 +97,7 @@ public class webstructure {
 prop.put("references", 1);
 net.yacy.document.Document scraper = null;
 if (url != null) try {
-scraper = sb.loader.loadDocument(url, CacheStrategy.IFEXIST, null, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
+scraper = sb.loader.loadDocument(url, CacheStrategy.IFEXIST, null, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
 } catch (final IOException e) {
 ConcurrentLog.logException(e);
 }

View File

@@ -27,6 +27,7 @@ package net.yacy.cora.protocol;
 public class ClientIdentification {
+public static final long MIN_LOAD_DELAY = 500;
 public static final int DEFAULT_TIMEOUT = 10000;
 public static final int minimumLocalDeltaInit = 10; // the minimum time difference between access of the same local domain
 public static final int minimumGlobalDeltaInit = 500; // the minimum time difference between access of the same global domain
@@ -118,4 +119,8 @@
 return location;
 }
+
+public static long minLoadDelay() {
+return MIN_LOAD_DELAY;
+}
 }
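Note that callers go through the minLoadDelay() accessor rather than reading MIN_LOAD_DELAY directly. That keeps the option open to derive the delay from the active client identification later (for example, per agent profile) without touching the call sites again, which is presumably the "matter of client identification" the commit message refers to.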

View File

@@ -63,7 +63,6 @@ import net.yacy.search.SwitchboardConstants;
 public class CrawlQueues {
-public static final long queuedMinLoadDelay = 500;
 private static final String ERROR_DB_FILENAME = "urlError4.db";
 private static final String DELEGATED_DB_FILENAME = "urlDelegated4.db";
@@ -654,7 +653,7 @@
 try {
 this.request.setStatus("loading", WorkflowJob.STATUS_RUNNING);
 final CrawlProfile e = CrawlQueues.this.sb.crawler.getActive(UTF8.getBytes(this.request.profileHandle()));
-final Response response = CrawlQueues.this.sb.loader.load(this.request, e == null ? CacheStrategy.IFEXIST : e.cacheStrategy(), BlacklistType.CRAWLER, queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
+final Response response = CrawlQueues.this.sb.loader.load(this.request, e == null ? CacheStrategy.IFEXIST : e.cacheStrategy(), BlacklistType.CRAWLER, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
 if (response == null) {
 this.request.setStatus("error", WorkflowJob.STATUS_FINISHED);
 if (CrawlQueues.this.log.isFine()) {

View File

@@ -45,7 +45,6 @@ import net.yacy.cora.storage.ComparableARC;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.cora.util.SpaceExceededException;
 import net.yacy.crawler.HarvestProcess;
-import net.yacy.crawler.data.CrawlQueues;
 import net.yacy.data.WorkTables;
 import net.yacy.kelondro.blob.Tables;
 import net.yacy.kelondro.data.meta.DigestURI;
@@ -71,7 +70,7 @@ public class RSSLoader extends Thread {
 public void run() {
 RSSReader rss = null;
 try {
-final Response response = this.sb.loader.load(this.sb.loader.request(this.urlf, true, false), CacheStrategy.NOCACHE, Integer.MAX_VALUE, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
+final Response response = this.sb.loader.load(this.sb.loader.request(this.urlf, true, false), CacheStrategy.NOCACHE, Integer.MAX_VALUE, BlacklistType.CRAWLER, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
 final byte[] resource = response == null ? null : response.getContent();
 rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource);
 } catch (final MalformedURLException e) {

View File

@@ -53,7 +53,6 @@ import net.yacy.cora.protocol.ResponseHeader;
 import net.yacy.cora.protocol.http.HTTPClient;
 import net.yacy.cora.storage.Files;
 import net.yacy.cora.util.ConcurrentLog;
-import net.yacy.crawler.data.CrawlQueues;
 import net.yacy.document.Document;
 import net.yacy.document.parser.tarParser;
 import net.yacy.kelondro.data.meta.DigestURI;
@@ -239,7 +238,7 @@ public final class yacyRelease extends yacyVersion {
 try {
 final DigestURI uri = location.getLocationURL();
 Thread.currentThread().setName("allReleaseFrom - host " + uri.getHost()); // makes it more easy to see which release blocks process in thread dump
-scraper = Switchboard.getSwitchboard().loader.loadDocument(uri, CacheStrategy.NOCACHE, null, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
+scraper = Switchboard.getSwitchboard().loader.loadDocument(uri, CacheStrategy.NOCACHE, null, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
 } catch (final IOException e) {
 return null;
 }

View File

@@ -2879,7 +2879,7 @@ public final class Switchboard extends serverSwitch {
 // get a scraper to get the title
 Document scraper;
 try {
-scraper = this.loader.loadDocument(url, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
+scraper = this.loader.loadDocument(url, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
 } catch (IOException e) {
 return "scraper cannot load URL: " + e.getMessage();
 }
@@ -2986,7 +2986,7 @@ public final class Switchboard extends serverSwitch {
 String urlName = url.toNormalform(true);
 Thread.currentThread().setName("Switchboard.addToIndex:" + urlName);
 try {
-final Response response = Switchboard.this.loader.load(request, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
+final Response response = Switchboard.this.loader.load(request, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
 if (response == null) {
 throw new IOException("response == null");
 }

View File

@@ -55,7 +55,6 @@ import net.yacy.cora.storage.HandleSet;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.cora.util.LookAheadIterator;
 import net.yacy.cora.util.SpaceExceededException;
-import net.yacy.crawler.data.CrawlQueues;
 import net.yacy.crawler.retrieval.Response;
 import net.yacy.document.Condenser;
 import net.yacy.document.Document;
@@ -812,7 +811,7 @@ public class Segment {
 try {
 // parse the resource
-final Document document = Document.mergeDocuments(url, null, loader.loadDocuments(loader.request(url, true, false), cacheStrategy, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT));
+final Document document = Document.mergeDocuments(url, null, loader.loadDocuments(loader.request(url, true, false), cacheStrategy, Integer.MAX_VALUE, null, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT));
 if (document == null) {
 // delete just the url entry
 fulltext().remove(urlhash);