Mirror of https://github.com/yacy/yacy_search_server.git
refactoring of load_delay: this is a matter of client identification
This commit is contained in:
parent 0d0b3a30f5
commit bcc623a843
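
The change itself is mechanical: the constant CrawlQueues.queuedMinLoadDelay (500 ms) is deleted, an equivalent ClientIdentification.MIN_LOAD_DELAY is introduced behind a minLoadDelay() accessor, and every loader call site swaps the former for the latter. A minimal sketch of the new shape, assembled from the diff below (making the delay vary per identified client is only the apparent direction of this refactoring, not something the commit implements):

    // excerpt of net.yacy.cora.protocol.ClientIdentification after this commit
    public class ClientIdentification {

        // moved here from CrawlQueues.queuedMinLoadDelay: the minimum pause
        // in milliseconds between two loader accesses
        public static final long MIN_LOAD_DELAY = 500;

        // call sites go through this accessor instead of the raw constant,
        // so the delay could later depend on the identified client without
        // touching the call sites again
        public static long minLoadDelay() {
            return MIN_LOAD_DELAY;
        }
    }

    // call sites change from
    //     sb.loader.load(request, cacheStrategy, blacklistType, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT)
    // to
    //     sb.loader.load(request, cacheStrategy, blacklistType, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT)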
CrawlCheck_p.java
@@ -28,7 +28,6 @@ import net.yacy.cora.federate.yacy.CacheStrategy;
 import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.util.ConcurrentLog;
-import net.yacy.crawler.data.CrawlQueues;
 import net.yacy.crawler.retrieval.Request;
 import net.yacy.crawler.retrieval.Response;
 import net.yacy.crawler.robots.RobotsTxtEntry;
@@ -88,19 +87,19 @@ public class CrawlCheck_p {
             robotsEntry = sb.robots.getEntry(u, sb.peers.myBotIDs());
             if (robotsEntry == null) {
                 prop.put("table_list_" + row + "_robots", "no robots");
-                prop.put("table_list_" + row + "_crawldelay", CrawlQueues.queuedMinLoadDelay + " ms");
+                prop.put("table_list_" + row + "_crawldelay", ClientIdentification.minLoadDelay() + " ms");
                 prop.put("table_list_" + row + "_sitemap", "");
             } else {
                 robotsAllowed = !robotsEntry.isDisallowed(u);
                 prop.put("table_list_" + row + "_robots", "robots exist: " + (robotsAllowed ? "crawl allowed" : "url disallowed"));
-                prop.put("table_list_" + row + "_crawldelay", Math.max(CrawlQueues.queuedMinLoadDelay, robotsEntry.getCrawlDelayMillis()) + " ms");
+                prop.put("table_list_" + row + "_crawldelay", Math.max(ClientIdentification.minLoadDelay(), robotsEntry.getCrawlDelayMillis()) + " ms");
                 prop.put("table_list_" + row + "_sitemap", robotsEntry.getSitemap() == null ? "-" : robotsEntry.getSitemap().toNormalform(true));
             }

             // try to load the url
             if (robotsAllowed) try {
                 Request request = sb.loader.request(u, true, false);
-                final Response response = sb.loader.load(request, CacheStrategy.NOCACHE, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
+                final Response response = sb.loader.load(request, CacheStrategy.NOCACHE, BlacklistType.CRAWLER, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
                 if (response == null) {
                     prop.put("table_list_" + row + "_access", "no response");
                 } else {

Crawler_p.java
@@ -43,7 +43,6 @@ import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.cora.util.SpaceExceededException;
 import net.yacy.crawler.CrawlSwitchboard;
 import net.yacy.crawler.data.CrawlProfile;
-import net.yacy.crawler.data.CrawlQueues;
 import net.yacy.crawler.data.ZURL.FailCategory;
 import net.yacy.crawler.retrieval.Request;
 import net.yacy.crawler.retrieval.SitemapImporter;
@@ -288,7 +287,7 @@ public class Crawler_p {
                 // download document
                 Document scraper;
                 try {
-                    scraper = sb.loader.loadDocument(sitelistURL, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
+                    scraper = sb.loader.loadDocument(sitelistURL, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
                     // get links and generate filter
                     for (DigestURI u: scraper.getAnchors().keySet()) {
                         newRootURLs.add(u);

DictionaryLoader_p.java
@@ -27,7 +27,6 @@ import net.yacy.cora.geo.OpenGeoDBLocation;
 import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.util.ConcurrentLog;
-import net.yacy.crawler.data.CrawlQueues;
 import net.yacy.crawler.retrieval.Response;
 import net.yacy.document.LibraryProvider;
 import net.yacy.kelondro.data.meta.DigestURI;
@@ -67,7 +66,7 @@ public class DictionaryLoader_p {
         if (post.containsKey("geon0Load")) {
             // load from the net
             try {
-                final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
+                final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
                 final byte[] b = response.getContent();
                 FileUtils.copy(b, LibraryProvider.Dictionary.GEON0.file());
                 LibraryProvider.geoLoc.activateLocation(LibraryProvider.Dictionary.GEON0.nickname, new GeonamesLocation(LibraryProvider.Dictionary.GEON0.file(), null, -1));
@@ -109,7 +108,7 @@ public class DictionaryLoader_p {
         if (post.containsKey("geon1Load")) {
             // load from the net
             try {
-                final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON1.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
+                final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON1.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
                 final byte[] b = response.getContent();
                 FileUtils.copy(b, LibraryProvider.Dictionary.GEON1.file());
                 LibraryProvider.geoLoc.activateLocation(LibraryProvider.Dictionary.GEON1.nickname, new GeonamesLocation(LibraryProvider.Dictionary.GEON1.file(), null, -1));
@@ -151,7 +150,7 @@ public class DictionaryLoader_p {
         if (post.containsKey("geon2Load")) {
             // load from the net
             try {
-                final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON2.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
+                final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON2.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
                 final byte[] b = response.getContent();
                 FileUtils.copy(b, LibraryProvider.Dictionary.GEON2.file());
                 LibraryProvider.geoLoc.activateLocation(LibraryProvider.Dictionary.GEON2.nickname, new GeonamesLocation(LibraryProvider.Dictionary.GEON2.file(), null, 100000));
@@ -193,7 +192,7 @@ public class DictionaryLoader_p {
         if (post.containsKey("geo1Load")) {
             // load from the net
             try {
-                final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEODB1.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
+                final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEODB1.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
                 final byte[] b = response.getContent();
                 FileUtils.copy(b, LibraryProvider.Dictionary.GEODB1.file());
                 LibraryProvider.geoLoc.deactivateLocalization(LibraryProvider.Dictionary.GEODB1.nickname);
@@ -236,7 +235,7 @@ public class DictionaryLoader_p {
         if (post.containsKey("drw0Load")) {
             // load from the net
             try {
-                final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.DRW0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
+                final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.DRW0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
                 final byte[] b = response.getContent();
                 FileUtils.copy(b, LibraryProvider.Dictionary.DRW0.file());
                 LibraryProvider.activateDeReWo();
@@ -280,7 +279,7 @@ public class DictionaryLoader_p {
         if (post.containsKey("pnd0Load")) {
             // load from the net
             try {
-                final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.PND0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
+                final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.PND0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
                 final byte[] b = response.getContent();
                 FileUtils.copy(b, LibraryProvider.Dictionary.PND0.file());
                 LibraryProvider.activatePND();

Load_RSS_p.java
@@ -42,7 +42,6 @@ import net.yacy.cora.util.CommonPattern;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.cora.util.SpaceExceededException;
 import net.yacy.crawler.HarvestProcess;
-import net.yacy.crawler.data.CrawlQueues;
 import net.yacy.crawler.retrieval.RSSLoader;
 import net.yacy.crawler.retrieval.Response;
 import net.yacy.data.WorkTables;
@@ -267,7 +266,7 @@ public class Load_RSS_p {
         RSSReader rss = null;
         if (url != null) try {
             prop.put("url", url.toNormalform(true));
-            final Response response = sb.loader.load(sb.loader.request(url, true, false), CacheStrategy.NOCACHE, Integer.MAX_VALUE, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
+            final Response response = sb.loader.load(sb.loader.request(url, true, false), CacheStrategy.NOCACHE, Integer.MAX_VALUE, BlacklistType.CRAWLER, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
             final byte[] resource = response == null ? null : response.getContent();
             rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource);
         } catch (final IOException e) {

ViewFile.java
@@ -45,7 +45,6 @@ import net.yacy.cora.lod.vocabulary.YaCyMetadata;
 import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.crawler.data.Cache;
-import net.yacy.crawler.data.CrawlQueues;
 import net.yacy.crawler.retrieval.Response;
 import net.yacy.document.Condenser;
 import net.yacy.document.Document;
@@ -169,7 +168,7 @@ public class ViewFile {

         Response response = null;
         try {
-            response = sb.loader.load(sb.loader.request(url, true, false), authorized ? CacheStrategy.IFEXIST : CacheStrategy.CACHEONLY, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
+            response = sb.loader.load(sb.loader.request(url, true, false), authorized ? CacheStrategy.IFEXIST : CacheStrategy.CACHEONLY, Integer.MAX_VALUE, null, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
         } catch (final IOException e) {
             prop.put("error", "4");
             prop.put("error_errorText", "error loading resource: " + e.getMessage());

ViewImage.java
@@ -39,7 +39,6 @@ import net.yacy.cora.protocol.HeaderFramework;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.storage.ConcurrentARC;
 import net.yacy.cora.util.ConcurrentLog;
-import net.yacy.crawler.data.CrawlQueues;
 import net.yacy.data.URLLicense;
 import net.yacy.document.ImageParser;
 import net.yacy.kelondro.data.meta.DigestURI;
@@ -105,7 +104,7 @@ public class ViewImage {
         if (image == null) {
             byte[] resourceb = null;
             if (url != null) try {
-                resourceb = sb.loader.loadContent(sb.loader.request(url, false, true), CacheStrategy.IFEXIST, BlacklistType.SEARCH, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
+                resourceb = sb.loader.loadContent(sb.loader.request(url, false, true), CacheStrategy.IFEXIST, BlacklistType.SEARCH, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
             } catch (final IOException e) {
                 ConcurrentLog.fine("ViewImage", "cannot load: " + e.getMessage());
             }

getpageinfo.java
@@ -37,7 +37,6 @@ import net.yacy.cora.federate.yacy.CacheStrategy;
 import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.util.ConcurrentLog;
-import net.yacy.crawler.data.CrawlQueues;
 import net.yacy.crawler.robots.RobotsTxtEntry;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.repository.Blacklist.BlacklistType;
@@ -97,7 +96,7 @@ public class getpageinfo {
         }
         net.yacy.document.Document scraper = null;
         if (u != null) try {
-            scraper = sb.loader.loadDocument(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
+            scraper = sb.loader.loadDocument(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
         } catch (final IOException e) {
             ConcurrentLog.logException(e);
             // bad things are possible, i.e. that the Server responds with "403 Bad Behavior"

getpageinfo_p.java
@@ -37,7 +37,6 @@ import net.yacy.cora.federate.yacy.CacheStrategy;
 import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.util.ConcurrentLog;
-import net.yacy.crawler.data.CrawlQueues;
 import net.yacy.crawler.robots.RobotsTxtEntry;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.repository.Blacklist.BlacklistType;
@@ -97,7 +96,7 @@ public class getpageinfo_p {
         }
         net.yacy.document.Document scraper = null;
         if (u != null) try {
-            scraper = sb.loader.loadDocument(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
+            scraper = sb.loader.loadDocument(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
         } catch (final IOException e) {
             ConcurrentLog.logException(e);
             // bad things are possible, i.e. that the Server responds with "403 Bad Behavior"

webstructure.java
@@ -35,7 +35,6 @@ import net.yacy.cora.order.Base64Order;
 import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.util.ConcurrentLog;
-import net.yacy.crawler.data.CrawlQueues;
 import net.yacy.kelondro.data.citation.CitationReference;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.rwi.IndexCell;
@@ -98,7 +97,7 @@ public class webstructure {
         prop.put("references", 1);
         net.yacy.document.Document scraper = null;
         if (url != null) try {
-            scraper = sb.loader.loadDocument(url, CacheStrategy.IFEXIST, null, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
+            scraper = sb.loader.loadDocument(url, CacheStrategy.IFEXIST, null, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
         } catch (final IOException e) {
             ConcurrentLog.logException(e);
         }
ClientIdentification.java
@@ -27,6 +27,7 @@ package net.yacy.cora.protocol;

 public class ClientIdentification {

+    public static final long MIN_LOAD_DELAY = 500;
     public static final int DEFAULT_TIMEOUT = 10000;
     public static final int minimumLocalDeltaInit = 10; // the minimum time difference between access of the same local domain
     public static final int minimumGlobalDeltaInit = 500; // the minimum time difference between access of the same global domain
@@ -118,4 +119,8 @@ public class ClientIdentification {

         return location;
     }
+
+    public static long minLoadDelay() {
+        return MIN_LOAD_DELAY;
+    }
 }
CrawlQueues.java
@@ -63,7 +63,6 @@ import net.yacy.search.SwitchboardConstants;

 public class CrawlQueues {

-    public static final long queuedMinLoadDelay = 500;
     private static final String ERROR_DB_FILENAME = "urlError4.db";
     private static final String DELEGATED_DB_FILENAME = "urlDelegated4.db";

@@ -654,7 +653,7 @@ public class CrawlQueues {
             try {
                 this.request.setStatus("loading", WorkflowJob.STATUS_RUNNING);
                 final CrawlProfile e = CrawlQueues.this.sb.crawler.getActive(UTF8.getBytes(this.request.profileHandle()));
-                final Response response = CrawlQueues.this.sb.loader.load(this.request, e == null ? CacheStrategy.IFEXIST : e.cacheStrategy(), BlacklistType.CRAWLER, queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
+                final Response response = CrawlQueues.this.sb.loader.load(this.request, e == null ? CacheStrategy.IFEXIST : e.cacheStrategy(), BlacklistType.CRAWLER, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
                 if (response == null) {
                     this.request.setStatus("error", WorkflowJob.STATUS_FINISHED);
                     if (CrawlQueues.this.log.isFine()) {

RSSLoader.java
@@ -45,7 +45,6 @@ import net.yacy.cora.storage.ComparableARC;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.cora.util.SpaceExceededException;
 import net.yacy.crawler.HarvestProcess;
-import net.yacy.crawler.data.CrawlQueues;
 import net.yacy.data.WorkTables;
 import net.yacy.kelondro.blob.Tables;
 import net.yacy.kelondro.data.meta.DigestURI;
@@ -71,7 +70,7 @@ public class RSSLoader extends Thread {
     public void run() {
         RSSReader rss = null;
         try {
-            final Response response = this.sb.loader.load(this.sb.loader.request(this.urlf, true, false), CacheStrategy.NOCACHE, Integer.MAX_VALUE, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
+            final Response response = this.sb.loader.load(this.sb.loader.request(this.urlf, true, false), CacheStrategy.NOCACHE, Integer.MAX_VALUE, BlacklistType.CRAWLER, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
             final byte[] resource = response == null ? null : response.getContent();
             rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource);
         } catch (final MalformedURLException e) {

yacyRelease.java
@@ -53,7 +53,6 @@ import net.yacy.cora.protocol.ResponseHeader;
 import net.yacy.cora.protocol.http.HTTPClient;
 import net.yacy.cora.storage.Files;
 import net.yacy.cora.util.ConcurrentLog;
-import net.yacy.crawler.data.CrawlQueues;
 import net.yacy.document.Document;
 import net.yacy.document.parser.tarParser;
 import net.yacy.kelondro.data.meta.DigestURI;
@@ -239,7 +238,7 @@ public final class yacyRelease extends yacyVersion {
         try {
             final DigestURI uri = location.getLocationURL();
             Thread.currentThread().setName("allReleaseFrom - host " + uri.getHost()); // makes it more easy to see which release blocks process in thread dump
-            scraper = Switchboard.getSwitchboard().loader.loadDocument(uri, CacheStrategy.NOCACHE, null, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
+            scraper = Switchboard.getSwitchboard().loader.loadDocument(uri, CacheStrategy.NOCACHE, null, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
         } catch (final IOException e) {
             return null;
         }

Switchboard.java
@@ -2879,7 +2879,7 @@ public final class Switchboard extends serverSwitch {
         // get a scraper to get the title
         Document scraper;
         try {
-            scraper = this.loader.loadDocument(url, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
+            scraper = this.loader.loadDocument(url, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
         } catch (IOException e) {
             return "scraper cannot load URL: " + e.getMessage();
         }
@@ -2986,7 +2986,7 @@ public final class Switchboard extends serverSwitch {
         String urlName = url.toNormalform(true);
         Thread.currentThread().setName("Switchboard.addToIndex:" + urlName);
         try {
-            final Response response = Switchboard.this.loader.load(request, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
+            final Response response = Switchboard.this.loader.load(request, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT);
             if (response == null) {
                 throw new IOException("response == null");
             }

Segment.java
@@ -55,7 +55,6 @@ import net.yacy.cora.storage.HandleSet;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.cora.util.LookAheadIterator;
 import net.yacy.cora.util.SpaceExceededException;
-import net.yacy.crawler.data.CrawlQueues;
 import net.yacy.crawler.retrieval.Response;
 import net.yacy.document.Condenser;
 import net.yacy.document.Document;
@@ -812,7 +811,7 @@ public class Segment {

         try {
             // parse the resource
-            final Document document = Document.mergeDocuments(url, null, loader.loadDocuments(loader.request(url, true, false), cacheStrategy, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT));
+            final Document document = Document.mergeDocuments(url, null, loader.loadDocuments(loader.request(url, true, false), cacheStrategy, Integer.MAX_VALUE, null, ClientIdentification.minLoadDelay(), ClientIdentification.DEFAULT_TIMEOUT));
             if (document == null) {
                 // delete just the url entry
                 fulltext().remove(urlhash);