diff --git a/htroot/Bookmarks.java b/htroot/Bookmarks.java index b1b962571..94cefaf3b 100644 --- a/htroot/Bookmarks.java +++ b/htroot/Bookmarks.java @@ -196,7 +196,7 @@ public class Bookmarks { // try to get the bookmark from the LURL database final URIMetadataRow urlentry = sb.index.urlMetadata().load(ASCII.getBytes(urlHash)); if (urlentry != null) try { - final Document document = Document.mergeDocuments(urlentry.url(), null, sb.loader.loadDocuments(sb.loader.request(urlentry.url(), true, false), CacheStrategy.IFEXIST, 5000, Integer.MAX_VALUE)); + final Document document = Document.mergeDocuments(urlentry.url(), null, sb.loader.loadDocuments(sb.loader.request(urlentry.url(), true, false), CacheStrategy.IFEXIST, 5000, Integer.MAX_VALUE, null)); prop.put("mode_edit", "0"); // create mode prop.put("mode_url", urlentry.url().toNormalform(false, true)); prop.putHTML("mode_title", urlentry.dc_title()); diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index 6b43fd250..5fb3c7013 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -50,6 +50,7 @@ import net.yacy.kelondro.index.RowSpaceExceededException; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.FileUtils; import net.yacy.peers.NewsPool; +import net.yacy.repository.Blacklist.BlacklistType; import net.yacy.search.Switchboard; import net.yacy.search.SwitchboardConstants; import net.yacy.search.index.Segment; @@ -322,7 +323,7 @@ public class Crawler_p { sb.crawlQueues.errorURL.remove(urlhash); // get a scraper to get the title - final Document scraper = sb.loader.loadDocument(url, CacheStrategy.IFFRESH); + final Document scraper = sb.loader.loadDocument(url, CacheStrategy.IFFRESH, BlacklistType.CRAWLER); final String title = scraper == null ? url.toNormalform(true, true) : scraper.dc_title(); final String description = scraper.dc_description(); @@ -544,7 +545,7 @@ public class Crawler_p { try { final DigestURI sitelistURL = new DigestURI(crawlingStart); // download document - Document scraper = sb.loader.loadDocument(sitelistURL, CacheStrategy.IFFRESH); + Document scraper = sb.loader.loadDocument(sitelistURL, CacheStrategy.IFFRESH, BlacklistType.CRAWLER); // String title = scraper.getTitle(); // String description = scraper.getDescription(); @@ -647,11 +648,11 @@ public class Crawler_p { private static long recrawlIfOlderC(final boolean recrawlIfOlderCheck, final int recrawlIfOlderNumber, final String crawlingIfOlderUnit) { if (!recrawlIfOlderCheck) return 0L; - if ("year".equals(crawlingIfOlderUnit)) return System.currentTimeMillis() - (long) recrawlIfOlderNumber * 1000L * 60L * 60L * 24L * 365L; - if ("month".equals(crawlingIfOlderUnit)) return System.currentTimeMillis() - (long) recrawlIfOlderNumber * 1000L * 60L * 60L * 24L * 30L; - if ("day".equals(crawlingIfOlderUnit)) return System.currentTimeMillis() - (long) recrawlIfOlderNumber * 1000L * 60L * 60L * 24L; - if ("hour".equals(crawlingIfOlderUnit)) return System.currentTimeMillis() - (long) recrawlIfOlderNumber * 1000L * 60L * 60L; - return System.currentTimeMillis() - (long) recrawlIfOlderNumber; + if ("year".equals(crawlingIfOlderUnit)) return System.currentTimeMillis() - recrawlIfOlderNumber * 1000L * 60L * 60L * 24L * 365L; + if ("month".equals(crawlingIfOlderUnit)) return System.currentTimeMillis() - recrawlIfOlderNumber * 1000L * 60L * 60L * 24L * 30L; + if ("day".equals(crawlingIfOlderUnit)) return System.currentTimeMillis() - recrawlIfOlderNumber * 1000L * 60L * 60L * 24L; + if ("hour".equals(crawlingIfOlderUnit)) return System.currentTimeMillis() - recrawlIfOlderNumber * 1000L * 60L * 60L; + return System.currentTimeMillis() - recrawlIfOlderNumber; } private static void setPerformance(final Switchboard sb, final serverObjects post) { diff --git a/htroot/DictionaryLoader_p.java b/htroot/DictionaryLoader_p.java index 70ba014ee..273a779cc 100644 --- a/htroot/DictionaryLoader_p.java +++ b/htroot/DictionaryLoader_p.java @@ -65,7 +65,7 @@ public class DictionaryLoader_p { if (post.containsKey("geon0Load")) { // load from the net try { - final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, false); + final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null); final byte[] b = response.getContent(); FileUtils.copy(b, LibraryProvider.Dictionary.GEON0.file()); LibraryProvider.geoLoc.activateLocation(LibraryProvider.Dictionary.GEON0.nickname, new GeonamesLocation(LibraryProvider.Dictionary.GEON0.file(), null, -1)); @@ -107,7 +107,7 @@ public class DictionaryLoader_p { if (post.containsKey("geon1Load")) { // load from the net try { - final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON1.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, false); + final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON1.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null); final byte[] b = response.getContent(); FileUtils.copy(b, LibraryProvider.Dictionary.GEON1.file()); LibraryProvider.geoLoc.activateLocation(LibraryProvider.Dictionary.GEON1.nickname, new GeonamesLocation(LibraryProvider.Dictionary.GEON1.file(), null, -1)); @@ -149,7 +149,7 @@ public class DictionaryLoader_p { if (post.containsKey("geon2Load")) { // load from the net try { - final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON2.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, false); + final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON2.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null); final byte[] b = response.getContent(); FileUtils.copy(b, LibraryProvider.Dictionary.GEON2.file()); LibraryProvider.geoLoc.activateLocation(LibraryProvider.Dictionary.GEON2.nickname, new GeonamesLocation(LibraryProvider.Dictionary.GEON2.file(), null, 100000)); @@ -191,7 +191,7 @@ public class DictionaryLoader_p { if (post.containsKey("geo1Load")) { // load from the net try { - final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEODB1.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, false); + final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEODB1.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null); final byte[] b = response.getContent(); FileUtils.copy(b, LibraryProvider.Dictionary.GEODB1.file()); LibraryProvider.geoLoc.deactivateLocalization(LibraryProvider.Dictionary.GEODB1.nickname); @@ -234,7 +234,7 @@ public class DictionaryLoader_p { if (post.containsKey("drw0Load")) { // load from the net try { - final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.DRW0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, false); + final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.DRW0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null); final byte[] b = response.getContent(); FileUtils.copy(b, LibraryProvider.Dictionary.DRW0.file()); LibraryProvider.activateDeReWo(); @@ -278,7 +278,7 @@ public class DictionaryLoader_p { if (post.containsKey("pnd0Load")) { // load from the net try { - final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.PND0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, false); + final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.PND0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null); final byte[] b = response.getContent(); FileUtils.copy(b, LibraryProvider.Dictionary.PND0.file()); LibraryProvider.activatePND(); diff --git a/htroot/Load_RSS_p.java b/htroot/Load_RSS_p.java index 0137cbaf7..8d173cca6 100644 --- a/htroot/Load_RSS_p.java +++ b/htroot/Load_RSS_p.java @@ -41,6 +41,7 @@ import net.yacy.kelondro.blob.Tables.Row; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.index.RowSpaceExceededException; import net.yacy.kelondro.logging.Log; +import net.yacy.repository.Blacklist.BlacklistType; import net.yacy.search.Switchboard; import de.anomic.crawler.RSSLoader; import de.anomic.crawler.retrieval.Response; @@ -255,7 +256,7 @@ public class Load_RSS_p { RSSReader rss = null; if (url != null) try { prop.put("url", url.toNormalform(true, false)); - final Response response = sb.loader.load(sb.loader.request(url, true, false), CacheStrategy.NOCACHE, Integer.MAX_VALUE, true); + final Response response = sb.loader.load(sb.loader.request(url, true, false), CacheStrategy.NOCACHE, Integer.MAX_VALUE, BlacklistType.CRAWLER); final byte[] resource = response == null ? null : response.getContent(); rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource); } catch (final IOException e) { diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java index 571d01fa4..1139c217e 100644 --- a/htroot/ViewFile.java +++ b/htroot/ViewFile.java @@ -163,7 +163,7 @@ public class ViewFile { Response response = null; try { - response = sb.loader.load(sb.loader.request(url, true, false), authorized ? CacheStrategy.IFEXIST : CacheStrategy.CACHEONLY, Integer.MAX_VALUE, true); + response = sb.loader.load(sb.loader.request(url, true, false), authorized ? CacheStrategy.IFEXIST : CacheStrategy.CACHEONLY, Integer.MAX_VALUE, null); } catch (final IOException e) { prop.put("error", "4"); prop.put("error_errorText", "error loading resource: " + e.getMessage()); diff --git a/htroot/ViewImage.java b/htroot/ViewImage.java index b12a94921..8ae8dced5 100644 --- a/htroot/ViewImage.java +++ b/htroot/ViewImage.java @@ -42,6 +42,7 @@ import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.MemoryControl; +import net.yacy.repository.Blacklist.BlacklistType; import net.yacy.search.Switchboard; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -95,7 +96,7 @@ public class ViewImage { if (image == null) { byte[] resourceb = null; if (url != null) try { - resourceb = sb.loader.loadContent(sb.loader.request(url, false, true), CacheStrategy.IFEXIST); + resourceb = sb.loader.loadContent(sb.loader.request(url, false, true), CacheStrategy.IFEXIST, BlacklistType.SEARCH); } catch (final IOException e) { Log.logFine("ViewImage", "cannot load: " + e.getMessage()); } diff --git a/htroot/api/getpageinfo.java b/htroot/api/getpageinfo.java index c031cca1a..8bee79c26 100644 --- a/htroot/api/getpageinfo.java +++ b/htroot/api/getpageinfo.java @@ -37,6 +37,7 @@ import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.services.federated.yacy.CacheStrategy; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; +import net.yacy.repository.Blacklist.BlacklistType; import net.yacy.search.Switchboard; import org.w3c.dom.Document; @@ -94,7 +95,7 @@ public class getpageinfo { } net.yacy.document.Document scraper = null; if (u != null) try { - scraper = sb.loader.loadDocument(u, CacheStrategy.IFEXIST); + scraper = sb.loader.loadDocument(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER); } catch (final IOException e) { Log.logException(e); // bad things are possible, i.e. that the Server responds with "403 Bad Behavior" diff --git a/htroot/api/getpageinfo_p.java b/htroot/api/getpageinfo_p.java index a2c483543..b27fa7ee2 100644 --- a/htroot/api/getpageinfo_p.java +++ b/htroot/api/getpageinfo_p.java @@ -37,6 +37,7 @@ import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.services.federated.yacy.CacheStrategy; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; +import net.yacy.repository.Blacklist.BlacklistType; import net.yacy.search.Switchboard; import org.w3c.dom.Document; @@ -94,7 +95,7 @@ public class getpageinfo_p { } net.yacy.document.Document scraper = null; if (u != null) try { - scraper = sb.loader.loadDocument(u, CacheStrategy.IFEXIST); + scraper = sb.loader.loadDocument(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER); } catch (final IOException e) { Log.logException(e); // bad things are possible, i.e. that the Server responds with "403 Bad Behavior" diff --git a/htroot/api/webstructure.java b/htroot/api/webstructure.java index de835ea95..3ae74aad3 100644 --- a/htroot/api/webstructure.java +++ b/htroot/api/webstructure.java @@ -97,7 +97,7 @@ public class webstructure { prop.put("references", 1); net.yacy.document.Document scraper = null; if (url != null) try { - scraper = sb.loader.loadDocument(url, CacheStrategy.IFEXIST); + scraper = sb.loader.loadDocument(url, CacheStrategy.IFEXIST, null); } catch (final IOException e) { Log.logException(e); } diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index 11db85f8b..a803b4d80 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -69,6 +69,7 @@ import net.yacy.kelondro.util.SetTools; import net.yacy.peers.EventChannel; import net.yacy.peers.NewsPool; import net.yacy.peers.graphics.ProfilingGraph; +import net.yacy.repository.Blacklist.BlacklistType; import net.yacy.search.EventTracker; import net.yacy.search.Switchboard; import net.yacy.search.SwitchboardConstants; @@ -667,7 +668,7 @@ public class yacysearch { sb.loader.request(urlentry.url(), true, false), CacheStrategy.IFEXIST, 5000, - Integer.MAX_VALUE); + Integer.MAX_VALUE, BlacklistType.SEARCH); } catch ( final IOException e ) { } catch ( final Parser.Failure e ) { } diff --git a/htroot/yacysearchitem.java b/htroot/yacysearchitem.java index 3a69f2007..eacd5919c 100644 --- a/htroot/yacysearchitem.java +++ b/htroot/yacysearchitem.java @@ -183,7 +183,7 @@ public class yacysearchitem { // END interaction prop.putHTML("content_target", target); - if (faviconURL != null && fileType == FileType.HTML) sb.loader.loadIfNotExistBackground(faviconURL, 1024 * 1024 * 10); + if (faviconURL != null && fileType == FileType.HTML) sb.loader.loadIfNotExistBackground(faviconURL, 1024 * 1024 * 10, null); prop.putHTML("content_faviconCode", sb.licensedURLs.aquireLicense(faviconURL)); // acquire license for favicon url loading prop.put("content_urlhash", resulthashString); prop.put("content_ranking", result.ranking); @@ -266,7 +266,7 @@ public class yacysearchitem { final String target = sb.getConfig(resultUrlstring.matches(target_special_pattern) ? SwitchboardConstants.SEARCH_TARGET_SPECIAL : SwitchboardConstants.SEARCH_TARGET_DEFAULT, "_self"); final String license = sb.licensedURLs.aquireLicense(ms.url()); - sb.loader.loadIfNotExistBackground(ms.url(), 1024 * 1024 * 10); + sb.loader.loadIfNotExistBackground(ms.url(), 1024 * 1024 * 10, null); prop.putHTML("content_item_hrefCache", (auth) ? "/ViewImage.png?url=" + resultUrlstring : resultUrlstring); prop.putHTML("content_item_href", resultUrlstring); prop.putHTML("content_item_target", target); diff --git a/source/de/anomic/crawler/CrawlQueues.java b/source/de/anomic/crawler/CrawlQueues.java index c4a1d94c7..8ae5ad4b0 100644 --- a/source/de/anomic/crawler/CrawlQueues.java +++ b/source/de/anomic/crawler/CrawlQueues.java @@ -49,6 +49,7 @@ import net.yacy.kelondro.workflow.WorkflowJob; import net.yacy.peers.Protocol; import net.yacy.peers.Seed; import net.yacy.peers.dht.PeerSelection; +import net.yacy.repository.Blacklist.BlacklistType; import net.yacy.search.Switchboard; import net.yacy.search.Switchboard.indexingQueueEntry; import net.yacy.search.SwitchboardConstants; @@ -655,7 +656,7 @@ public class CrawlQueues { try { this.request.setStatus("loading", WorkflowJob.STATUS_RUNNING); final CrawlProfile e = CrawlQueues.this.sb.crawler.getActive(UTF8.getBytes(this.request.profileHandle())); - final Response response = CrawlQueues.this.sb.loader.load(this.request, e == null ? CacheStrategy.IFEXIST : e.cacheStrategy(), true); + final Response response = CrawlQueues.this.sb.loader.load(this.request, e == null ? CacheStrategy.IFEXIST : e.cacheStrategy(), BlacklistType.CRAWLER); if (response == null) { this.request.setStatus("error", WorkflowJob.STATUS_FINISHED); if (CrawlQueues.this.log.isFine()) { diff --git a/source/de/anomic/crawler/RSSLoader.java b/source/de/anomic/crawler/RSSLoader.java index 7e03f948a..fa1ff7e1a 100644 --- a/source/de/anomic/crawler/RSSLoader.java +++ b/source/de/anomic/crawler/RSSLoader.java @@ -41,6 +41,7 @@ import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.index.RowSpaceExceededException; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.order.Base64Order; +import net.yacy.repository.Blacklist.BlacklistType; import net.yacy.search.Switchboard; import de.anomic.crawler.retrieval.Response; import de.anomic.data.WorkTables; @@ -62,7 +63,7 @@ public class RSSLoader extends Thread { public void run() { RSSReader rss = null; try { - final Response response = this.sb.loader.load(this.sb.loader.request(this.urlf, true, false), CacheStrategy.NOCACHE, Integer.MAX_VALUE, true); + final Response response = this.sb.loader.load(this.sb.loader.request(this.urlf, true, false), CacheStrategy.NOCACHE, Integer.MAX_VALUE, BlacklistType.CRAWLER); final byte[] resource = response == null ? null : response.getContent(); rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource); } catch (final MalformedURLException e) { diff --git a/source/de/anomic/crawler/retrieval/HTTPLoader.java b/source/de/anomic/crawler/retrieval/HTTPLoader.java index 9668ddef4..9c29426b4 100644 --- a/source/de/anomic/crawler/retrieval/HTTPLoader.java +++ b/source/de/anomic/crawler/retrieval/HTTPLoader.java @@ -69,14 +69,14 @@ public final class HTTPLoader { this.socketTimeout = (int) sb.getConfigLong("crawler.clientTimeout", 30000); } - public Response load(final Request entry, final int maxFileSize, final boolean checkBlacklist) throws IOException { + public Response load(final Request entry, final int maxFileSize, final BlacklistType blacklistType) throws IOException { final long start = System.currentTimeMillis(); - final Response doc = load(entry, DEFAULT_CRAWLING_RETRY_COUNT, maxFileSize, checkBlacklist); + final Response doc = load(entry, DEFAULT_CRAWLING_RETRY_COUNT, maxFileSize, blacklistType); Latency.update(entry.url(), System.currentTimeMillis() - start); return doc; } - private Response load(final Request request, final int retryCount, final int maxFileSize, final boolean checkBlacklist) throws IOException { + private Response load(final Request request, final int retryCount, final int maxFileSize, final BlacklistType blacklistType) throws IOException { byte[] myHash = this.sb.peers.mySeed().hash.getBytes(); @@ -96,7 +96,7 @@ public final class HTTPLoader { // check if url is in blacklist final String hostlow = host.toLowerCase(); - if (checkBlacklist && Switchboard.urlBlacklist.isListed(BlacklistType.CRAWLER, hostlow, path)) { + if (blacklistType != null && Switchboard.urlBlacklist.isListed(blacklistType, hostlow, path)) { this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1); throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist."); } @@ -175,7 +175,7 @@ public final class HTTPLoader { // retry crawling with new url request.redirectURL(redirectionUrl); - return load(request, retryCount - 1, maxFileSize, checkBlacklist); + return load(request, retryCount - 1, maxFileSize, blacklistType); } else { // we don't want to follow redirects this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode); diff --git a/source/de/anomic/data/ymark/YMarkAutoTagger.java b/source/de/anomic/data/ymark/YMarkAutoTagger.java index b4f6ec51a..82f5deaa5 100644 --- a/source/de/anomic/data/ymark/YMarkAutoTagger.java +++ b/source/de/anomic/data/ymark/YMarkAutoTagger.java @@ -69,7 +69,7 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle return null; } try { - response = loader.load(loader.request(uri, true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, true); + response = loader.load(loader.request(uri, true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null); } catch (final IOException e) { Log.logWarning(YMarkTables.BOOKMARKS_LOG, "loadDocument failed due to IOException for url: "+url); return null; diff --git a/source/de/anomic/data/ymark/YMarkMetadata.java b/source/de/anomic/data/ymark/YMarkMetadata.java index 9688d5baf..f4236c7ac 100644 --- a/source/de/anomic/data/ymark/YMarkMetadata.java +++ b/source/de/anomic/data/ymark/YMarkMetadata.java @@ -97,7 +97,7 @@ public class YMarkMetadata { public Document loadDocument(final LoaderDispatcher loader) throws IOException, Failure { if(this.document == null) { Response response = null; - response = loader.load(loader.request(this.uri, true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, true); + response = loader.load(loader.request(this.uri, true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null); this.document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse()); } return this.document; diff --git a/source/net/yacy/document/importer/OAIListFriendsLoader.java b/source/net/yacy/document/importer/OAIListFriendsLoader.java index 9331a82b1..6d9f80a9e 100644 --- a/source/net/yacy/document/importer/OAIListFriendsLoader.java +++ b/source/net/yacy/document/importer/OAIListFriendsLoader.java @@ -62,7 +62,7 @@ public class OAIListFriendsLoader implements Serializable { listFriends.putAll(moreFriends); if (loader != null) for (final Map.Entry oaiFriend: listFriends.entrySet()) { try { - loader.loadIfNotExistBackground(new DigestURI(oaiFriend.getKey()), oaiFriend.getValue(), Integer.MAX_VALUE); + loader.loadIfNotExistBackground(new DigestURI(oaiFriend.getKey()), oaiFriend.getValue(), Integer.MAX_VALUE, null); } catch (final MalformedURLException e) { } } @@ -87,7 +87,7 @@ public class OAIListFriendsLoader implements Serializable { Map m; for (final Map.Entry oaiFriend: listFriends.entrySet()) try { if (!oaiFriend.getValue().exists()) { - final Response response = loader == null ? null : loader.load(loader.request(new DigestURI(oaiFriend.getKey()), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, true); + final Response response = loader == null ? null : loader.load(loader.request(new DigestURI(oaiFriend.getKey()), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null); if (response != null) FileUtils.copy(response.getContent(), oaiFriend.getValue()); } @@ -116,7 +116,7 @@ public class OAIListFriendsLoader implements Serializable { } return parser; } - + // get a resumption token using a SAX xml parser from am input stream public static class Parser extends DefaultHandler { @@ -162,11 +162,12 @@ public class OAIListFriendsLoader implements Serializable { http://oai.repec.openlib.org/ */ - + public int getCounter() { return this.recordCounter; } + @Override public void startElement(final String uri, final String name, final String tag, final Attributes atts) throws SAXException { if ("baseURL".equals(tag)) { this.recordCounter++; @@ -175,6 +176,7 @@ public class OAIListFriendsLoader implements Serializable { } } + @Override public void endElement(final String uri, final String name, final String tag) { if (tag == null) return; if ("baseURL".equals(tag)) { @@ -184,6 +186,7 @@ public class OAIListFriendsLoader implements Serializable { } } + @Override public void characters(final char ch[], final int start, final int length) { if (this.parsingValue) { this.buffer.append(ch, start, length); diff --git a/source/net/yacy/document/importer/OAIPMHLoader.java b/source/net/yacy/document/importer/OAIPMHLoader.java index af7f9db86..0cdbc9679 100644 --- a/source/net/yacy/document/importer/OAIPMHLoader.java +++ b/source/net/yacy/document/importer/OAIPMHLoader.java @@ -54,7 +54,7 @@ public class OAIPMHLoader { for (int i = 0; i < 5; i++) { // make some retries if first attempt fails try { - response = loader.load(loader.request(source, false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, true); + response = loader.load(loader.request(source, false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null); break; } catch (IOException e) { Log.logWarning("OAIPMHLoader", "loading failed at attempt " + (i + 1) + ": " + source.toNormalform(true, false)); diff --git a/source/net/yacy/peers/graphics/OSMTile.java b/source/net/yacy/peers/graphics/OSMTile.java index 484b39240..0463da8d5 100644 --- a/source/net/yacy/peers/graphics/OSMTile.java +++ b/source/net/yacy/peers/graphics/OSMTile.java @@ -83,6 +83,7 @@ public class OSMTile { public Place(final RasterPlotter m, final int xt, final int yt, final int xc, final int yc, final int z) { this.m = m; this.xt = xt; this.yt = yt; this.xc = xc; this.yc = yc; this.z = z; } + @Override public void run() { final tileCoordinates t = new tileCoordinates(this.xt, this.yt, this.z); BufferedImage bi = null; @@ -111,7 +112,7 @@ public class OSMTile { // download resource using the crawler and keep resource in memory if possible Response entry = null; try { - entry = Switchboard.getSwitchboard().loader.load(Switchboard.getSwitchboard().loader.request(tileURL, false, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, true); + entry = Switchboard.getSwitchboard().loader.load(Switchboard.getSwitchboard().loader.request(tileURL, false, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null); } catch (final IOException e) { Log.logWarning("OSMTile", "cannot load: " + e.getMessage()); return null; diff --git a/source/net/yacy/peers/operation/yacyRelease.java b/source/net/yacy/peers/operation/yacyRelease.java index 796686dee..5f2af9c6f 100644 --- a/source/net/yacy/peers/operation/yacyRelease.java +++ b/source/net/yacy/peers/operation/yacyRelease.java @@ -240,7 +240,7 @@ public final class yacyRelease extends yacyVersion { try { final DigestURI uri = location.getLocationURL(); Thread.currentThread().setName("allReleaseFrom - host " + uri.getHost()); // makes it more easy to see which release blocks process in thread dump - scraper = Switchboard.getSwitchboard().loader.loadDocument(uri, CacheStrategy.NOCACHE); + scraper = Switchboard.getSwitchboard().loader.loadDocument(uri, CacheStrategy.NOCACHE, null); } catch (final IOException e) { return null; } diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java index 2b9a13d0a..e002c7705 100644 --- a/source/net/yacy/repository/LoaderDispatcher.java +++ b/source/net/yacy/repository/LoaderDispatcher.java @@ -133,9 +133,9 @@ public final class LoaderDispatcher { 0); } - public void load(final DigestURI url, final CacheStrategy cacheStratgy, final int maxFileSize, final File targetFile) throws IOException { + public void load(final DigestURI url, final CacheStrategy cacheStratgy, final int maxFileSize, final File targetFile, BlacklistType blacklistType) throws IOException { - final byte[] b = load(request(url, false, true), cacheStratgy, maxFileSize, true).getContent(); + final byte[] b = load(request(url, false, true), cacheStratgy, maxFileSize, blacklistType).getContent(); if (b == null) throw new IOException("load == null"); final File tmp = new File(targetFile.getAbsolutePath() + ".tmp"); @@ -146,11 +146,11 @@ public final class LoaderDispatcher { tmp.renameTo(targetFile); } - public Response load(final Request request, final CacheStrategy cacheStrategy, final boolean checkBlacklist) throws IOException { - return load(request, cacheStrategy, protocolMaxFileSize(request.url()), checkBlacklist); + public Response load(final Request request, final CacheStrategy cacheStrategy, final BlacklistType blacklistType) throws IOException { + return load(request, cacheStrategy, protocolMaxFileSize(request.url()), blacklistType); } - public Response load(final Request request, final CacheStrategy cacheStrategy, final int maxFileSize, final boolean checkBlacklist) throws IOException { + public Response load(final Request request, final CacheStrategy cacheStrategy, final int maxFileSize, final BlacklistType blacklistType) throws IOException { Semaphore check = this.loaderSteering.get(request.url()); if (check != null) { // a loading process may be going on for that url @@ -161,7 +161,7 @@ public final class LoaderDispatcher { this.loaderSteering.put(request.url(), new Semaphore(0)); try { - final Response response = loadInternal(request, cacheStrategy, maxFileSize, checkBlacklist); + final Response response = loadInternal(request, cacheStrategy, maxFileSize, blacklistType); check = this.loaderSteering.remove(request.url()); if (check != null) check.release(1000); return response; @@ -181,7 +181,7 @@ public final class LoaderDispatcher { * @return the loaded entity in a Response object * @throws IOException */ - private Response loadInternal(final Request request, CacheStrategy cacheStrategy, final int maxFileSize, final boolean checkBlacklist) throws IOException { + private Response loadInternal(final Request request, CacheStrategy cacheStrategy, final int maxFileSize, final BlacklistType blacklistType) throws IOException { // get the protocol of the next URL final DigestURI url = request.url(); if (url.isFile() || url.isSMB()) cacheStrategy = CacheStrategy.NOCACHE; // load just from the file system @@ -189,7 +189,7 @@ public final class LoaderDispatcher { final String host = url.getHost(); // check if url is in blacklist - if (checkBlacklist && host != null && Switchboard.urlBlacklist.isListed(BlacklistType.CRAWLER, host.toLowerCase(), url.getFile())) { + if (blacklistType != null && host != null && Switchboard.urlBlacklist.isListed(blacklistType, host.toLowerCase(), url.getFile())) { this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1); throw new IOException("DISPATCHER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist."); } @@ -271,7 +271,7 @@ public final class LoaderDispatcher { // load resource from the internet Response response = null; if (protocol.equals("http") || protocol.equals("https")) { - response = this.httpLoader.load(request, maxFileSize, checkBlacklist); + response = this.httpLoader.load(request, maxFileSize, blacklistType); } else if (protocol.equals("ftp")) { response = this.ftpLoader.load(request, true); } else if (protocol.equals("smb")) { @@ -326,19 +326,19 @@ public final class LoaderDispatcher { * @return the content as {@link byte[]} * @throws IOException */ - public byte[] loadContent(final Request request, final CacheStrategy cacheStrategy) throws IOException { + public byte[] loadContent(final Request request, final CacheStrategy cacheStrategy, BlacklistType blacklistType) throws IOException { // try to download the resource using the loader - final Response entry = load(request, cacheStrategy, true); + final Response entry = load(request, cacheStrategy, blacklistType); if (entry == null) return null; // not found in web // read resource body (if it is there) return entry.getContent(); } - public Document[] loadDocuments(final Request request, final CacheStrategy cacheStrategy, final int timeout, final int maxFileSize) throws IOException, Parser.Failure { + public Document[] loadDocuments(final Request request, final CacheStrategy cacheStrategy, final int timeout, final int maxFileSize, BlacklistType blacklistType) throws IOException, Parser.Failure { // load resource - final Response response = load(request, cacheStrategy, maxFileSize, true); + final Response response = load(request, cacheStrategy, maxFileSize, blacklistType); final DigestURI url = request.url(); if (response == null) throw new IOException("no Response for url " + url); @@ -349,10 +349,10 @@ public final class LoaderDispatcher { return response.parse(); } - public Document loadDocument(final DigestURI location, final CacheStrategy cachePolicy) throws IOException { + public Document loadDocument(final DigestURI location, final CacheStrategy cachePolicy, BlacklistType blacklistType) throws IOException { // load resource Request request = request(location, true, false); - final Response response = this.load(request, cachePolicy, true); + final Response response = this.load(request, cachePolicy, blacklistType); final DigestURI url = request.url(); if (response == null) throw new IOException("no Response for url " + url); @@ -375,8 +375,8 @@ public final class LoaderDispatcher { * @return a map from URLs to the anchor texts of the urls * @throws IOException */ - public final Map loadLinks(final DigestURI url, final CacheStrategy cacheStrategy) throws IOException { - final Response response = load(request(url, true, false), cacheStrategy, Integer.MAX_VALUE, true); + public final Map loadLinks(final DigestURI url, final CacheStrategy cacheStrategy, BlacklistType blacklistType) throws IOException { + final Response response = load(request(url, true, false), cacheStrategy, Integer.MAX_VALUE, blacklistType); if (response == null) throw new IOException("response == null"); final ResponseHeader responseHeader = response.getResponseHeader(); if (response.getContent() == null) throw new IOException("resource == null"); @@ -405,12 +405,12 @@ public final class LoaderDispatcher { } } - public void loadIfNotExistBackground(final DigestURI url, final File cache, final int maxFileSize) { - new Loader(url, cache, maxFileSize, CacheStrategy.IFEXIST).start(); + public void loadIfNotExistBackground(final DigestURI url, final File cache, final int maxFileSize, BlacklistType blacklistType) { + new Loader(url, cache, maxFileSize, CacheStrategy.IFEXIST, blacklistType).start(); } - public void loadIfNotExistBackground(final DigestURI url, final int maxFileSize) { - new Loader(url, null, maxFileSize, CacheStrategy.IFEXIST).start(); + public void loadIfNotExistBackground(final DigestURI url, final int maxFileSize, BlacklistType blacklistType) { + new Loader(url, null, maxFileSize, CacheStrategy.IFEXIST, blacklistType).start(); } private class Loader extends Thread { @@ -419,12 +419,14 @@ public final class LoaderDispatcher { private final File cache; private final int maxFileSize; private final CacheStrategy cacheStrategy; + private final BlacklistType blacklistType; - public Loader(final DigestURI url, final File cache, final int maxFileSize, final CacheStrategy cacheStrategy) { + public Loader(final DigestURI url, final File cache, final int maxFileSize, final CacheStrategy cacheStrategy, BlacklistType blacklistType) { this.url = url; this.cache = cache; this.maxFileSize = maxFileSize; this.cacheStrategy = cacheStrategy; + this.blacklistType = blacklistType; } @Override @@ -432,7 +434,7 @@ public final class LoaderDispatcher { if (this.cache != null && this.cache.exists()) return; try { // load from the net - final Response response = load(request(this.url, false, true), this.cacheStrategy, this.maxFileSize, true); + final Response response = load(request(this.url, false, true), this.cacheStrategy, this.maxFileSize, this.blacklistType); final byte[] b = response.getContent(); if (this.cache != null) FileUtils.copy(b, this.cache); } catch (final MalformedURLException e) {} catch (final IOException e) {} diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 47af48e3c..a6148027e 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -141,6 +141,7 @@ import net.yacy.peers.operation.yacyBuildProperties; import net.yacy.peers.operation.yacyRelease; import net.yacy.peers.operation.yacyUpdateLocation; import net.yacy.repository.Blacklist; +import net.yacy.repository.Blacklist.BlacklistType; import net.yacy.repository.FilterEngine; import net.yacy.repository.LoaderDispatcher; import net.yacy.search.index.Segment; @@ -2746,7 +2747,7 @@ public final class Switchboard extends serverSwitch Thread.currentThread().setName("Switchboard.addToIndex:" + urls); try { final Response response = - Switchboard.this.loader.load(request, CacheStrategy.IFFRESH, true); + Switchboard.this.loader.load(request, CacheStrategy.IFFRESH, BlacklistType.CRAWLER); if ( response == null ) { throw new IOException("response == null"); } @@ -3173,7 +3174,7 @@ public final class Switchboard extends serverSwitch final Map links; searchEvent.getRankingResult().oneFeederStarted(); try { - links = Switchboard.this.loader.loadLinks(url, CacheStrategy.NOCACHE); + links = Switchboard.this.loader.loadLinks(url, CacheStrategy.NOCACHE, BlacklistType.SEARCH); if ( links != null ) { final Iterator i = links.keySet().iterator(); while ( i.hasNext() ) { @@ -3212,7 +3213,7 @@ public final class Switchboard extends serverSwitch final Map links; DigestURI url; try { - links = Switchboard.this.loader.loadLinks(startUrl, CacheStrategy.IFFRESH); + links = Switchboard.this.loader.loadLinks(startUrl, CacheStrategy.IFFRESH, BlacklistType.SEARCH); if (links != null) { if (links.size() < 1000) { // limit to 1000 to skip large index pages final Iterator i = links.keySet().iterator(); @@ -3276,7 +3277,7 @@ public final class Switchboard extends serverSwitch searchEvent.getRankingResult().oneFeederStarted(); try { final Response response = - sb.loader.load(sb.loader.request(url, true, false), CacheStrategy.NOCACHE, true); + sb.loader.load(sb.loader.request(url, true, false), CacheStrategy.NOCACHE, BlacklistType.SEARCH); final byte[] resource = (response == null) ? null : response.getContent(); //System.out.println("BLEKKO: " + UTF8.String(resource)); rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource); diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index 6fe2bc9a8..961ddfbd8 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -538,7 +538,7 @@ public class Segment { try { // parse the resource - final Document document = Document.mergeDocuments(entry.url(), null, loader.loadDocuments(loader.request(entry.url(), true, false), cacheStrategy, 10000, Integer.MAX_VALUE)); + final Document document = Document.mergeDocuments(entry.url(), null, loader.loadDocuments(loader.request(entry.url(), true, false), cacheStrategy, 10000, Integer.MAX_VALUE, null)); if (document == null) { // delete just the url entry urlMetadata().remove(urlhash); diff --git a/source/net/yacy/search/snippet/MediaSnippet.java b/source/net/yacy/search/snippet/MediaSnippet.java index 16765e59f..ade87edba 100644 --- a/source/net/yacy/search/snippet/MediaSnippet.java +++ b/source/net/yacy/search/snippet/MediaSnippet.java @@ -142,7 +142,7 @@ public class MediaSnippet implements Comparable, Comparator(); diff --git a/source/net/yacy/search/snippet/TextSnippet.java b/source/net/yacy/search/snippet/TextSnippet.java index 60da24613..417bb6f9a 100644 --- a/source/net/yacy/search/snippet/TextSnippet.java +++ b/source/net/yacy/search/snippet/TextSnippet.java @@ -53,6 +53,7 @@ import net.yacy.kelondro.order.Base64Order; import net.yacy.kelondro.util.ByteArray; import net.yacy.kelondro.util.ByteBuffer; import net.yacy.peers.RemoteSearch; +import net.yacy.repository.Blacklist.BlacklistType; import net.yacy.repository.LoaderDispatcher; import net.yacy.search.Switchboard; import de.anomic.crawler.retrieval.Request; @@ -209,7 +210,7 @@ public class TextSnippet implements Comparable, Comparator, Comparator