diff --git a/source/de/anomic/crawler/CrawlQueues.java b/source/de/anomic/crawler/CrawlQueues.java
index 4e0cc6bde..869f6ab73 100644
--- a/source/de/anomic/crawler/CrawlQueues.java
+++ b/source/de/anomic/crawler/CrawlQueues.java
@@ -562,7 +562,7 @@ public class CrawlQueues {
         // returns null if everything went fine, a fail reason string if a problem occurred
         try {
             request.setStatus("loading", serverProcessorJob.STATUS_RUNNING);
-            Response response = sb.loader.load(request);
+            Response response = sb.loader.load(request, true);
             if (response == null) {
                 request.setStatus("error", serverProcessorJob.STATUS_FINISHED);
                 if (log.isFine()) log.logFine("problem loading " + request.url().toString() + ": no content (possibly caused by cache policy)");
diff --git a/source/de/anomic/crawler/retrieval/HTTPLoader.java b/source/de/anomic/crawler/retrieval/HTTPLoader.java
index 1668c701f..b49244cca 100644
--- a/source/de/anomic/crawler/retrieval/HTTPLoader.java
+++ b/source/de/anomic/crawler/retrieval/HTTPLoader.java
@@ -73,14 +73,14 @@ public final class HTTPLoader {
         this.socketTimeout = (int) sb.getConfigLong("crawler.clientTimeout", 10000);
     }
     
-    public Response load(final Request entry) throws IOException {
+    public Response load(final Request entry, final boolean acceptOnlyParseable) throws IOException {
         long start = System.currentTimeMillis();
-        Response doc = load(entry, DEFAULT_CRAWLING_RETRY_COUNT);
+        Response doc = load(entry, acceptOnlyParseable, DEFAULT_CRAWLING_RETRY_COUNT);
         Latency.update(entry.url().hash().substring(6), entry.url().getHost(), System.currentTimeMillis() - start);
         return doc;
     }
     
-    private Response load(final Request request, final int retryCount) throws IOException {
+    private Response load(final Request request, boolean acceptOnlyParseable, final int retryCount) throws IOException {
         
         if (retryCount < 0) {
             sb.crawlQueues.errorURL.newEntry(request, sb.peers.mySeed().hash, new Date(), 1, "redirection counter exceeded").store();
@@ -94,11 +94,13 @@ public final class HTTPLoader {
         if (port < 0) port = (ssl) ? 443 : 80;
         
         // if not the right file type then reject file
-        String supportError = Parser.supportsExtension(request.url());
-        if (supportError != null) {
-            sb.crawlQueues.errorURL.newEntry(request, sb.peers.mySeed().hash, new Date(), 1, supportError);
-            throw new IOException("REJECTED WRONG EXTENSION TYPE: " + supportError);
-        }
+        if (acceptOnlyParseable) {
+            String supportError = Parser.supportsExtension(request.url());
+            if (supportError != null) {
+                sb.crawlQueues.errorURL.newEntry(request, sb.peers.mySeed().hash, new Date(), 1, supportError);
+                throw new IOException("REJECTED WRONG EXTENSION TYPE: " + supportError);
+            }
+        }
         
         // check if url is in blacklist
         final String hostlow = host.toLowerCase();
@@ -134,13 +136,15 @@ public final class HTTPLoader {
            if (res.getStatusCode() == 200 || res.getStatusCode() == 203) {
                // the transfer is ok
                
-                // if the response has not the right file type then reject file
-                supportError = Parser.supports(request.url(), res.getResponseHeader().mime());
-                if (supportError != null) {
-                    sb.crawlQueues.errorURL.newEntry(request, sb.peers.mySeed().hash, new Date(), 1, supportError);
-                    throw new IOException("REJECTED WRONG MIME TYPE: " + supportError);
+                if (acceptOnlyParseable) {
+                    // if the response has not the right file type then reject file
+                    String supportError = Parser.supports(request.url(), res.getResponseHeader().mime());
+                    if (supportError != null) {
+                        sb.crawlQueues.errorURL.newEntry(request, sb.peers.mySeed().hash, new Date(), 1, supportError);
+                        throw new IOException("REJECTED WRONG MIME TYPE: " + supportError);
+                    }
                }
-
+                
                // we write the new cache entry to file system directly
                res.setAccountingName("CRAWLER");
                final byte[] responseBody = res.getData();
@@ -199,7 +203,7 @@ public final class HTTPLoader {
                    
                    // retry crawling with new url
                    request.redirectURL(redirectionUrl);
-                    return load(request, retryCount - 1);
+                    return load(request, acceptOnlyParseable, retryCount - 1);
                }
            } else {
                // if the response has not the right response type then reject file
diff --git a/source/de/anomic/crawler/retrieval/LoaderDispatcher.java b/source/de/anomic/crawler/retrieval/LoaderDispatcher.java
index 40086de3a..f02f2582f 100644
--- a/source/de/anomic/crawler/retrieval/LoaderDispatcher.java
+++ b/source/de/anomic/crawler/retrieval/LoaderDispatcher.java
@@ -79,25 +79,12 @@ public final class LoaderDispatcher {
         return (HashSet) this.supportedProtocols.clone();
     }
     
-    public static byte[] toBytes(Response response) {
-        if (response == null) return null;
-        return response.getContent();
-    }
-    
-    public Response load(final yacyURL url) throws IOException {
-        return load(url, true, false);
-    }
-    
-    public Response load(final yacyURL url, int cachePolicy) throws IOException {
-        return load(url, true, false, cachePolicy);
-    }
-    
     public Response load(
             final yacyURL url,
             final boolean forText,
             final boolean global
     ) throws IOException {
-        return load(request(url, forText, global));
+        return load(request(url, forText, global), forText);
     }
     
     public Response load(
@@ -106,7 +93,7 @@ public final class LoaderDispatcher {
             final yacyURL url,
             final boolean forText,
             final boolean global,
             int cacheStratgy
     ) throws IOException {
-        return load(request(url, forText, global), cacheStratgy);
+        return load(request(url, forText, global), forText, cacheStratgy);
     }
     
     public Request request(
@@ -134,14 +121,14 @@ public final class LoaderDispatcher {
                 0);
     }
     
-    public Response load(final Request request) throws IOException {
+    public Response load(final Request request, final boolean acceptOnlyParseable) throws IOException {
         CrawlProfile.entry crawlProfile = sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle());
         int cacheStrategy = CrawlProfile.CACHE_STRATEGY_IFFRESH;
         if (crawlProfile != null) cacheStrategy = crawlProfile.cacheStrategy();
-        return load(request, cacheStrategy);
+        return load(request, acceptOnlyParseable, cacheStrategy);
     }
     
-    public Response load(final Request request, int cacheStrategy) throws IOException {
+    public Response load(final Request request, final boolean acceptOnlyParseable, int cacheStrategy) throws IOException {
         // get the protocol of the next URL
         final String protocol = request.url().getProtocol();
         final String host = request.url().getHost();
@@ -223,7 +210,7 @@ public final class LoaderDispatcher {
         
         // load resource from the internet
         Response response = null;
-        if ((protocol.equals("http") || (protocol.equals("https")))) response = httpLoader.load(request);
+        if ((protocol.equals("http") || (protocol.equals("https")))) response = httpLoader.load(request, acceptOnlyParseable);
         if (protocol.equals("ftp")) response = ftpLoader.load(request);
         if (response != null) {
             // we got something. Now check if we want to store that to the cache
diff --git a/source/de/anomic/document/parser/html/ContentScraper.java b/source/de/anomic/document/parser/html/ContentScraper.java
index 629bcaa94..df42c9295 100644
--- a/source/de/anomic/document/parser/html/ContentScraper.java
+++ b/source/de/anomic/document/parser/html/ContentScraper.java
@@ -44,6 +44,7 @@ import java.util.Properties;
 import javax.swing.event.EventListenerList;
 
 import de.anomic.crawler.retrieval.LoaderDispatcher;
+import de.anomic.crawler.retrieval.Response;
 import de.anomic.document.parser.htmlParser;
 import de.anomic.kelondro.util.FileUtils;
 import de.anomic.server.serverCharBuffer;
@@ -509,7 +510,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
     
     public static ContentScraper parseResource(final LoaderDispatcher loader, final yacyURL location, int cachePolicy) throws IOException {
         // load page
-        byte[] page = LoaderDispatcher.toBytes(loader.load(location, cachePolicy));
+        Response r = loader.load(location, true, false, cachePolicy);
+        byte[] page = (r == null) ? null : r.getContent();
         if (page == null) throw new IOException("no response from url " + location.toString());
         
         // scrape content
diff --git a/source/de/anomic/ymage/ymageOSM.java b/source/de/anomic/ymage/ymageOSM.java
index a3dd2bd4a..056db0486 100644
--- a/source/de/anomic/ymage/ymageOSM.java
+++ b/source/de/anomic/ymage/ymageOSM.java
@@ -32,6 +32,7 @@
 import java.io.EOFException;
 import java.io.IOException;
 import java.io.InputStream;
 import java.net.MalformedURLException;
+import java.util.Random;
 
 import javax.imageio.ImageIO;
@@ -99,6 +100,7 @@ public class ymageOSM {
         }
     }
     
+    public static final Random r = new Random(System.currentTimeMillis()); // to select tile server
     public static class tileCoordinates {
         
         int xtile, ytile, zoom;
@@ -116,7 +118,8 @@ public class ymageOSM {
         }
         
         public String url() {
-            return("http://tile.openstreetmap.org/" + zoom + "/" + xtile + "/" + ytile + ".png");
+            char server = (char) ((int)'a' + r.nextInt(3));
+            return("http://" + server + ".tile.openstreetmap.org/" + zoom + "/" + xtile + "/" + ytile + ".png");
         }
     }
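
Note: the patch threads a new acceptOnlyParseable flag from CrawlQueues through LoaderDispatcher down into HTTPLoader, so the extension and MIME checks only run for callers that actually need parseable documents; callers fetching raw resources (images, for example) can pass false and bypass the filter. The following standalone sketch illustrates the gating pattern only; supportsExtension() and load() here are simplified stand-ins for YaCy's Parser and loader, not the project's real API.

import java.io.IOException;

public class ParseableGateDemo {

    // stand-in for Parser.supportsExtension(): null means "supported",
    // otherwise a reason string (simplified assumption, not YaCy's real logic)
    static String supportsExtension(final String url) {
        return (url.endsWith(".html") || url.endsWith(".txt")) ? null : "unsupported extension";
    }

    // the same gate the patch adds to HTTPLoader.load(): the reject path
    // is only taken when the caller asked for parseable content
    static byte[] load(final String url, final boolean acceptOnlyParseable) throws IOException {
        if (acceptOnlyParseable) {
            final String supportError = supportsExtension(url);
            if (supportError != null) {
                throw new IOException("REJECTED WRONG EXTENSION TYPE: " + supportError);
            }
        }
        return new byte[0]; // actual HTTP fetch elided
    }

    public static void main(final String[] args) throws IOException {
        load("http://example.org/map.png", false);  // passes: gate bypassed
        load("http://example.org/page.html", true); // passes: parseable extension
        // load("http://example.org/map.png", true) would throw IOException
    }
}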
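The ymageOSM change spreads tile requests over the three equivalent mirrors a/b/c.tile.openstreetmap.org instead of always hitting the same host. A minimal self-contained sketch of that URL rotation, with tileCoordinates reduced to plain parameters:

import java.util.Random;

public class TileUrlDemo {

    // one shared generator, as in ymageOSM; clock seeding is sufficient here
    private static final Random r = new Random(System.currentTimeMillis());

    // zoom/xtile/ytile in the usual slippy-map tile scheme
    static String url(final int zoom, final int xtile, final int ytile) {
        final char server = (char) ('a' + r.nextInt(3)); // picks 'a', 'b' or 'c'
        return "http://" + server + ".tile.openstreetmap.org/" + zoom + "/" + xtile + "/" + ytile + ".png";
    }

    public static void main(final String[] args) {
        // prints e.g. http://b.tile.openstreetmap.org/10/511/340.png
        System.out.println(url(10, 511, 340));
    }
}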