- fixed a problem loading images through YaCy's document loader:
  the loader denied non-parseable documents, which excluded all images
- fixed url of osm tile server

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6287 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
orbiter 2009-09-03 11:46:08 +00:00
parent 67eddaec4b
commit 44579fa06d
5 changed files with 33 additions and 37 deletions

View File

@ -562,7 +562,7 @@ public class CrawlQueues {
// returns null if everything went fine, a fail reason string if a problem occurred
try {
request.setStatus("loading", serverProcessorJob.STATUS_RUNNING);
Response response = sb.loader.load(request);
Response response = sb.loader.load(request, true);
if (response == null) {
request.setStatus("error", serverProcessorJob.STATUS_FINISHED);
if (log.isFine()) log.logFine("problem loading " + request.url().toString() + ": no content (possibly caused by cache policy)");

View File

@ -73,14 +73,14 @@ public final class HTTPLoader {
this.socketTimeout = (int) sb.getConfigLong("crawler.clientTimeout", 10000);
}
public Response load(final Request entry) throws IOException {
public Response load(final Request entry, final boolean acceptOnlyParseable) throws IOException {
long start = System.currentTimeMillis();
Response doc = load(entry, DEFAULT_CRAWLING_RETRY_COUNT);
Response doc = load(entry, acceptOnlyParseable, DEFAULT_CRAWLING_RETRY_COUNT);
Latency.update(entry.url().hash().substring(6), entry.url().getHost(), System.currentTimeMillis() - start);
return doc;
}
private Response load(final Request request, final int retryCount) throws IOException {
private Response load(final Request request, boolean acceptOnlyParseable, final int retryCount) throws IOException {
if (retryCount < 0) {
sb.crawlQueues.errorURL.newEntry(request, sb.peers.mySeed().hash, new Date(), 1, "redirection counter exceeded").store();
@ -94,11 +94,13 @@ public final class HTTPLoader {
if (port < 0) port = (ssl) ? 443 : 80;
// if not the right file type then reject file
String supportError = Parser.supportsExtension(request.url());
if (supportError != null) {
sb.crawlQueues.errorURL.newEntry(request, sb.peers.mySeed().hash, new Date(), 1, supportError);
throw new IOException("REJECTED WRONG EXTENSION TYPE: " + supportError);
}
if (acceptOnlyParseable) {
String supportError = Parser.supportsExtension(request.url());
if (supportError != null) {
sb.crawlQueues.errorURL.newEntry(request, sb.peers.mySeed().hash, new Date(), 1, supportError);
throw new IOException("REJECTED WRONG EXTENSION TYPE: " + supportError);
}
}
// check if url is in blacklist
final String hostlow = host.toLowerCase();
@ -134,13 +136,15 @@ public final class HTTPLoader {
if (res.getStatusCode() == 200 || res.getStatusCode() == 203) {
// the transfer is ok
// if the response has not the right file type then reject file
supportError = Parser.supports(request.url(), res.getResponseHeader().mime());
if (supportError != null) {
sb.crawlQueues.errorURL.newEntry(request, sb.peers.mySeed().hash, new Date(), 1, supportError);
throw new IOException("REJECTED WRONG MIME TYPE: " + supportError);
if (acceptOnlyParseable) {
// if the response has not the right file type then reject file
String supportError = Parser.supports(request.url(), res.getResponseHeader().mime());
if (supportError != null) {
sb.crawlQueues.errorURL.newEntry(request, sb.peers.mySeed().hash, new Date(), 1, supportError);
throw new IOException("REJECTED WRONG MIME TYPE: " + supportError);
}
}
// we write the new cache entry to file system directly
res.setAccountingName("CRAWLER");
final byte[] responseBody = res.getData();
@ -199,7 +203,7 @@ public final class HTTPLoader {
// retry crawling with new url
request.redirectURL(redirectionUrl);
return load(request, retryCount - 1);
return load(request, acceptOnlyParseable, retryCount - 1);
}
} else {
// if the response has not the right response type then reject file

View File

@ -79,25 +79,12 @@ public final class LoaderDispatcher {
return (HashSet<String>) this.supportedProtocols.clone();
}
public static byte[] toBytes(Response response) {
if (response == null) return null;
return response.getContent();
}
public Response load(final yacyURL url) throws IOException {
return load(url, true, false);
}
public Response load(final yacyURL url, int cachePolicy) throws IOException {
return load(url, true, false, cachePolicy);
}
public Response load(
final yacyURL url,
final boolean forText,
final boolean global
) throws IOException {
return load(request(url, forText, global));
return load(request(url, forText, global), forText);
}
public Response load(
@ -106,7 +93,7 @@ public final class LoaderDispatcher {
final boolean global,
int cacheStratgy
) throws IOException {
return load(request(url, forText, global), cacheStratgy);
return load(request(url, forText, global), forText, cacheStratgy);
}
public Request request(
@ -134,14 +121,14 @@ public final class LoaderDispatcher {
0);
}
public Response load(final Request request) throws IOException {
public Response load(final Request request, final boolean acceptOnlyParseable) throws IOException {
CrawlProfile.entry crawlProfile = sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle());
int cacheStrategy = CrawlProfile.CACHE_STRATEGY_IFFRESH;
if (crawlProfile != null) cacheStrategy = crawlProfile.cacheStrategy();
return load(request, cacheStrategy);
return load(request, acceptOnlyParseable, cacheStrategy);
}
public Response load(final Request request, int cacheStrategy) throws IOException {
public Response load(final Request request, final boolean acceptOnlyParseable, int cacheStrategy) throws IOException {
// get the protocol of the next URL
final String protocol = request.url().getProtocol();
final String host = request.url().getHost();
@ -223,7 +210,7 @@ public final class LoaderDispatcher {
// load resource from the internet
Response response = null;
if ((protocol.equals("http") || (protocol.equals("https")))) response = httpLoader.load(request);
if ((protocol.equals("http") || (protocol.equals("https")))) response = httpLoader.load(request, acceptOnlyParseable);
if (protocol.equals("ftp")) response = ftpLoader.load(request);
if (response != null) {
// we got something. Now check if we want to store that to the cache

View File

@ -44,6 +44,7 @@ import java.util.Properties;
import javax.swing.event.EventListenerList;
import de.anomic.crawler.retrieval.LoaderDispatcher;
import de.anomic.crawler.retrieval.Response;
import de.anomic.document.parser.htmlParser;
import de.anomic.kelondro.util.FileUtils;
import de.anomic.server.serverCharBuffer;
@ -509,7 +510,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
public static ContentScraper parseResource(final LoaderDispatcher loader, final yacyURL location, int cachePolicy) throws IOException {
// load page
byte[] page = LoaderDispatcher.toBytes(loader.load(location, cachePolicy));
Response r = loader.load(location, true, false, cachePolicy);
byte[] page = (r == null) ? null : r.getContent();
if (page == null) throw new IOException("no response from url " + location.toString());
// scrape content

View File

@ -32,6 +32,7 @@ import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.util.Random;
import javax.imageio.ImageIO;
@ -99,6 +100,7 @@ public class ymageOSM {
}
}
public static final Random r = new Random(System.currentTimeMillis()); // to select a tile server
public static class tileCoordinates {
int xtile, ytile, zoom;
@ -116,7 +118,8 @@ public class ymageOSM {
}
public String url() {
return("http://tile.openstreetmap.org/" + zoom + "/" + xtile + "/" + ytile + ".png");
char server = (char) ((int)'a' + r.nextInt(3));
return("http://" + server + ".tile.openstreetmap.org/" + zoom + "/" + xtile + "/" + ytile + ".png");
}
}