- fixed a problem loading images through YaCy's document loader:
  the loader denied non-parseable documents, which excluded all images
- fixed the URL of the OSM tile server

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6287 6c8d7289-2bf4-0310-a012-ef5d649a1542
parent 67eddaec4b
commit 44579fa06d
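The gist of the loader fix, as a minimal sketch: the parseability check now runs only when the caller asks for parseable content, so image requests pass through. The class and helper names below (LoaderSketch, canParse, fetch) are illustrative stand-ins, not YaCy API; only the acceptOnlyParseable flag and the rejection behaviour mirror the patch that follows.

    import java.io.IOException;

    // Sketch of the gating introduced by this commit. In YaCy the real check
    // is Parser.supportsExtension / Parser.supports inside HTTPLoader.load.
    public class LoaderSketch {

        // stand-in for Parser.supportsExtension: null means "supported"
        static String canParse(final String url) {
            return url.endsWith(".html") ? null : "no parser for " + url;
        }

        public static byte[] load(final String url, final boolean acceptOnlyParseable) throws IOException {
            if (acceptOnlyParseable) {
                final String supportError = canParse(url);
                if (supportError != null) throw new IOException("REJECTED WRONG EXTENSION TYPE: " + supportError);
            }
            return fetch(url); // an image URL passes through when acceptOnlyParseable is false
        }

        static byte[] fetch(final String url) { return new byte[0]; } // placeholder for the real HTTP fetch
    }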
@@ -562,7 +562,7 @@ public class CrawlQueues {
         // returns null if everything went fine, a fail reason string if a problem occurred
         try {
             request.setStatus("loading", serverProcessorJob.STATUS_RUNNING);
-            Response response = sb.loader.load(request);
+            Response response = sb.loader.load(request, true);
             if (response == null) {
                 request.setStatus("error", serverProcessorJob.STATUS_FINISHED);
                 if (log.isFine()) log.logFine("problem loading " + request.url().toString() + ": no content (possibly caused by cache policy)");
@@ -73,14 +73,14 @@ public final class HTTPLoader {
         this.socketTimeout = (int) sb.getConfigLong("crawler.clientTimeout", 10000);
     }
 
-    public Response load(final Request entry) throws IOException {
+    public Response load(final Request entry, final boolean acceptOnlyParseable) throws IOException {
         long start = System.currentTimeMillis();
-        Response doc = load(entry, DEFAULT_CRAWLING_RETRY_COUNT);
+        Response doc = load(entry, acceptOnlyParseable, DEFAULT_CRAWLING_RETRY_COUNT);
         Latency.update(entry.url().hash().substring(6), entry.url().getHost(), System.currentTimeMillis() - start);
         return doc;
     }
 
-    private Response load(final Request request, final int retryCount) throws IOException {
+    private Response load(final Request request, boolean acceptOnlyParseable, final int retryCount) throws IOException {
 
         if (retryCount < 0) {
             sb.crawlQueues.errorURL.newEntry(request, sb.peers.mySeed().hash, new Date(), 1, "redirection counter exceeded").store();
@@ -94,11 +94,13 @@ public final class HTTPLoader {
         if (port < 0) port = (ssl) ? 443 : 80;
 
         // if not the right file type then reject file
-        String supportError = Parser.supportsExtension(request.url());
-        if (supportError != null) {
-            sb.crawlQueues.errorURL.newEntry(request, sb.peers.mySeed().hash, new Date(), 1, supportError);
-            throw new IOException("REJECTED WRONG EXTENSION TYPE: " + supportError);
-        }
+        if (acceptOnlyParseable) {
+            String supportError = Parser.supportsExtension(request.url());
+            if (supportError != null) {
+                sb.crawlQueues.errorURL.newEntry(request, sb.peers.mySeed().hash, new Date(), 1, supportError);
+                throw new IOException("REJECTED WRONG EXTENSION TYPE: " + supportError);
+            }
+        }
 
         // check if url is in blacklist
         final String hostlow = host.toLowerCase();
@@ -134,13 +136,15 @@ public final class HTTPLoader {
         if (res.getStatusCode() == 200 || res.getStatusCode() == 203) {
             // the transfer is ok
 
-            // if the response has not the right file type then reject file
-            supportError = Parser.supports(request.url(), res.getResponseHeader().mime());
-            if (supportError != null) {
-                sb.crawlQueues.errorURL.newEntry(request, sb.peers.mySeed().hash, new Date(), 1, supportError);
-                throw new IOException("REJECTED WRONG MIME TYPE: " + supportError);
+            if (acceptOnlyParseable) {
+                // if the response has not the right file type then reject file
+                String supportError = Parser.supports(request.url(), res.getResponseHeader().mime());
+                if (supportError != null) {
+                    sb.crawlQueues.errorURL.newEntry(request, sb.peers.mySeed().hash, new Date(), 1, supportError);
+                    throw new IOException("REJECTED WRONG MIME TYPE: " + supportError);
+                }
             }
 
             // we write the new cache entry to file system directly
             res.setAccountingName("CRAWLER");
             final byte[] responseBody = res.getData();
@@ -199,7 +203,7 @@ public final class HTTPLoader {
 
                 // retry crawling with new url
                 request.redirectURL(redirectionUrl);
-                return load(request, retryCount - 1);
+                return load(request, acceptOnlyParseable, retryCount - 1);
             }
         } else {
             // if the response has not the right response type then reject file
@@ -79,25 +79,12 @@ public final class LoaderDispatcher {
         return (HashSet<String>) this.supportedProtocols.clone();
     }
 
-    public static byte[] toBytes(Response response) {
-        if (response == null) return null;
-        return response.getContent();
-    }
-
-    public Response load(final yacyURL url) throws IOException {
-        return load(url, true, false);
-    }
-
-    public Response load(final yacyURL url, int cachePolicy) throws IOException {
-        return load(url, true, false, cachePolicy);
-    }
-
     public Response load(
             final yacyURL url,
             final boolean forText,
             final boolean global
     ) throws IOException {
-        return load(request(url, forText, global));
+        return load(request(url, forText, global), forText);
     }
 
     public Response load(
@@ -106,7 +93,7 @@ public final class LoaderDispatcher {
             final boolean global,
             int cacheStratgy
     ) throws IOException {
-        return load(request(url, forText, global), cacheStratgy);
+        return load(request(url, forText, global), forText, cacheStratgy);
     }
 
     public Request request(
@@ -134,14 +121,14 @@ public final class LoaderDispatcher {
                 0);
     }
 
-    public Response load(final Request request) throws IOException {
+    public Response load(final Request request, final boolean acceptOnlyParseable) throws IOException {
         CrawlProfile.entry crawlProfile = sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle());
         int cacheStrategy = CrawlProfile.CACHE_STRATEGY_IFFRESH;
         if (crawlProfile != null) cacheStrategy = crawlProfile.cacheStrategy();
-        return load(request, cacheStrategy);
+        return load(request, acceptOnlyParseable, cacheStrategy);
     }
 
-    public Response load(final Request request, int cacheStrategy) throws IOException {
+    public Response load(final Request request, final boolean acceptOnlyParseable, int cacheStrategy) throws IOException {
         // get the protocol of the next URL
         final String protocol = request.url().getProtocol();
         final String host = request.url().getHost();
@@ -223,7 +210,7 @@ public final class LoaderDispatcher {
 
         // load resource from the internet
         Response response = null;
-        if ((protocol.equals("http") || (protocol.equals("https")))) response = httpLoader.load(request);
+        if ((protocol.equals("http") || (protocol.equals("https")))) response = httpLoader.load(request, acceptOnlyParseable);
         if (protocol.equals("ftp")) response = ftpLoader.load(request);
         if (response != null) {
             // we got something. Now check if we want to store that to the cache
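With the reworked LoaderDispatcher API, callers now state explicitly whether only parseable content is acceptable; the dispatcher forwards forText as acceptOnlyParseable. A hedged usage sketch (request, sb and tileUrl are assumed, already-constructed objects, not shown in this patch):

    // crawler path, as in CrawlQueues above: only parseable documents pass
    Response doc = sb.loader.load(request, true);

    // image path: forText == false is forwarded as acceptOnlyParseable == false,
    // so non-parseable content such as PNG tiles is no longer rejected
    // (tileUrl is an assumed, already-constructed yacyURL)
    Response img = loader.load(tileUrl, false, false);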
@@ -44,6 +44,7 @@ import java.util.Properties;
 import javax.swing.event.EventListenerList;
 
 import de.anomic.crawler.retrieval.LoaderDispatcher;
+import de.anomic.crawler.retrieval.Response;
 import de.anomic.document.parser.htmlParser;
 import de.anomic.kelondro.util.FileUtils;
 import de.anomic.server.serverCharBuffer;
@@ -509,7 +510,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
 
     public static ContentScraper parseResource(final LoaderDispatcher loader, final yacyURL location, int cachePolicy) throws IOException {
         // load page
-        byte[] page = LoaderDispatcher.toBytes(loader.load(location, cachePolicy));
+        Response r = loader.load(location, true, false, cachePolicy);
+        byte[] page = (r == null) ? null : r.getContent();
         if (page == null) throw new IOException("no response from url " + location.toString());
 
         // scrape content
@@ -32,6 +32,7 @@ import java.io.EOFException;
 import java.io.IOException;
 import java.io.InputStream;
 import java.net.MalformedURLException;
+import java.util.Random;
 
 import javax.imageio.ImageIO;
 
@@ -99,6 +100,7 @@ public class ymageOSM {
         }
     }
 
+    public static final Random r = new Random(System.currentTimeMillis()); // to select tile server
     public static class tileCoordinates {
 
         int xtile, ytile, zoom;
@@ -116,7 +118,8 @@ public class ymageOSM {
         }
 
         public String url() {
-            return("http://tile.openstreetmap.org/" + zoom + "/" + xtile + "/" + ytile + ".png");
+            char server = (char) ((int)'a' + r.nextInt(3));
+            return("http://" + server + ".tile.openstreetmap.org/" + zoom + "/" + xtile + "/" + ytile + ".png");
         }
 
     }
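For reference, the tile-server rotation above as a self-contained sketch. OsmTileUrl and tileUrl are illustrative names; the rotation over the a/b/c mirror hosts follows the patch:

    import java.util.Random;

    public class OsmTileUrl {
        // one shared RNG, as in the patch above, to spread requests over mirrors
        private static final Random RND = new Random(System.currentTimeMillis());

        public static String tileUrl(final int zoom, final int xtile, final int ytile) {
            final char server = (char) ('a' + RND.nextInt(3)); // 'a', 'b' or 'c'
            return "http://" + server + ".tile.openstreetmap.org/" + zoom + "/" + xtile + "/" + ytile + ".png";
        }

        public static void main(final String[] args) {
            // prints e.g. http://b.tile.openstreetmap.org/10/511/340.png
            System.out.println(tileUrl(10, 511, 340));
        }
    }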