Process large or local file images by dealing directly with the content InputStream.
luc 2015-11-18 10:15:06 +01:00
parent 3c4c77099d
commit f01d49c37a
6 changed files with 728 additions and 239 deletions
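In short, this change replaces loading whole image resources into a byte[] with streaming: a local file is opened directly as an ImageInputStream, and a remote resource is streamed through the loader. A minimal sketch of the new flow, assembled from the pieces below (illustrative, not a verbatim excerpt; loader, url, post, auth, agent, urlString and ext are assumed to be in scope as in ViewImage.java):

// Sketch of the streaming flow introduced by this commit (illustrative only).
ImageInputStream imageInStream;
InputStream inStream = null;
if (url.isFile()) {
    // local file: let ImageIO open it directly, no intermediate byte[]
    imageInStream = ImageIO.createImageInputStream(url.getFSFile());
} else {
    // remote resource: stream it through the loader instead of loadContent()
    inStream = loader.openInputStream(loader.request(url, false, true),
            CacheStrategy.IFEXIST, BlacklistType.SEARCH, agent);
    imageInStream = ImageIO.createImageInputStream(inStream);
}
try {
    EncodedImage encodedImage = ViewImage.parseAndScale(post, auth, urlString, ext, imageInStream);
} finally {
    if (inStream != null) {
        inStream.close(); // ImageInputStream.close() does not close the wrapped stream
    }
}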

View File

@ -28,10 +28,13 @@ import java.awt.Image;
import java.awt.MediaTracker;
import java.awt.image.BufferedImage;
import java.awt.image.Raster;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Map;
import javax.imageio.ImageIO;
import javax.imageio.stream.ImageInputStream;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.yacy.CacheStrategy;
@ -42,11 +45,11 @@ import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.storage.ConcurrentARC;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.data.URLLicense;
import net.yacy.document.ImageParser;
import net.yacy.kelondro.util.MemoryControl;
import net.yacy.kelondro.workflow.WorkflowProcessor;
import net.yacy.peers.graphics.EncodedImage;
import net.yacy.repository.Blacklist.BlacklistType;
import net.yacy.repository.LoaderDispatcher;
import net.yacy.search.Switchboard;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
@ -74,8 +77,8 @@ public class ViewImage {
* when the specified url is malformed, or a read/write error
* occurred, or the input or target image format is not supported.
* Should end in an HTTP 500 error whose processing is more
* consistent across browsers than a response with zero content
* bytes.
*/
public static Object respond(final RequestHeader header, final serverObjects post, final serverSwitch env)
throws IOException {
@ -113,39 +116,81 @@ public class ViewImage {
if (image != null) {
encodedImage = new EncodedImage(image, ext, post.getBoolean("isStatic"));
} else {
byte[] resourceb = null;
if (url != null)
try {
String agentName = post.get("agentName", auth ? ClientIdentification.yacyIntranetCrawlerAgentName
: ClientIdentification.yacyInternetCrawlerAgentName);
ClientIdentification.Agent agent = ClientIdentification.getAgent(agentName);
resourceb = sb.loader.loadContent(sb.loader.request(url, false, true), CacheStrategy.IFEXIST,
BlacklistType.SEARCH, agent);
} catch (final IOException e) {
ConcurrentLog.fine("ViewImage", "cannot load: " + e.getMessage());
throw e;
}
boolean okToCache = true;
if (resourceb == null) {
/*
* Throw an exception, which will end in an HTTP 500 response,
* better handled by browsers than an empty image
*/
throw new IOException("Image could not be loaded.");
}
String urlExt = MultiProtocolURL.getFileExtension(url.getFileName());
if (ext != null && ext.equalsIgnoreCase(urlExt) && isBrowserRendered(urlExt)) {
return new ByteArrayInputStream(resourceb);
return openInputStream(post, sb.loader, auth, url);
}
// read image
encodedImage = parseAndScale(post, auth, urlString, ext, okToCache, resourceb);
ImageInputStream imageInStream = null;
InputStream inStream = null;
/*
* When opening a file, the most efficient way is to open the
* ImageInputStream directly on the file
*/
if (url.isFile()) {
imageInStream = ImageIO.createImageInputStream(url.getFSFile());
} else {
inStream = openInputStream(post, sb.loader, auth, url);
imageInStream = ImageIO.createImageInputStream(inStream);
}
try {
// read image
encodedImage = parseAndScale(post, auth, urlString, ext, imageInStream);
} finally {
/*
* the imageInStream.close() method doesn't close the source input
* stream
*/
if (inStream != null) {
try {
inStream.close();
} catch (IOException ignored) {
}
}
}
}
return encodedImage;
}
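The file branch above matters because ImageIO.createImageInputStream returns different implementations depending on its argument: on a standard JRE a File is backed by a FileImageInputStream that seeks directly in the file, while a plain InputStream has to be buffered in a temporary file or in memory to support seeking. A small illustration (hypothetical file name):

// Illustration of standard javax.imageio behavior (hypothetical file "example.png").
File f = new File("example.png");
// Backed by a FileImageInputStream: random access directly on the file.
ImageInputStream onFile = ImageIO.createImageInputStream(f);
// Backed by a FileCacheImageInputStream or MemoryCacheImageInputStream:
// the stream content must be cached to allow seeking, which costs extra copies.
ImageInputStream onStream = ImageIO.createImageInputStream(new FileInputStream(f));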
/**
* Open an input stream on the image url using the provided loader. All
* parameters must not be null.
*
* @param post
* post parameters.
* @param loader
* resources loader.
* @param auth
* true when user has credentials to load full images.
* @param url
* image url.
* @return an open input stream instance (don't forget to close it).
* @throws IOException
* when a read/write error occurred.
*/
private static InputStream openInputStream(final serverObjects post, final LoaderDispatcher loader,
final boolean auth, DigestURL url) throws IOException {
InputStream inStream = null;
if (url != null) {
try {
String agentName = post.get("agentName", auth ? ClientIdentification.yacyIntranetCrawlerAgentName
: ClientIdentification.yacyInternetCrawlerAgentName);
ClientIdentification.Agent agent = ClientIdentification.getAgent(agentName);
inStream = loader.openInputStream(loader.request(url, false, true), CacheStrategy.IFEXIST,
BlacklistType.SEARCH, agent);
} catch (final IOException e) {
ConcurrentLog.fine("ViewImage", "cannot load: " + e.getMessage());
throw e;
}
}
if (inStream == null) {
throw new IOException("Input stream could no be open");
}
return inStream;
}
/**
* @param formatName
* informal file format name. For example : "png".
@ -165,31 +210,35 @@ public class ViewImage {
}
/**
* Process a source image to try to produce an EncodedImage instance,
* possibly scaled and clipped depending on post parameters. When
* processing is done, imageInStream is closed.
*
* @param post
* request post parameters. Must not be null.
* @param auth
* true when access rights are OK.
* @param urlString
* image source URL as String. Must not be null.
* @param ext
* target image file format. May be null.
* @param imageInStream
* open stream on image content. Must not be null.
* @return an EncodedImage instance.
* @throws IOException
* when the image could not be parsed or encoded to the specified format
*/
protected static EncodedImage parseAndScale(serverObjects post, boolean auth, String urlString, String ext,
boolean okToCache, byte[] resourceb) throws IOException {
ImageInputStream imageInStream) throws IOException {
EncodedImage encodedImage = null;
Image image = ImageParser.parse(urlString, resourceb);
Image image = ImageIO.read(imageInStream);
if (image == null) {
try {
/* When a null image is returned, we have to close the stream */
imageInStream.close();
} catch (IOException ignoredException) {
}
/*
* Throw an exception, which will end in an HTTP 500 response, better
* handled by browsers than an empty image
@ -197,53 +246,52 @@ public class ViewImage {
throw new IOException("Image format is not supported.");
}
if (image != null) {
int maxwidth = post.getInt("maxwidth", 0);
int maxheight = post.getInt("maxheight", 0);
final boolean quadratic = post.containsKey("quadratic");
boolean isStatic = post.getBoolean("isStatic");
if (!auth || maxwidth != 0 || maxheight != 0) {
int maxwidth = post.getInt("maxwidth", 0);
int maxheight = post.getInt("maxheight", 0);
final boolean quadratic = post.containsKey("quadratic");
boolean isStatic = post.getBoolean("isStatic");
if (!auth || maxwidth != 0 || maxheight != 0) {
// find original size
int h = image.getHeight(null);
int w = image.getWidth(null);
// find original size
final int originWidth = image.getWidth(null);
final int originHeigth = image.getHeight(null);
// in case of not-authorized access shrink the image to
// prevent
// copyright problems, so that images are not larger than
// thumbnails
Dimension maxDimensions = calculateMaxDimensions(auth, w, h, maxwidth, maxheight);
// in case of not-authorized access shrink the image to
// prevent
// copyright problems, so that images are not larger than
// thumbnails
Dimension maxDimensions = calculateMaxDimensions(auth, originWidth, originHeigth, maxwidth, maxheight);
// if a quadratic flag is set, we cut the image out to be in
// quadratic shape
if (quadratic && w != h) {
image = makeSquare(image, h, w);
h = image.getHeight(null);
w = image.getWidth(null);
}
Dimension finalDimensions = calculateDimensions(w, h, maxDimensions);
if (w != finalDimensions.width && h != finalDimensions.height) {
image = scale(finalDimensions.width, finalDimensions.height, image);
}
if ((finalDimensions.width == 16) && (finalDimensions.height == 16) && okToCache) {
// this might be a favicon, store image to cache for
// faster
// re-load later on
iconcache.put(urlString, image);
}
// if a quadratic flag is set, we cut the image out to be in
// quadratic shape
int w = originWidth;
int h = originHeigth;
if (quadratic && originWidth != originHeigth) {
image = makeSquare(image, originHeigth, originWidth);
h = image.getHeight(null);
w = image.getWidth(null);
}
/*
* An error can still occur when transcoding from buffered image to
* target ext : in that case return null
*/
encodedImage = new EncodedImage(image, ext, isStatic);
if (encodedImage.getImage().length() == 0) {
throw new IOException("Image could not be encoded to format : " + ext);
Dimension finalDimensions = calculateDimensions(w, h, maxDimensions);
if (w != finalDimensions.width && h != finalDimensions.height) {
image = scale(finalDimensions.width, finalDimensions.height, image);
}
if (finalDimensions.width == 16 && finalDimensions.height == 16) {
// this might be a favicon, store image to cache for
// faster
// re-load later on
iconcache.put(urlString, image);
}
}
/*
* An error can still occur when transcoding from buffered image to
* target ext : in that case return null
*/
encodedImage = new EncodedImage(image, ext, isStatic);
if (encodedImage.getImage().length() == 0) {
throw new IOException("Image could not be encoded to format : " + ext);
}
return encodedImage;
}

View File

@ -0,0 +1,125 @@
/**
* HTTPInputStream
* Copyright 2014 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
* First published 26.11.2014 on http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.util;
import java.io.IOException;
import java.io.InputStream;
import net.yacy.cora.protocol.http.HTTPClient;
/**
* An HTTP InputStream delegating to HTTPClient. Use it when streaming HTTP content, so that closing the stream also properly finishes the HTTP client.
* @author luc
*
*/
public class HTTPInputStream extends InputStream {
/** HTTP client */
private HTTPClient httpClient;
/** Encapsulated HTTP content stream */
private InputStream contentStream;
/**
* Constructs from a httpClient.
* @param httpClient a httpClient with accessible stream content.
* @throws IOException when the content stream cannot be opened on httpClient
*/
public HTTPInputStream(HTTPClient httpClient) throws IOException {
if(httpClient == null) {
throw new IllegalArgumentException("httpClient is null");
}
this.httpClient = httpClient;
this.contentStream = httpClient.getContentstream();
if(this.contentStream == null) {
throw new IOException("content stream is null");
}
}
/**
* Properly close the HTTP connection held by httpClient
*/
@Override
public void close() throws IOException {
httpClient.finish();
}
@Override
public int read() throws IOException {
return contentStream.read();
}
@Override
public int hashCode() {
return contentStream.hashCode();
}
@Override
public int read(byte[] b) throws IOException {
return contentStream.read(b);
}
@Override
public boolean equals(Object obj) {
return contentStream.equals(obj);
}
@Override
public int read(byte[] b, int off, int len) throws IOException {
return contentStream.read(b, off, len);
}
@Override
public long skip(long n) throws IOException {
return contentStream.skip(n);
}
@Override
public String toString() {
return contentStream.toString();
}
@Override
public int available() throws IOException {
return contentStream.available();
}
@Override
public synchronized void mark(int readlimit) {
contentStream.mark(readlimit);
}
@Override
public synchronized void reset() throws IOException {
contentStream.reset();
}
@Override
public boolean markSupported() {
return contentStream.markSupported();
}
}
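As a usage note, the value of this wrapper is that closing the returned stream also releases the underlying HTTP connection, because close() delegates to httpClient.finish(). A hypothetical caller, assuming an HTTPClient configured as elsewhere in this commit:

// Hypothetical caller: stream the response body, then close to release the connection.
HTTPClient client = new HTTPClient(agent);
client.GET(url, false);
InputStream body = new HTTPInputStream(client);
try {
    byte[] buffer = new byte[8192];
    int read;
    while ((read = body.read(buffer)) != -1) {
        // consume buffer[0 .. read)
    }
} finally {
    body.close(); // delegates to httpClient.finish()
}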

View File

@ -24,7 +24,9 @@
package net.yacy.crawler.retrieval;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.solr.FailCategory;
@ -34,7 +36,9 @@ import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.HTTPInputStream;
import net.yacy.crawler.CrawlSwitchboard;
import net.yacy.crawler.data.Cache;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.Latency;
import net.yacy.kelondro.io.ByteCount;
@ -75,6 +79,208 @@ public final class HTTPLoader {
Latency.updateAfterLoad(entry.url(), System.currentTimeMillis() - start);
return doc;
}
/**
* Open an input stream on a requested HTTP resource. When the resource is small, it is fully loaded and a ByteArrayInputStream instance is returned.
* @param request
* @param profile crawl profile
* @param retryCount remaining redirect retries count
* @param maxFileSize max file size to load. -1 means no limit.
* @param blacklistType blacklist type to use
* @param agent agent identifier
* @return an open input stream. Don't forget to close it.
* @throws IOException when an error occurred
*/
public InputStream openInputStream(final Request request, CrawlProfile profile, final int retryCount,
final int maxFileSize, final BlacklistType blacklistType, final ClientIdentification.Agent agent)
throws IOException {
if (retryCount < 0) {
this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile,
FailCategory.TEMPORARY_NETWORK_FAILURE, "retry counter exceeded", -1);
throw new IOException(
"retry counter exceeded for URL " + request.url().toString() + ". Processing aborted.$");
}
DigestURL url = request.url();
final String host = url.getHost();
if (host == null || host.length() < 2) {
throw new IOException("host is not well-formed: '" + host + "'");
}
final String path = url.getFile();
int port = url.getPort();
final boolean ssl = url.getProtocol().equals("https");
if (port < 0)
port = (ssl) ? 443 : 80;
// check if url is in blacklist
final String hostlow = host.toLowerCase();
if (blacklistType != null && Switchboard.urlBlacklist.isListed(blacklistType, hostlow, path)) {
this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_LOAD_CONTEXT,
"url in blacklist", -1);
throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.$");
}
// resolve yacy and yacyh domains
final AlternativeDomainNames yacyResolver = this.sb.peers;
if (yacyResolver != null) {
final String yAddress = yacyResolver.resolve(host);
if (yAddress != null) {
url = new DigestURL(url.getProtocol() + "://" + yAddress + path);
}
}
// create a request header
final RequestHeader requestHeader = createRequestheader(request, agent);
// HTTP-Client
final HTTPClient client = new HTTPClient(agent);
client.setRedirecting(false); // we want to handle redirection
// ourselves, so we don't index pages
// twice
client.setTimout(this.socketTimeout);
client.setHeader(requestHeader.entrySet());
// send request
client.GET(url, false);
final int statusCode = client.getHttpResponse().getStatusLine().getStatusCode();
final ResponseHeader responseHeader = new ResponseHeader(statusCode, client.getHttpResponse().getAllHeaders());
String requestURLString = request.url().toNormalform(true);
// check redirection
if (statusCode > 299 && statusCode < 310) {
final DigestURL redirectionUrl = extractRedirectURL(request, profile, url, client, statusCode,
responseHeader, requestURLString);
if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true)) {
// we have two use cases here: loading from a crawl or just
// loading the url. Check this:
if (profile != null && !CrawlSwitchboard.DEFAULT_PROFILES.contains(profile.name())) {
// put redirect url on the crawler queue to repeat a
// double-check
request.redirectURL(redirectionUrl);
this.sb.crawlStacker.stackCrawl(request);
// in the end we must throw an exception (even if this is
// not an error, just to abort the current process
throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " to "
+ redirectionUrl.toNormalform(false) + " placed on crawler queue for double-check");
}
// if we are already doing a shutdown we don't need to retry
// crawling
if (Thread.currentThread().isInterrupted()) {
this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile,
FailCategory.FINAL_LOAD_CONTEXT, "server shutdown", statusCode);
throw new IOException(
"CRAWLER Redirect of URL=" + requestURLString + " aborted because of server shutdown.$");
}
// retry crawling with new url
request.redirectURL(redirectionUrl);
return openInputStream(request, profile, retryCount - 1, maxFileSize, blacklistType, agent);
}
// we don't want to follow redirects
this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile,
FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode);
throw new IOException("REJECTED UNWANTED REDIRECTION '" + client.getHttpResponse().getStatusLine()
+ "' for URL '" + requestURLString + "'$");
} else if (statusCode == 200 || statusCode == 203) {
// the transfer is ok
/*
* When the content is not large (less than 1MB), it is better to cache it if caching is enabled and the url is not local
*/
long contentLength = client.getHttpResponse().getEntity().getContentLength();
if (profile != null && profile.storeHTCache() && contentLength > 0 && contentLength < (1024 * 1024) && !url.isLocal()) {
byte[] content = HTTPClient.getByteArray(client.getHttpResponse().getEntity(), maxFileSize);
try {
Cache.store(url, responseHeader, content);
} catch (final IOException e) {
this.log.warn("cannot write " + url + " to Cache (3): " + e.getMessage(), e);
}
return new ByteArrayInputStream(content);
}
/*
* Returns a HTTPInputStream delegating to
* client.getContentstream(). Close method will ensure client is
* properly closed.
*/
return new HTTPInputStream(client);
} else {
// if the response has not the right response type then reject file
this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile,
FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", statusCode);
throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine()
+ "' for URL '" + requestURLString + "'$");
}
}
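For reference, LoaderDispatcher (modified later in this commit) is the caller of this method; the retryCount argument bounds the recursive redirect handling above. Its call, with crawlProfile, maxFileSize, blacklistType and agent in scope:

// Caller side, as added in LoaderDispatcher.java below.
InputStream inStream = this.httpLoader.openInputStream(request, crawlProfile, 1, maxFileSize, blacklistType, agent);

Depending on the Content-Length, the returned stream is either a ByteArrayInputStream over a body that was also written to the HTCache (bodies under 1 MB), or an HTTPInputStream that keeps the connection open until it is closed.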
/**
* Extract the redirect URL from the response header. The status code is expected to be strictly between 299 and 310. Parameters must not be null.
* @return redirect URL
* @throws IOException when an error occurred
*/
private DigestURL extractRedirectURL(final Request request, CrawlProfile profile, DigestURL url,
final HTTPClient client, final int statusCode, final ResponseHeader responseHeader, String requestURLString)
throws IOException {
// read redirection URL
String redirectionUrlString = responseHeader.get(HeaderFramework.LOCATION);
redirectionUrlString = redirectionUrlString == null ? "" : redirectionUrlString.trim();
if (redirectionUrlString.isEmpty()) {
this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile,
FailCategory.TEMPORARY_NETWORK_FAILURE,
"no redirection url provided, field '" + HeaderFramework.LOCATION + "' is empty", statusCode);
throw new IOException("REJECTED EMTPY REDIRECTION '" + client.getHttpResponse().getStatusLine()
+ "' for URL '" + requestURLString + "'$");
}
// normalize URL
final DigestURL redirectionUrl = DigestURL.newURL(request.url(), redirectionUrlString);
// restart crawling with new url
this.log.info("CRAWLER Redirection detected ('" + client.getHttpResponse().getStatusLine() + "') for URL "
+ requestURLString);
this.log.info("CRAWLER ..Redirecting request to: " + redirectionUrl.toNormalform(false));
this.sb.webStructure.generateCitationReference(url, redirectionUrl);
if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_RECORD_REDIRECTS, true)) {
this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile,
FailCategory.FINAL_REDIRECT_RULE, "redirect to " + redirectionUrlString, statusCode);
}
return redirectionUrl;
}
/**
* Create request header for loading content.
* @param request search request
* @param agent agent identification information
* @return a request header
* @throws IOException when an error occurred
*/
private RequestHeader createRequestheader(final Request request, final ClientIdentification.Agent agent)
throws IOException {
final RequestHeader requestHeader = new RequestHeader();
requestHeader.put(HeaderFramework.USER_AGENT, agent.userAgent);
DigestURL refererURL = null;
if (request.referrerhash() != null) {
refererURL = this.sb.getURL(request.referrerhash());
}
if (refererURL != null) {
requestHeader.put(RequestHeader.REFERER, refererURL.toNormalform(true));
}
requestHeader.put(HeaderFramework.ACCEPT, this.sb.getConfig("crawler.http.accept", DEFAULT_ACCEPT));
requestHeader.put(HeaderFramework.ACCEPT_LANGUAGE,
this.sb.getConfig("crawler.http.acceptLanguage", DEFAULT_LANGUAGE));
requestHeader.put(HeaderFramework.ACCEPT_CHARSET,
this.sb.getConfig("crawler.http.acceptCharset", DEFAULT_CHARSET));
requestHeader.put(HeaderFramework.ACCEPT_ENCODING,
this.sb.getConfig("crawler.http.acceptEncoding", DEFAULT_ENCODING));
return requestHeader;
}
private Response load(final Request request, CrawlProfile profile, final int retryCount, final int maxFileSize, final BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
@ -112,15 +318,7 @@ public final class HTTPLoader {
Response response = null;
// create a request header
final RequestHeader requestHeader = new RequestHeader();
requestHeader.put(HeaderFramework.USER_AGENT, agent.userAgent);
DigestURL refererURL = null;
if (request.referrerhash() != null) refererURL = this.sb.getURL(request.referrerhash());
if (refererURL != null) requestHeader.put(RequestHeader.REFERER, refererURL.toNormalform(true));
requestHeader.put(HeaderFramework.ACCEPT, this.sb.getConfig("crawler.http.accept", DEFAULT_ACCEPT));
requestHeader.put(HeaderFramework.ACCEPT_LANGUAGE, this.sb.getConfig("crawler.http.acceptLanguage", DEFAULT_LANGUAGE));
requestHeader.put(HeaderFramework.ACCEPT_CHARSET, this.sb.getConfig("crawler.http.acceptCharset", DEFAULT_CHARSET));
requestHeader.put(HeaderFramework.ACCEPT_ENCODING, this.sb.getConfig("crawler.http.acceptEncoding", DEFAULT_ENCODING));
final RequestHeader requestHeader = createRequestheader(request, agent);
// HTTP-Client
final HTTPClient client = new HTTPClient(agent);
@ -137,27 +335,8 @@ public final class HTTPLoader {
// check redirection
if (statusCode > 299 && statusCode < 310) {
// read redirection URL
String redirectionUrlString = responseHeader.get(HeaderFramework.LOCATION);
redirectionUrlString = redirectionUrlString == null ? "" : redirectionUrlString.trim();
if (redirectionUrlString.isEmpty()) {
this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "no redirection url provided, field '" + HeaderFramework.LOCATION + "' is empty", statusCode);
throw new IOException("REJECTED EMTPY REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL '" + requestURLString + "'$");
}
// normalize URL
final DigestURL redirectionUrl = DigestURL.newURL(request.url(), redirectionUrlString);
// restart crawling with new url
this.log.info("CRAWLER Redirection detected ('" + client.getHttpResponse().getStatusLine() + "') for URL " + requestURLString);
this.log.info("CRAWLER ..Redirecting request to: " + redirectionUrl.toNormalform(false));
this.sb.webStructure.generateCitationReference(url, redirectionUrl);
if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_RECORD_REDIRECTS, true)) {
this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_REDIRECT_RULE, "redirect to " + redirectionUrlString, statusCode);
}
final DigestURL redirectionUrl = extractRedirectURL(request, profile, url, client, statusCode,
responseHeader, requestURLString);
if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true)) {
// we have two use cases here: loading from a crawl or just loading the url. Check this:

View File

@ -26,8 +26,10 @@
package net.yacy.repository;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.util.Arrays;
import java.util.Date;
@ -209,54 +211,9 @@ public final class LoaderDispatcher {
}
// check if we have the page in the cache
if (cacheStrategy != CacheStrategy.NOCACHE && crawlProfile != null) {
// we have passed a first test if caching is allowed
// now see if there is a cache entry
final ResponseHeader cachedResponse = (url.isLocal()) ? null : Cache.getResponseHeader(url.hash());
if (cachedResponse != null && Cache.hasContent(url.hash())) {
// yes we have the content
// create request header values and a response object because we need that
// in case that we want to return the cached content in the next step
final RequestHeader requestHeader = new RequestHeader();
requestHeader.put(HeaderFramework.USER_AGENT, agent.userAgent);
DigestURL refererURL = null;
if (request.referrerhash() != null) refererURL = this.sb.getURL(request.referrerhash());
if (refererURL != null) requestHeader.put(RequestHeader.REFERER, refererURL.toNormalform(true));
final Response response = new Response(
request,
requestHeader,
cachedResponse,
crawlProfile,
true,
null);
// check which caching strategy shall be used
if (cacheStrategy == CacheStrategy.IFEXIST || cacheStrategy == CacheStrategy.CACHEONLY) {
// well, just take the cache and don't care about freshness of the content
final byte[] content = Cache.getContent(url.hash());
if (content != null) {
LoaderDispatcher.log.info("cache hit/useall for: " + url.toNormalform(true));
response.setContent(content);
return response;
}
}
// now the cacheStrategy must be CACHE_STRATEGY_IFFRESH, that means we should do a proxy freshness test
//assert cacheStrategy == CacheStrategy.IFFRESH : "cacheStrategy = " + cacheStrategy;
if (response.isFreshForProxy()) {
final byte[] content = Cache.getContent(url.hash());
if (content != null) {
LoaderDispatcher.log.info("cache hit/fresh for: " + url.toNormalform(true));
response.setContent(content);
return response;
}
}
LoaderDispatcher.log.info("cache hit/stale for: " + url.toNormalform(true));
} else if (cachedResponse != null) {
LoaderDispatcher.log.warn("HTCACHE contained response header, but not content for url " + url.toNormalform(true));
}
Response response = loadFromCache(request, cacheStrategy, agent, url, crawlProfile);
if(response != null) {
return response;
}
// check case where we want results from the cache exclusively, and never from the Internet (offline mode)
@ -269,21 +226,7 @@ public final class LoaderDispatcher {
// check access time: this is a double-check (we checked possibly already in the balancer)
// to make sure that we don't DoS the target by mistake
if (!url.isLocal()) {
final Long lastAccess = accessTime.get(host);
long wait = 0;
if (lastAccess != null) wait = Math.max(0, agent.minimumDelta + lastAccess.longValue() - System.currentTimeMillis());
if (wait > 0) {
// force a sleep here. Instead just sleep we clean up the accessTime map
final long untilTime = System.currentTimeMillis() + wait;
cleanupAccessTimeTable(untilTime);
if (System.currentTimeMillis() < untilTime) {
long frcdslp = untilTime - System.currentTimeMillis();
LoaderDispatcher.log.info("Forcing sleep of " + frcdslp + " ms for host " + host);
try {Thread.sleep(frcdslp);} catch (final InterruptedException ee) {}
}
}
}
checkAccessTime(agent, url);
// now it's for sure that we will access the target. Remember the access time
if (host != null) {
@ -292,7 +235,6 @@ public final class LoaderDispatcher {
}
// load resource from the internet
Response response = null;
if (protocol.equals("http") || protocol.equals("https")) {
response = this.httpLoader.load(request, crawlProfile, maxFileSize, blacklistType, agent);
} else if (protocol.equals("ftp")) {
@ -331,6 +273,167 @@ public final class LoaderDispatcher {
return response;
}
/**
* Try loading requested resource from cache according to cache strategy
* @param request request to resource
* @param cacheStrategy cache strategy to use
* @param agent agent identifier
* @param url resource url
* @param crawlProfile crawl profile
* @return a Response instance when resource could be loaded from cache, or null.
* @throws IOException when an error occurred
*/
private Response loadFromCache(final Request request, CacheStrategy cacheStrategy, ClientIdentification.Agent agent,
final DigestURL url, final CrawlProfile crawlProfile) throws IOException {
Response response = null;
if (cacheStrategy != CacheStrategy.NOCACHE && crawlProfile != null) {
// we have passed a first test if caching is allowed
// now see if there is a cache entry
final ResponseHeader cachedResponse = (url.isLocal()) ? null : Cache.getResponseHeader(url.hash());
if (cachedResponse != null && Cache.hasContent(url.hash())) {
// yes we have the content
// create request header values and a response object because we need that
// in case that we want to return the cached content in the next step
final RequestHeader requestHeader = new RequestHeader();
requestHeader.put(HeaderFramework.USER_AGENT, agent.userAgent);
DigestURL refererURL = null;
if (request.referrerhash() != null) refererURL = this.sb.getURL(request.referrerhash());
if (refererURL != null) requestHeader.put(RequestHeader.REFERER, refererURL.toNormalform(true));
response = new Response(
request,
requestHeader,
cachedResponse,
crawlProfile,
true,
null);
// check which caching strategy shall be used
if (cacheStrategy == CacheStrategy.IFEXIST || cacheStrategy == CacheStrategy.CACHEONLY) {
// well, just take the cache and don't care about freshness of the content
final byte[] content = Cache.getContent(url.hash());
if (content != null) {
LoaderDispatcher.log.info("cache hit/useall for: " + url.toNormalform(true));
response.setContent(content);
return response;
}
}
// now the cacheStrategy must be CACHE_STRATEGY_IFFRESH, that means we should do a proxy freshness test
//assert cacheStrategy == CacheStrategy.IFFRESH : "cacheStrategy = " + cacheStrategy;
if (response.isFreshForProxy()) {
final byte[] content = Cache.getContent(url.hash());
if (content != null) {
LoaderDispatcher.log.info("cache hit/fresh for: " + url.toNormalform(true));
response.setContent(content);
return response;
}
}
LoaderDispatcher.log.info("cache hit/stale for: " + url.toNormalform(true));
} else if (cachedResponse != null) {
LoaderDispatcher.log.warn("HTCACHE contained response header, but not content for url " + url.toNormalform(true));
}
}
return response;
}
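Condensed, the strategy handling above amounts to the following decision (a restatement for clarity only, assuming CacheStrategy is the enum imported by this class and response is the cached Response built above):

// Restatement of the cache decision in loadFromCache (illustrative only).
boolean useCachedContent;
switch (cacheStrategy) {
case IFEXIST:
case CACHEONLY:
    useCachedContent = true; // take the cached content regardless of its age
    break;
case IFFRESH:
    useCachedContent = response.isFreshForProxy(); // proxy-style freshness test
    break;
default: // NOCACHE
    useCachedContent = false; // never look at the cache
    break;
}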
/**
* Open an InputStream on a resource from the web, from ftp, from smb or a file
* @param request the request essentials
* @param cacheStrategy strategy according to NOCACHE, IFFRESH, IFEXIST, CACHEONLY
* @return an open InputStream. Don't forget to close it once used!
* @throws IOException when url is malformed, blacklisted, or CacheStrategy is CACHEONLY and content is unavailable
*/
private InputStream openInputStreamInternal(final Request request, CacheStrategy cacheStrategy, final int maxFileSize, final BlacklistType blacklistType, ClientIdentification.Agent agent) throws IOException {
// get the protocol of the next URL
final DigestURL url = request.url();
if (url.isFile() || url.isSMB()) {
cacheStrategy = CacheStrategy.NOCACHE; // load just from the file
// system
}
final String protocol = url.getProtocol();
final String host = url.getHost();
final CrawlProfile crawlProfile = request.profileHandle() == null ? null : this.sb.crawler.get(UTF8.getBytes(request.profileHandle()));
// check if url is in blacklist
if (blacklistType != null && host != null && Switchboard.urlBlacklist.isListed(blacklistType, host.toLowerCase(), url.getFile())) {
this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), crawlProfile, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
throw new IOException("DISPATCHER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.$");
}
// check if we have the page in the cache
Response cachedResponse = loadFromCache(request, cacheStrategy, agent, url, crawlProfile);
if(cachedResponse != null) {
return new ByteArrayInputStream(cachedResponse.getContent());
}
// check case where we want results from the cache exclusively, and never from the Internet (offline mode)
if (cacheStrategy == CacheStrategy.CACHEONLY) {
// we had a chance to get the content from the cache .. its over. We don't have it.
throw new IOException("cache only strategy");
}
// now forget about the cache, nothing there. Try to load the content from the Internet
// check access time: this is a double-check (we checked possibly already in the balancer)
// to make sure that we don't DoS the target by mistake
checkAccessTime(agent, url);
// now it's for sure that we will access the target. Remember the access time
if (host != null) {
if (accessTime.size() > accessTimeMaxsize) accessTime.clear(); // prevent a memory leak here
accessTime.put(host, System.currentTimeMillis());
}
// load resource from the internet
InputStream inStream = null;
if (protocol.equals("http") || protocol.equals("https")) {
inStream = this.httpLoader.openInputStream(request, crawlProfile, 1, maxFileSize, blacklistType, agent);
} else if (protocol.equals("ftp") || protocol.equals("smb") || protocol.equals("file")) {
// may also open directly stream with ftp loader
inStream = url.getInputStream(agent, null, null);
} else {
throw new IOException("Unsupported protocol '" + protocol + "' in url " + url);
}
if (inStream == null) {
throw new IOException("Unable to open content stream");
}
return inStream;
}
/**
* Check access time: this is a double-check (we checked possibly already in the balancer)
* to make sure that we don't DoS the target by mistake
* @param agent agent identifier
* @param url target url
*/
private void checkAccessTime(ClientIdentification.Agent agent, final DigestURL url) {
if (!url.isLocal()) {
String host = url.getHost();
final Long lastAccess = accessTime.get(host);
long wait = 0;
if (lastAccess != null)
wait = Math.max(0, agent.minimumDelta + lastAccess.longValue() - System.currentTimeMillis());
if (wait > 0) {
// force a sleep here. Instead of just sleeping we clean up the
// accessTime map
final long untilTime = System.currentTimeMillis() + wait;
cleanupAccessTimeTable(untilTime);
if (System.currentTimeMillis() < untilTime) {
long frcdslp = untilTime - System.currentTimeMillis();
LoaderDispatcher.log.info("Forcing sleep of " + frcdslp + " ms for host " + host);
try {
Thread.sleep(frcdslp);
} catch (final InterruptedException ee) {
}
}
}
}
}
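The wait computed above is max(0, agent.minimumDelta + lastAccess - now). A worked example with illustrative values:

// Worked example of the politeness delay (illustrative values).
long minimumDelta = 500;                                // the agent must leave 500 ms between hits on one host
long lastAccess = System.currentTimeMillis() - 200;     // the host was last contacted 200 ms ago
long wait = Math.max(0, minimumDelta + lastAccess - System.currentTimeMillis()); // about 300 ms left to wait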
private int protocolMaxFileSize(final DigestURL url) {
if (url.isHTTP() || url.isHTTPS())
return this.sb.getConfigInt("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
@ -357,6 +460,53 @@ public final class LoaderDispatcher {
// read resource body (if it is there)
return entry.getContent();
}
/**
* Open url as InputStream from the web or the cache
* @param request must not be null
* @param cacheStrategy cache strategy to use
* @param blacklistType black list
* @param agent agent identification for HTTP requests
* @return an open InputStream on content. Don't forget to close it once used.
* @throws IOException when url is malformed or blacklisted
*/
public InputStream openInputStream(final Request request, final CacheStrategy cacheStrategy,
BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
final int maxFileSize = protocolMaxFileSize(request.url());
InputStream stream = null;
Semaphore check = this.loaderSteering.get(request.url());
if (check != null && cacheStrategy != CacheStrategy.NOCACHE) {
// a loading process is going on for that url
long t = System.currentTimeMillis();
try {
check.tryAcquire(5, TimeUnit.SECONDS);
} catch (final InterruptedException e) {
}
ConcurrentLog.info("LoaderDispatcher",
"waited " + (System.currentTimeMillis() - t) + " ms for " + request.url().toNormalform(true));
// now the process may have terminated and we run a normal loading
// which may be successful faster because of a cache hit
}
this.loaderSteering.put(request.url(), new Semaphore(0));
try {
stream = openInputStreamInternal(request, cacheStrategy, maxFileSize, blacklistType, agent);
} catch(IOException ioe) {
/* Do not re-encapsulate an IOException in another IOException */
throw ioe;
} catch (final Throwable e) {
throw new IOException(e);
} finally {
// release the semaphore anyway
check = this.loaderSteering.remove(request.url());
if (check != null) {
check.release(1000); // don't block any other
}
}
return stream;
}
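The first caller of this new method is the ViewImage servlet shown at the top of this commit; its call looks like the line below. The loaderSteering semaphore above makes a concurrent request for the same URL wait up to 5 seconds, so that a load already in progress can finish (and possibly populate the cache) before the new one starts.

// Caller side, as added in ViewImage.java earlier in this commit.
InputStream inStream = loader.openInputStream(loader.request(url, false, true), CacheStrategy.IFEXIST, BlacklistType.SEARCH, agent);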
public Document[] loadDocuments(final Request request, final CacheStrategy cacheStrategy, final int maxFileSize, BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException, Parser.Failure {

View File

@ -8,6 +8,9 @@ import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import javax.imageio.ImageIO;
import javax.imageio.stream.ImageInputStream;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.peers.graphics.EncodedImage;
import net.yacy.server.serverObjects;
@ -75,8 +78,9 @@ public class ViewImagePerfTest extends ViewImageTest {
}
/**
* Process inFile image, update processedFiles list and failures map, and
* append measurements to results_perfs.txt. All parameters must not be
* null.
*
* @param ext
* output encoding image format
@ -92,7 +96,7 @@ public class ViewImagePerfTest extends ViewImageTest {
* when a read/write error occurred
*/
@Override
protected void processFile(String ext, File outDir, serverObjects post, Map<String, Exception> failures,
protected void processFile(String ext, File outDir, serverObjects post, Map<String, Throwable> failures,
File inFile) throws IOException {
/* Delete any previous result file */
System.out
@ -102,43 +106,43 @@ public class ViewImagePerfTest extends ViewImageTest {
outFile.delete();
}
byte[] resourceb = getBytes(inFile);
String urlString = inFile.getAbsolutePath();
EncodedImage img = null;
Exception error = null;
long beginTime = System.nanoTime(), time, minTime = Long.MAX_VALUE, maxTime = 0, meanTime = 0, totalTime = 0;
int step = 0;
for (step = 0; (totalTime / 1000000000) < this.minMeasureTime; step++) {
beginTime = System.nanoTime();
ImageInputStream inStream = ImageIO.createImageInputStream(inFile);
try {
img = ViewImage.parseAndScale(post, true, urlString, ext, inStream);
} catch (Exception e) {
error = e;
}
time = System.nanoTime() - beginTime;
minTime = Math.min(minTime, time);
maxTime = Math.max(maxTime, time);
totalTime += time;
}
if (step > 0) {
meanTime = totalTime / step;
} else {
meanTime = totalTime;
}
PrintWriter resultsWriter = new PrintWriter(new FileWriter(new File(outDir, "results_perfs.txt"), true));
try {
long beginTime, time, minTime = Long.MAX_VALUE, maxTime = 0, meanTime = 0, totalTime = 0;
int step = 0;
for (step = 0; (totalTime / 1000000000) < this.minMeasureTime; step++) {
beginTime = System.nanoTime();
img = ViewImage.parseAndScale(post, true, urlString, ext, false, resourceb);
time = System.nanoTime() - beginTime;
if (img == null) {
break;
}
minTime = Math.min(minTime, time);
maxTime = Math.max(maxTime, time);
totalTime += time;
writeMessage("Measured ViewImage render with file : " + inFile.getAbsolutePath() + " encoded To : " + ext,
resultsWriter);
if(img == null) {
writeMessage("Image could not be rendered! Measurement show time needed to read and parse image data until error detection.", resultsWriter);
}
if (img == null) {
System.out.println("Image could not be rendered!");
} else {
meanTime = totalTime / step;
PrintWriter resultsWriter = new PrintWriter(new FileWriter(new File(outDir, "results_perfs.txt"), true));
try {
writeMessage("Measured ViewImage render with file : " + inFile.getAbsolutePath() + " encoded To : "
+ ext, resultsWriter);
writeMessage("Render total time (ms) : " + (totalTime) / 1000000 + " on " + step + " steps.",
resultsWriter);
writeMessage("Render mean time (ms) : " + (meanTime) / 1000000, resultsWriter);
writeMessage("Render min time (ms) : " + (minTime) / 1000000, resultsWriter);
writeMessage("Render max time (ms) : " + (maxTime) / 1000000, resultsWriter);
} finally {
resultsWriter.close();
}
}
} catch (Exception e) {
error = e;
writeMessage("Render total time (ms) : " + (totalTime) / 1000000 + " on " + step + " steps.",
resultsWriter);
writeMessage("Render mean time (ms) : " + (meanTime) / 1000000, resultsWriter);
writeMessage("Render min time (ms) : " + (minTime) / 1000000, resultsWriter);
writeMessage("Render max time (ms) : " + (maxTime) / 1000000, resultsWriter);
} finally {
resultsWriter.close();
}
if (img == null) {
@ -218,7 +222,7 @@ public class ViewImagePerfTest extends ViewImageTest {
System.out.println("Rendered images will be written in dir : " + outDir.getAbsolutePath());
List<File> processedFiles = new ArrayList<File>();
Map<String, Exception> failures = new TreeMap<>();
Map<String, Throwable> failures = new TreeMap<>();
try {
long time = System.nanoTime();
test.processFiles(ext, recursive, outDir, post, inFiles, processedFiles, failures);

View File

@ -1,9 +1,7 @@
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import java.net.URL;
import java.util.ArrayList;
@ -12,6 +10,9 @@ import java.util.Map;
import java.util.Map.Entry;
import java.util.TreeMap;
import javax.imageio.ImageIO;
import javax.imageio.stream.ImageInputStream;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.peers.graphics.EncodedImage;
import net.yacy.server.serverObjects;
@ -52,24 +53,6 @@ public class ViewImageTest {
/** Default output encoding format */
private static final String DEFAULT_OUT_EXT = "png";
/**
* @param testFile
* file to load
* @return testFile content as a bytes array
* @throws IOException
* when an error occurred while loading
*/
protected byte[] getBytes(File testFile) throws IOException {
InputStream inStream = new FileInputStream(testFile);
byte[] res = new byte[inStream.available()];
try {
inStream.read(res);
} finally {
inStream.close();
}
return res;
}
/**
* @param args
* main parameters. first item may contain input file or folder
@ -207,7 +190,7 @@ public class ViewImageTest {
* @param processedFiles
* all processed image files
* @param failures
* maps each failed input file url to the error that caused the failure
* @param time
* total processing time in nanoseconds
* @param outDir
@ -215,7 +198,7 @@ public class ViewImageTest {
* @throws IOException
* when a write error occured writing the results file
*/
protected void displayResults(List<File> processedFiles, Map<String, Exception> failures, long time, File outDir)
protected void displayResults(List<File> processedFiles, Map<String, Throwable> failures, long time, File outDir)
throws IOException {
PrintWriter resultsWriter = new PrintWriter(new FileWriter(new File(outDir, "results.txt")));
try {
@ -226,7 +209,7 @@ public class ViewImageTest {
} else {
writeMessage("Some input files could not be processed :", resultsWriter);
}
for (Entry<String, Exception> entry : failures.entrySet()) {
for (Entry<String, Throwable> entry : failures.entrySet()) {
writeMessage(entry.getKey(), resultsWriter);
if (entry.getValue() != null) {
writeMessage("cause : " + entry.getValue(), resultsWriter);
@ -266,7 +249,7 @@ public class ViewImageTest {
* when a read/write error occurred
*/
protected void processFiles(String ext, boolean recursive, File outDir, serverObjects post, File[] inFiles,
List<File> processedFiles, Map<String, Exception> failures) throws IOException {
List<File> processedFiles, Map<String, Throwable> failures) throws IOException {
for (File inFile : inFiles) {
if (inFile.isDirectory()) {
if (recursive) {
@ -291,7 +274,7 @@ public class ViewImageTest {
* @param inFile file image to process
* @throws IOException when a read/write error occurred
*/
protected void processFile(String ext, File outDir, serverObjects post, Map<String, Exception> failures, File inFile)
protected void processFile(String ext, File outDir, serverObjects post, Map<String, Throwable> failures, File inFile)
throws IOException {
/* Delete any previous result file */
File outFile = new File(outDir, inFile.getName() + "." + ext);
@ -299,13 +282,13 @@ public class ViewImageTest {
outFile.delete();
}
byte[] resourceb = getBytes(inFile);
ImageInputStream inStream = ImageIO.createImageInputStream(inFile);
String urlString = inFile.getAbsolutePath();
EncodedImage img = null;
Exception error = null;
Throwable error = null;
try {
img = ViewImage.parseAndScale(post, true, urlString, ext, false, resourceb);
} catch (Exception e) {
img = ViewImage.parseAndScale(post, true, urlString, ext, inStream);
} catch (Throwable e) {
error = e;
}
@ -383,7 +366,7 @@ public class ViewImageTest {
System.out.println("Rendered images will be written in dir : " + outDir.getAbsolutePath());
List<File> processedFiles = new ArrayList<File>();
Map<String, Exception> failures = new TreeMap<>();
Map<String, Throwable> failures = new TreeMap<>();
try {
long time = System.nanoTime();
test.processFiles(ext, recursive, outDir, post, inFiles, processedFiles, failures);