mirror of https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00

Process large or local file images by dealing directly with the content InputStream.

This commit is contained in:
parent 3c4c77099d
commit f01d49c37a
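
The commit replaces whole-resource byte[] buffering with streaming reads. A rough standalone sketch of the pattern adopted below, using only the standard javax.imageio API (the file/stream split mirrors the diff; class and method names here are illustrative):

import java.awt.Image;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;

import javax.imageio.ImageIO;
import javax.imageio.stream.ImageInputStream;

public class StreamedImageRead {

    /** Reads an image without first buffering the whole resource in a byte array. */
    static Image read(File localFile, InputStream remoteContent) throws IOException {
        final ImageInputStream iis;
        if (localFile != null) {
            // local file: let ImageIO open its stream directly on the file
            iis = ImageIO.createImageInputStream(localFile);
        } else {
            // remote resource: wrap the already-open content stream
            iis = ImageIO.createImageInputStream(remoteContent);
        }
        // on success this closes iis, but never a wrapped source InputStream
        return ImageIO.read(iis);
    }
}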

@@ -28,10 +28,13 @@ import java.awt.Image;
import java.awt.MediaTracker;
import java.awt.image.BufferedImage;
import java.awt.image.Raster;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Map;

import javax.imageio.ImageIO;
import javax.imageio.stream.ImageInputStream;

import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.yacy.CacheStrategy;

@@ -42,11 +45,11 @@ import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.storage.ConcurrentARC;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.data.URLLicense;
import net.yacy.document.ImageParser;
import net.yacy.kelondro.util.MemoryControl;
import net.yacy.kelondro.workflow.WorkflowProcessor;
import net.yacy.peers.graphics.EncodedImage;
import net.yacy.repository.Blacklist.BlacklistType;
import net.yacy.repository.LoaderDispatcher;
import net.yacy.search.Switchboard;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;

@@ -74,8 +77,8 @@ public class ViewImage {
 * when specified url is malformed, or a read/write error
 * occurred, or input or target image format is not supported.
 * Should end in a HTTP 500 error whose processing is more
 * consistent across browsers than a response with zero
 * content bytes.
 * consistent across browsers than a response with zero content
 * bytes.
 */
public static Object respond(final RequestHeader header, final serverObjects post, final serverSwitch env)
        throws IOException {

@@ -113,39 +116,81 @@ public class ViewImage {
if (image != null) {
    encodedImage = new EncodedImage(image, ext, post.getBoolean("isStatic"));
} else {
    byte[] resourceb = null;
    if (url != null)
        try {
            String agentName = post.get("agentName", auth ? ClientIdentification.yacyIntranetCrawlerAgentName
                    : ClientIdentification.yacyInternetCrawlerAgentName);
            ClientIdentification.Agent agent = ClientIdentification.getAgent(agentName);
            resourceb = sb.loader.loadContent(sb.loader.request(url, false, true), CacheStrategy.IFEXIST,
                    BlacklistType.SEARCH, agent);
        } catch (final IOException e) {
            ConcurrentLog.fine("ViewImage", "cannot load: " + e.getMessage());
            throw e;
        }
    boolean okToCache = true;
    if (resourceb == null) {
        /*
         * Throw an exception, which will end in a HTTP 500 response,
         * better handled by browsers than an empty image
         */
        throw new IOException("Image could not be loaded.");
    }

    String urlExt = MultiProtocolURL.getFileExtension(url.getFileName());
    if (ext != null && ext.equalsIgnoreCase(urlExt) && isBrowserRendered(urlExt)) {
        return new ByteArrayInputStream(resourceb);
        return openInputStream(post, sb.loader, auth, url);
    }

    // read image
    encodedImage = parseAndScale(post, auth, urlString, ext, okToCache, resourceb);
    ImageInputStream imageInStream = null;
    InputStream inStream = null;
    /*
     * When opening a file, the most efficient way is to open the
     * ImageInputStream directly on the file
     */
    if (url.isFile()) {
        imageInStream = ImageIO.createImageInputStream(url.getFSFile());
    } else {
        inStream = openInputStream(post, sb.loader, auth, url);
        imageInStream = ImageIO.createImageInputStream(inStream);
    }
    try {
        // read image
        encodedImage = parseAndScale(post, auth, urlString, ext, imageInStream);
    } finally {
        /*
         * The imageInStream.close() method doesn't close the source input
         * stream
         */
        if (inStream != null) {
            try {
                inStream.close();
            } catch (IOException ignored) {
            }
        }
    }
}

return encodedImage;
}
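
The finally block above exists because javax.imageio never closes a wrapped source stream: on a successful read, ImageIO.read closes the ImageInputStream itself, but the InputStream it wraps stays open (and on a failed read even the ImageInputStream stays open, which parseAndScale below handles). A minimal, self-contained illustration of the same close discipline:

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import javax.imageio.ImageIO;
import javax.imageio.stream.ImageInputStream;

public class CloseDiscipline {
    public static void main(String[] args) throws IOException {
        InputStream in = new FileInputStream(args[0]); // source stream owned by this caller
        try {
            ImageInputStream iis = ImageIO.createImageInputStream(in);
            System.out.println(ImageIO.read(iis)); // reads the image; closes iis only on success
        } finally {
            in.close(); // the wrapped source must still be closed by its owner
        }
    }
}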

/**
 * Open input stream on image url using provided loader. All parameters must
 * not be null.
 *
 * @param post
 *            post parameters.
 * @param loader
 *            resources loader.
 * @param auth
 *            true when user has credentials to load full images.
 * @param url
 *            image url.
 * @return an open input stream instance (don't forget to close it).
 * @throws IOException
 *             when a read/write error occurred.
 */
private static InputStream openInputStream(final serverObjects post, final LoaderDispatcher loader,
        final boolean auth, DigestURL url) throws IOException {
    InputStream inStream = null;
    if (url != null) {
        try {
            String agentName = post.get("agentName", auth ? ClientIdentification.yacyIntranetCrawlerAgentName
                    : ClientIdentification.yacyInternetCrawlerAgentName);
            ClientIdentification.Agent agent = ClientIdentification.getAgent(agentName);
            inStream = loader.openInputStream(loader.request(url, false, true), CacheStrategy.IFEXIST,
                    BlacklistType.SEARCH, agent);
        } catch (final IOException e) {
            ConcurrentLog.fine("ViewImage", "cannot load: " + e.getMessage());
            throw e;
        }
    }
    if (inStream == null) {
        throw new IOException("Input stream could not be opened");
    }
    return inStream;
}
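
The "don't forget to close it" contract pairs naturally with try-with-resources on the caller side. A hypothetical sketch (openImage stands in for a helper like the one above; it is not YaCy API):

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;

public class CallerCloses {

    // stand-in for a helper that returns an open stream the caller must close
    static InputStream openImage() throws IOException {
        return new ByteArrayInputStream(new byte[] { (byte) 0x89, 'P', 'N', 'G' });
    }

    public static void main(String[] args) throws IOException {
        try (InputStream in = openImage()) { // try-with-resources guarantees close()
            System.out.println("first byte: " + in.read());
        }
    }
}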

/**
 * @param formatName
 *            informal file format name. For example : "png".

@@ -165,31 +210,35 @@ public class ViewImage {
}

/**
 * Process resourceb byte array to try to produce an EncodedImage instance
 * eventually scaled and cropped depending on post parameters.
 * Process source image to try to produce an EncodedImage instance
 * eventually scaled and clipped depending on post parameters. When
 * processed, imageInStream is closed.
 *
 * @param post
 *            request post parameters. Must not be null.
 * @param auth
 *            true when access rights are OK.
 * @param urlString
 *            image source URL. Must not be null.
 *            image source URL as String. Must not be null.
 * @param ext
 *            image file extension. May be null.
 * @param okToCache
 *            true when image can be cached
 * @param resourceb
 *            byte array. Must not be null.
 *            target image file format. May be null.
 * @param imageInStream
 *            open stream on image content. Must not be null.
 * @return an EncodedImage instance.
 * @throws IOException
 *             when image could not be parsed or encoded to specified format
 */
protected static EncodedImage parseAndScale(serverObjects post, boolean auth, String urlString, String ext,
        boolean okToCache, byte[] resourceb) throws IOException {
        ImageInputStream imageInStream) throws IOException {
    EncodedImage encodedImage = null;

    Image image = ImageParser.parse(urlString, resourceb);
    Image image = ImageIO.read(imageInStream);
    if (image == null) {
        try {
            /* When a null image is returned, we have to close the stream */
            imageInStream.close();
        } catch (IOException ignoredException) {
        }
        /*
         * Throw an exception, which will end in a HTTP 500 response, better
         * handled by browsers than an empty image

@@ -197,53 +246,52 @@ public class ViewImage {
        throw new IOException("Image format is not supported.");
    }

    if (image != null) {
        int maxwidth = post.getInt("maxwidth", 0);
        int maxheight = post.getInt("maxheight", 0);
        final boolean quadratic = post.containsKey("quadratic");
        boolean isStatic = post.getBoolean("isStatic");
        if (!auth || maxwidth != 0 || maxheight != 0) {
    int maxwidth = post.getInt("maxwidth", 0);
    int maxheight = post.getInt("maxheight", 0);
    final boolean quadratic = post.containsKey("quadratic");
    boolean isStatic = post.getBoolean("isStatic");
    if (!auth || maxwidth != 0 || maxheight != 0) {

            // find original size
            int h = image.getHeight(null);
            int w = image.getWidth(null);
        // find original size
        final int originWidth = image.getWidth(null);
        final int originHeigth = image.getHeight(null);

            // in case of not-authorized access shrink the image to prevent
            // copyright problems, so that images are not larger than thumbnails
            Dimension maxDimensions = calculateMaxDimensions(auth, w, h, maxwidth, maxheight);
        // in case of not-authorized access shrink the image to prevent
        // copyright problems, so that images are not larger than thumbnails
        Dimension maxDimensions = calculateMaxDimensions(auth, originWidth, originHeigth, maxwidth, maxheight);

            // if a quadratic flag is set, we cut the image out to be in
            // quadratic shape
            if (quadratic && w != h) {
                image = makeSquare(image, h, w);
                h = image.getHeight(null);
                w = image.getWidth(null);
            }

            Dimension finalDimensions = calculateDimensions(w, h, maxDimensions);

            if (w != finalDimensions.width && h != finalDimensions.height) {
                image = scale(finalDimensions.width, finalDimensions.height, image);
            }

            if ((finalDimensions.width == 16) && (finalDimensions.height == 16) && okToCache) {
                // this might be a favicon, store image to cache for faster
                // re-load later on
                iconcache.put(urlString, image);
            }
        // if a quadratic flag is set, we cut the image out to be in
        // quadratic shape
        int w = originWidth;
        int h = originHeigth;
        if (quadratic && originWidth != originHeigth) {
            image = makeSquare(image, originHeigth, originWidth);
            h = image.getHeight(null);
            w = image.getWidth(null);
        }
        /*
         * An error can still occur when transcoding from buffered image to
         * target ext : in that case return null
         */
        encodedImage = new EncodedImage(image, ext, isStatic);
        if (encodedImage.getImage().length() == 0) {
            throw new IOException("Image could not be encoded to format : " + ext);

        Dimension finalDimensions = calculateDimensions(w, h, maxDimensions);

        if (w != finalDimensions.width && h != finalDimensions.height) {
            image = scale(finalDimensions.width, finalDimensions.height, image);
        }

        if (finalDimensions.width == 16 && finalDimensions.height == 16) {
            // this might be a favicon, store image to cache for faster
            // re-load later on
            iconcache.put(urlString, image);
        }
    }
    /*
     * An error can still occur when transcoding from buffered image to
     * target ext : in that case return null
     */
    encodedImage = new EncodedImage(image, ext, isStatic);
    if (encodedImage.getImage().length() == 0) {
        throw new IOException("Image could not be encoded to format : " + ext);
    }
    return encodedImage;
}
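
Unlike many decoders, ImageIO.read signals an unsupported format by returning null rather than throwing, which is why parseAndScale above converts the null into an explicit IOException (and closes the stream itself, since ImageIO does not close it in the null case). A minimal demonstration in plain Java:

import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.IOException;

import javax.imageio.ImageIO;

public class NullMeansUnsupported {
    public static void main(String[] args) throws IOException {
        byte[] notAnImage = "plain text, no known image signature".getBytes("UTF-8");
        BufferedImage img = ImageIO.read(new ByteArrayInputStream(notAnImage));
        if (img == null) {
            // mirror the servlet's behavior: map the null to an explicit error
            throw new IOException("Image format is not supported.");
        }
    }
}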

source/net/yacy/cora/util/HTTPInputStream.java (new executable file, 125 lines)

@@ -0,0 +1,125 @@
/**
 * HTTPInputStream
 * Copyright 2014 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
 * First published 26.11.2014 on http://yacy.net
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program in the file lgpl21.txt
 * If not, see <http://www.gnu.org/licenses/>.
 */

package net.yacy.cora.util;

import java.io.IOException;
import java.io.InputStream;

import net.yacy.cora.protocol.http.HTTPClient;

/**
 * An HTTP InputStream delegating to HTTPClient. Use it when streaming HTTP content, so that closing the stream also finishes the HTTP client.
 * @author luc
 *
 */
public class HTTPInputStream extends InputStream {

    /** HTTP client */
    private HTTPClient httpClient;

    /** Encapsulated HTTP content stream */
    private InputStream contentStream;

    /**
     * Constructs from a httpClient.
     * @param httpClient a httpClient with accessible stream content.
     * @throws IOException when the content stream cannot be opened on httpClient
     */
    public HTTPInputStream(HTTPClient httpClient) throws IOException {
        if (httpClient == null) {
            throw new IllegalArgumentException("httpClient is null");
        }
        this.httpClient = httpClient;
        this.contentStream = httpClient.getContentstream();
        if (this.contentStream == null) {
            throw new IOException("content stream is null");
        }
    }

    /**
     * Properly close the HTTP connection with httpClient
     */
    @Override
    public void close() throws IOException {
        httpClient.finish();
    }

    @Override
    public int read() throws IOException {
        return contentStream.read();
    }

    @Override
    public int hashCode() {
        return contentStream.hashCode();
    }

    @Override
    public int read(byte[] b) throws IOException {
        return contentStream.read(b);
    }

    @Override
    public boolean equals(Object obj) {
        return contentStream.equals(obj);
    }

    @Override
    public int read(byte[] b, int off, int len) throws IOException {
        return contentStream.read(b, off, len);
    }

    @Override
    public long skip(long n) throws IOException {
        return contentStream.skip(n);
    }

    @Override
    public String toString() {
        return contentStream.toString();
    }

    @Override
    public int available() throws IOException {
        return contentStream.available();
    }

    @Override
    public synchronized void mark(int readlimit) {
        contentStream.mark(readlimit);
    }

    @Override
    public synchronized void reset() throws IOException {
        contentStream.reset();
    }

    @Override
    public boolean markSupported() {
        return contentStream.markSupported();
    }

}
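
For illustration, a minimal usage sketch of the new class, relying only on YaCy calls that appear elsewhere in this diff (the URL and buffer handling are illustrative):

import java.io.IOException;
import java.io.InputStream;

import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.cora.util.HTTPInputStream;

public class HTTPInputStreamDemo {
    public static void main(String[] args) throws IOException {
        ClientIdentification.Agent agent = ClientIdentification
                .getAgent(ClientIdentification.yacyInternetCrawlerAgentName);
        HTTPClient client = new HTTPClient(agent);
        client.GET(new DigestURL("http://example.com/image.png"), false);
        InputStream in = new HTTPInputStream(client); // throws if no content stream is available
        try {
            byte[] buffer = new byte[8192];
            while (in.read(buffer) != -1) {
                // consume the streamed body chunk by chunk
            }
        } finally {
            in.close(); // finishes the underlying HTTPClient and releases the connection
        }
    }
}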

@@ -24,7 +24,9 @@
package net.yacy.crawler.retrieval;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;

import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.solr.FailCategory;

@@ -34,7 +36,9 @@ import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.HTTPInputStream;
import net.yacy.crawler.CrawlSwitchboard;
import net.yacy.crawler.data.Cache;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.Latency;
import net.yacy.kelondro.io.ByteCount;

@@ -75,6 +79,208 @@ public final class HTTPLoader {
    Latency.updateAfterLoad(entry.url(), System.currentTimeMillis() - start);
    return doc;
}

/**
 * Open an input stream on a requested HTTP resource. When the resource is small, fully load it and return a ByteArrayInputStream instance.
 * @param request
 * @param profile crawl profile
 * @param retryCount remaining redirect retries count
 * @param maxFileSize max file size to load. -1 means no limit.
 * @param blacklistType blacklist type to use
 * @param agent agent identifier
 * @return an open input stream. Don't forget to close it.
 * @throws IOException when an error occurred
 */
public InputStream openInputStream(final Request request, CrawlProfile profile, final int retryCount,
        final int maxFileSize, final BlacklistType blacklistType, final ClientIdentification.Agent agent)
        throws IOException {
    if (retryCount < 0) {
        this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile,
                FailCategory.TEMPORARY_NETWORK_FAILURE, "retry counter exceeded", -1);
        throw new IOException(
                "retry counter exceeded for URL " + request.url().toString() + ". Processing aborted.$");
    }
    DigestURL url = request.url();

    final String host = url.getHost();
    if (host == null || host.length() < 2) {
        throw new IOException("host is not well-formed: '" + host + "'");
    }
    final String path = url.getFile();
    int port = url.getPort();
    final boolean ssl = url.getProtocol().equals("https");
    if (port < 0)
        port = (ssl) ? 443 : 80;

    // check if url is in blacklist
    final String hostlow = host.toLowerCase();
    if (blacklistType != null && Switchboard.urlBlacklist.isListed(blacklistType, hostlow, path)) {
        this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_LOAD_CONTEXT,
                "url in blacklist", -1);
        throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.$");
    }

    // resolve yacy and yacyh domains
    final AlternativeDomainNames yacyResolver = this.sb.peers;
    if (yacyResolver != null) {
        final String yAddress = yacyResolver.resolve(host);
        if (yAddress != null) {
            url = new DigestURL(url.getProtocol() + "://" + yAddress + path);
        }
    }

    // create a request header
    final RequestHeader requestHeader = createRequestheader(request, agent);

    // HTTP-Client
    final HTTPClient client = new HTTPClient(agent);
    client.setRedirecting(false); // we want to handle redirection
                                  // ourselves, so we don't index pages
                                  // twice
    client.setTimout(this.socketTimeout);
    client.setHeader(requestHeader.entrySet());

    // send request
    client.GET(url, false);
    final int statusCode = client.getHttpResponse().getStatusLine().getStatusCode();
    final ResponseHeader responseHeader = new ResponseHeader(statusCode, client.getHttpResponse().getAllHeaders());
    String requestURLString = request.url().toNormalform(true);

    // check redirection
    if (statusCode > 299 && statusCode < 310) {

        final DigestURL redirectionUrl = extractRedirectURL(request, profile, url, client, statusCode,
                responseHeader, requestURLString);

        if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true)) {
            // we have two use cases here: loading from a crawl or just
            // loading the url. Check this:
            if (profile != null && !CrawlSwitchboard.DEFAULT_PROFILES.contains(profile.name())) {
                // put redirect url on the crawler queue to repeat a
                // double-check
                request.redirectURL(redirectionUrl);
                this.sb.crawlStacker.stackCrawl(request);
                // in the end we must throw an exception (even if this is
                // not an error), just to abort the current process
                throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " to "
                        + redirectionUrl.toNormalform(false) + " placed on crawler queue for double-check");
            }

            // if we are already doing a shutdown we don't need to retry
            // crawling
            if (Thread.currentThread().isInterrupted()) {
                this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile,
                        FailCategory.FINAL_LOAD_CONTEXT, "server shutdown", statusCode);
                throw new IOException(
                        "CRAWLER Redirect of URL=" + requestURLString + " aborted because of server shutdown.$");
            }

            // retry crawling with new url
            request.redirectURL(redirectionUrl);
            return openInputStream(request, profile, retryCount - 1, maxFileSize, blacklistType, agent);
        }
        // we don't want to follow redirects
        this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile,
                FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode);
        throw new IOException("REJECTED UNWANTED REDIRECTION '" + client.getHttpResponse().getStatusLine()
                + "' for URL '" + requestURLString + "'$");
    } else if (statusCode == 200 || statusCode == 203) {
        // the transfer is ok

        /*
         * When content is not large (less than 1MB), we had better cache it if caching is enabled and the url is not local
         */
        long contentLength = client.getHttpResponse().getEntity().getContentLength();
        if (profile != null && profile.storeHTCache() && contentLength > 0 && contentLength < (1024 * 1024) && !url.isLocal()) {
            byte[] content = HTTPClient.getByteArray(client.getHttpResponse().getEntity(), maxFileSize);

            try {
                Cache.store(url, responseHeader, content);
            } catch (final IOException e) {
                this.log.warn("cannot write " + url + " to Cache (3): " + e.getMessage(), e);
            }

            return new ByteArrayInputStream(content);
        }
        /*
         * Returns a HTTPInputStream delegating to
         * client.getContentstream(). Its close method will ensure the client is
         * properly closed.
         */
        return new HTTPInputStream(client);
    } else {
        // if the response does not have the right response type then reject the file
        this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile,
                FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", statusCode);
        throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine()
                + "' for URL '" + requestURLString + "'$");
    }
}
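
The recursion with retryCount - 1 bounds how many redirect hops are followed. The same pattern in a self-contained form with java.net.HttpURLConnection (names and error messages here are illustrative, not YaCy API):

import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;

public final class BoundedRedirectOpener {

    /** Opens a stream, following at most retryCount redirects by recursing with a decremented counter. */
    static InputStream open(URL url, int retryCount) throws IOException {
        if (retryCount < 0) {
            throw new IOException("retry counter exceeded for URL " + url);
        }
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        conn.setInstanceFollowRedirects(false); // handle redirection ourselves, like the crawler above
        int status = conn.getResponseCode();
        if (status > 299 && status < 310) {
            String location = conn.getHeaderField("Location");
            if (location == null || location.isEmpty()) {
                throw new IOException("redirect without Location header");
            }
            return open(new URL(url, location), retryCount - 1); // resolve relative Location values
        }
        return conn.getInputStream();
    }
}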

/**
 * Extract the redirect URL from the response header. The status code is supposed to be between 299 and 310. Parameters must not be null.
 * @return redirect URL
 * @throws IOException when an error occurred
 */
private DigestURL extractRedirectURL(final Request request, CrawlProfile profile, DigestURL url,
        final HTTPClient client, final int statusCode, final ResponseHeader responseHeader, String requestURLString)
        throws IOException {
    // read redirection URL
    String redirectionUrlString = responseHeader.get(HeaderFramework.LOCATION);
    redirectionUrlString = redirectionUrlString == null ? "" : redirectionUrlString.trim();

    if (redirectionUrlString.isEmpty()) {
        this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile,
                FailCategory.TEMPORARY_NETWORK_FAILURE,
                "no redirection url provided, field '" + HeaderFramework.LOCATION + "' is empty", statusCode);
        throw new IOException("REJECTED EMPTY REDIRECTION '" + client.getHttpResponse().getStatusLine()
                + "' for URL '" + requestURLString + "'$");
    }

    // normalize URL
    final DigestURL redirectionUrl = DigestURL.newURL(request.url(), redirectionUrlString);

    // restart crawling with new url
    this.log.info("CRAWLER Redirection detected ('" + client.getHttpResponse().getStatusLine() + "') for URL "
            + requestURLString);
    this.log.info("CRAWLER ..Redirecting request to: " + redirectionUrl.toNormalform(false));

    this.sb.webStructure.generateCitationReference(url, redirectionUrl);

    if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_RECORD_REDIRECTS, true)) {
        this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile,
                FailCategory.FINAL_REDIRECT_RULE, "redirect to " + redirectionUrlString, statusCode);
    }
    return redirectionUrl;
}

/**
 * Create a request header for loading content.
 * @param request search request
 * @param agent agent identification information
 * @return a request header
 * @throws IOException when an error occurred
 */
private RequestHeader createRequestheader(final Request request, final ClientIdentification.Agent agent)
        throws IOException {
    final RequestHeader requestHeader = new RequestHeader();
    requestHeader.put(HeaderFramework.USER_AGENT, agent.userAgent);
    DigestURL refererURL = null;
    if (request.referrerhash() != null) {
        refererURL = this.sb.getURL(request.referrerhash());
    }
    if (refererURL != null) {
        requestHeader.put(RequestHeader.REFERER, refererURL.toNormalform(true));
    }
    requestHeader.put(HeaderFramework.ACCEPT, this.sb.getConfig("crawler.http.accept", DEFAULT_ACCEPT));
    requestHeader.put(HeaderFramework.ACCEPT_LANGUAGE,
            this.sb.getConfig("crawler.http.acceptLanguage", DEFAULT_LANGUAGE));
    requestHeader.put(HeaderFramework.ACCEPT_CHARSET,
            this.sb.getConfig("crawler.http.acceptCharset", DEFAULT_CHARSET));
    requestHeader.put(HeaderFramework.ACCEPT_ENCODING,
            this.sb.getConfig("crawler.http.acceptEncoding", DEFAULT_ENCODING));
    return requestHeader;
}

private Response load(final Request request, CrawlProfile profile, final int retryCount, final int maxFileSize, final BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {

@@ -112,15 +318,7 @@ public final class HTTPLoader {
    Response response = null;

    // create a request header
    final RequestHeader requestHeader = new RequestHeader();
    requestHeader.put(HeaderFramework.USER_AGENT, agent.userAgent);
    DigestURL refererURL = null;
    if (request.referrerhash() != null) refererURL = this.sb.getURL(request.referrerhash());
    if (refererURL != null) requestHeader.put(RequestHeader.REFERER, refererURL.toNormalform(true));
    requestHeader.put(HeaderFramework.ACCEPT, this.sb.getConfig("crawler.http.accept", DEFAULT_ACCEPT));
    requestHeader.put(HeaderFramework.ACCEPT_LANGUAGE, this.sb.getConfig("crawler.http.acceptLanguage", DEFAULT_LANGUAGE));
    requestHeader.put(HeaderFramework.ACCEPT_CHARSET, this.sb.getConfig("crawler.http.acceptCharset", DEFAULT_CHARSET));
    requestHeader.put(HeaderFramework.ACCEPT_ENCODING, this.sb.getConfig("crawler.http.acceptEncoding", DEFAULT_ENCODING));
    final RequestHeader requestHeader = createRequestheader(request, agent);

    // HTTP-Client
    final HTTPClient client = new HTTPClient(agent);

@@ -137,27 +335,8 @@ public final class HTTPLoader {
    // check redirection
    if (statusCode > 299 && statusCode < 310) {

        // read redirection URL
        String redirectionUrlString = responseHeader.get(HeaderFramework.LOCATION);
        redirectionUrlString = redirectionUrlString == null ? "" : redirectionUrlString.trim();

        if (redirectionUrlString.isEmpty()) {
            this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "no redirection url provided, field '" + HeaderFramework.LOCATION + "' is empty", statusCode);
            throw new IOException("REJECTED EMPTY REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL '" + requestURLString + "'$");
        }

        // normalize URL
        final DigestURL redirectionUrl = DigestURL.newURL(request.url(), redirectionUrlString);

        // restart crawling with new url
        this.log.info("CRAWLER Redirection detected ('" + client.getHttpResponse().getStatusLine() + "') for URL " + requestURLString);
        this.log.info("CRAWLER ..Redirecting request to: " + redirectionUrl.toNormalform(false));

        this.sb.webStructure.generateCitationReference(url, redirectionUrl);

        if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_RECORD_REDIRECTS, true)) {
            this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_REDIRECT_RULE, "redirect to " + redirectionUrlString, statusCode);
        }
        final DigestURL redirectionUrl = extractRedirectURL(request, profile, url, client, statusCode,
                responseHeader, requestURLString);

        if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true)) {
            // we have two use cases here: loading from a crawl or just loading the url. Check this:

@@ -26,8 +26,10 @@
package net.yacy.repository;

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.util.Arrays;
import java.util.Date;

@@ -209,54 +211,9 @@ public final class LoaderDispatcher {
}

// check if we have the page in the cache
if (cacheStrategy != CacheStrategy.NOCACHE && crawlProfile != null) {
    // we have passed a first test if caching is allowed
    // now see if there is a cache entry

    final ResponseHeader cachedResponse = (url.isLocal()) ? null : Cache.getResponseHeader(url.hash());
    if (cachedResponse != null && Cache.hasContent(url.hash())) {
        // yes we have the content

        // create request header values and a response object because we need that
        // in case that we want to return the cached content in the next step
        final RequestHeader requestHeader = new RequestHeader();
        requestHeader.put(HeaderFramework.USER_AGENT, agent.userAgent);
        DigestURL refererURL = null;
        if (request.referrerhash() != null) refererURL = this.sb.getURL(request.referrerhash());
        if (refererURL != null) requestHeader.put(RequestHeader.REFERER, refererURL.toNormalform(true));
        final Response response = new Response(
                request,
                requestHeader,
                cachedResponse,
                crawlProfile,
                true,
                null);

        // check which caching strategy shall be used
        if (cacheStrategy == CacheStrategy.IFEXIST || cacheStrategy == CacheStrategy.CACHEONLY) {
            // well, just take the cache and don't care about freshness of the content
            final byte[] content = Cache.getContent(url.hash());
            if (content != null) {
                LoaderDispatcher.log.info("cache hit/useall for: " + url.toNormalform(true));
                response.setContent(content);
                return response;
            }
        }

        // now the cacheStrategy must be CACHE_STRATEGY_IFFRESH, that means we should do a proxy freshness test
        //assert cacheStrategy == CacheStrategy.IFFRESH : "cacheStrategy = " + cacheStrategy;
        if (response.isFreshForProxy()) {
            final byte[] content = Cache.getContent(url.hash());
            if (content != null) {
                LoaderDispatcher.log.info("cache hit/fresh for: " + url.toNormalform(true));
                response.setContent(content);
                return response;
            }
        }
        LoaderDispatcher.log.info("cache hit/stale for: " + url.toNormalform(true));
    } else if (cachedResponse != null) {
        LoaderDispatcher.log.warn("HTCACHE contained response header, but not content for url " + url.toNormalform(true));
    }
}
Response response = loadFromCache(request, cacheStrategy, agent, url, crawlProfile);
if (response != null) {
    return response;
}

// check case where we want results from the cache exclusively, and never from the Internet (offline mode)

@@ -269,21 +226,7 @@ public final class LoaderDispatcher {

// check access time: this is a double-check (we checked possibly already in the balancer)
// to make sure that we don't DoS the target by mistake
if (!url.isLocal()) {
    final Long lastAccess = accessTime.get(host);
    long wait = 0;
    if (lastAccess != null) wait = Math.max(0, agent.minimumDelta + lastAccess.longValue() - System.currentTimeMillis());
    if (wait > 0) {
        // force a sleep here. Instead of just sleeping we clean up the accessTime map
        final long untilTime = System.currentTimeMillis() + wait;
        cleanupAccessTimeTable(untilTime);
        if (System.currentTimeMillis() < untilTime) {
            long frcdslp = untilTime - System.currentTimeMillis();
            LoaderDispatcher.log.info("Forcing sleep of " + frcdslp + " ms for host " + host);
            try {Thread.sleep(frcdslp);} catch (final InterruptedException ee) {}
        }
    }
}
checkAccessTime(agent, url);

// now it's for sure that we will access the target. Remember the access time
if (host != null) {

@@ -292,7 +235,6 @@ public final class LoaderDispatcher {
}

// load resource from the internet
Response response = null;
if (protocol.equals("http") || protocol.equals("https")) {
    response = this.httpLoader.load(request, crawlProfile, maxFileSize, blacklistType, agent);
} else if (protocol.equals("ftp")) {

@@ -331,6 +273,167 @@ public final class LoaderDispatcher {
    return response;
}

/**
 * Try loading the requested resource from the cache according to the cache strategy
 * @param request request to resource
 * @param cacheStrategy cache strategy to use
 * @param agent agent identifier
 * @param url resource url
 * @param crawlProfile crawl profile
 * @return a Response instance when the resource could be loaded from the cache, or null.
 * @throws IOException when an error occurred
 */
private Response loadFromCache(final Request request, CacheStrategy cacheStrategy, ClientIdentification.Agent agent,
        final DigestURL url, final CrawlProfile crawlProfile) throws IOException {
    Response response = null;
    if (cacheStrategy != CacheStrategy.NOCACHE && crawlProfile != null) {
        // we have passed a first test if caching is allowed
        // now see if there is a cache entry

        final ResponseHeader cachedResponse = (url.isLocal()) ? null : Cache.getResponseHeader(url.hash());
        if (cachedResponse != null && Cache.hasContent(url.hash())) {
            // yes we have the content

            // create request header values and a response object because we need that
            // in case that we want to return the cached content in the next step
            final RequestHeader requestHeader = new RequestHeader();
            requestHeader.put(HeaderFramework.USER_AGENT, agent.userAgent);
            DigestURL refererURL = null;
            if (request.referrerhash() != null) refererURL = this.sb.getURL(request.referrerhash());
            if (refererURL != null) requestHeader.put(RequestHeader.REFERER, refererURL.toNormalform(true));
            response = new Response(
                    request,
                    requestHeader,
                    cachedResponse,
                    crawlProfile,
                    true,
                    null);

            // check which caching strategy shall be used
            if (cacheStrategy == CacheStrategy.IFEXIST || cacheStrategy == CacheStrategy.CACHEONLY) {
                // well, just take the cache and don't care about freshness of the content
                final byte[] content = Cache.getContent(url.hash());
                if (content != null) {
                    LoaderDispatcher.log.info("cache hit/useall for: " + url.toNormalform(true));
                    response.setContent(content);
                    return response;
                }
            }

            // now the cacheStrategy must be CACHE_STRATEGY_IFFRESH, that means we should do a proxy freshness test
            //assert cacheStrategy == CacheStrategy.IFFRESH : "cacheStrategy = " + cacheStrategy;
            if (response.isFreshForProxy()) {
                final byte[] content = Cache.getContent(url.hash());
                if (content != null) {
                    LoaderDispatcher.log.info("cache hit/fresh for: " + url.toNormalform(true));
                    response.setContent(content);
                    return response;
                }
            }
            LoaderDispatcher.log.info("cache hit/stale for: " + url.toNormalform(true));
        } else if (cachedResponse != null) {
            LoaderDispatcher.log.warn("HTCACHE contained response header, but not content for url " + url.toNormalform(true));
        }
    }
    return response;
}

/**
 * Open an InputStream on a resource from the web, from ftp, from smb or a file
 * @param request the request essentials
 * @param cacheStrategy strategy according to NOCACHE, IFFRESH, IFEXIST, CACHEONLY
 * @return an open InputStream. Don't forget to close it once used!
 * @throws IOException when the url is malformed, blacklisted, or CacheStrategy is CACHEONLY and content is unavailable
 */
private InputStream openInputStreamInternal(final Request request, CacheStrategy cacheStrategy, final int maxFileSize, final BlacklistType blacklistType, ClientIdentification.Agent agent) throws IOException {
    // get the protocol of the next URL
    final DigestURL url = request.url();
    if (url.isFile() || url.isSMB()) {
        cacheStrategy = CacheStrategy.NOCACHE; // load just from the file system
    }
    final String protocol = url.getProtocol();
    final String host = url.getHost();
    final CrawlProfile crawlProfile = request.profileHandle() == null ? null : this.sb.crawler.get(UTF8.getBytes(request.profileHandle()));

    // check if url is in blacklist
    if (blacklistType != null && host != null && Switchboard.urlBlacklist.isListed(blacklistType, host.toLowerCase(), url.getFile())) {
        this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), crawlProfile, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
        throw new IOException("DISPATCHER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.$");
    }

    // check if we have the page in the cache
    Response cachedResponse = loadFromCache(request, cacheStrategy, agent, url, crawlProfile);
    if (cachedResponse != null) {
        return new ByteArrayInputStream(cachedResponse.getContent());
    }

    // check case where we want results from the cache exclusively, and never from the Internet (offline mode)
    if (cacheStrategy == CacheStrategy.CACHEONLY) {
        // we had a chance to get the content from the cache .. it's over. We don't have it.
        throw new IOException("cache only strategy");
    }

    // now forget about the cache, nothing there. Try to load the content from the Internet

    // check access time: this is a double-check (we checked possibly already in the balancer)
    // to make sure that we don't DoS the target by mistake
    checkAccessTime(agent, url);

    // now it's for sure that we will access the target. Remember the access time
    if (host != null) {
        if (accessTime.size() > accessTimeMaxsize) accessTime.clear(); // prevent a memory leak here
        accessTime.put(host, System.currentTimeMillis());
    }

    // load resource from the internet
    InputStream inStream = null;
    if (protocol.equals("http") || protocol.equals("https")) {
        inStream = this.httpLoader.openInputStream(request, crawlProfile, 1, maxFileSize, blacklistType, agent);
    } else if (protocol.equals("ftp") || protocol.equals("smb") || protocol.equals("file")) {
        // may also open directly stream with ftp loader
        inStream = url.getInputStream(agent, null, null);
    } else {
        throw new IOException("Unsupported protocol '" + protocol + "' in url " + url);
    }
    if (inStream == null) {
        throw new IOException("Unable to open content stream");
    }

    return inStream;
}

/**
 * Check access time: this is a double-check (we checked possibly already in the balancer)
 * to make sure that we don't DoS the target by mistake
 * @param agent agent identifier
 * @param url target url
 */
private void checkAccessTime(ClientIdentification.Agent agent, final DigestURL url) {
    if (!url.isLocal()) {
        String host = url.getHost();
        final Long lastAccess = accessTime.get(host);
        long wait = 0;
        if (lastAccess != null)
            wait = Math.max(0, agent.minimumDelta + lastAccess.longValue() - System.currentTimeMillis());
        if (wait > 0) {
            // force a sleep here. Instead of just sleeping we clean up the
            // accessTime map
            final long untilTime = System.currentTimeMillis() + wait;
            cleanupAccessTimeTable(untilTime);
            if (System.currentTimeMillis() < untilTime) {
                long frcdslp = untilTime - System.currentTimeMillis();
                LoaderDispatcher.log.info("Forcing sleep of " + frcdslp + " ms for host " + host);
                try {
                    Thread.sleep(frcdslp);
                } catch (final InterruptedException ee) {
                }
            }
        }
    }
}
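
The politeness delay above boils down to wait = max(0, minimumDelta + lastAccess - now). A self-contained sketch of the same per-host throttle, without the cleanup pass (all names illustrative):

import java.util.concurrent.ConcurrentHashMap;

public class HostThrottle {
    private final ConcurrentHashMap<String, Long> accessTime = new ConcurrentHashMap<>();
    private final long minimumDeltaMillis;

    HostThrottle(long minimumDeltaMillis) {
        this.minimumDeltaMillis = minimumDeltaMillis;
    }

    /** Sleeps just long enough to keep at least minimumDeltaMillis between requests to one host. */
    void awaitTurn(String host) throws InterruptedException {
        Long last = accessTime.get(host);
        long wait = (last == null) ? 0
                : Math.max(0, minimumDeltaMillis + last - System.currentTimeMillis());
        if (wait > 0) {
            Thread.sleep(wait);
        }
        accessTime.put(host, System.currentTimeMillis()); // remember this access for the next caller
    }
}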

private int protocolMaxFileSize(final DigestURL url) {
    if (url.isHTTP() || url.isHTTPS())
        return this.sb.getConfigInt("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);

@@ -357,6 +460,53 @@ public final class LoaderDispatcher {
    // read resource body (if it is there)
    return entry.getContent();
}

/**
 * Open url as InputStream from the web or the cache
 * @param request must not be null
 * @param cacheStrategy cache strategy to use
 * @param blacklistType black list
 * @param agent agent identification for HTTP requests
 * @return an open InputStream on content. Don't forget to close it once used.
 * @throws IOException when the url is malformed or blacklisted
 */
public InputStream openInputStream(final Request request, final CacheStrategy cacheStrategy,
        BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
    final int maxFileSize = protocolMaxFileSize(request.url());
    InputStream stream = null;

    Semaphore check = this.loaderSteering.get(request.url());
    if (check != null && cacheStrategy != CacheStrategy.NOCACHE) {
        // a loading process is going on for that url
        long t = System.currentTimeMillis();
        try {
            check.tryAcquire(5, TimeUnit.SECONDS);
        } catch (final InterruptedException e) {
        }
        ConcurrentLog.info("LoaderDispatcher",
                "waited " + (System.currentTimeMillis() - t) + " ms for " + request.url().toNormalform(true));
        // now the process may have terminated and we run a normal loading
        // which may be successful faster because of a cache hit
    }

    this.loaderSteering.put(request.url(), new Semaphore(0));
    try {
        stream = openInputStreamInternal(request, cacheStrategy, maxFileSize, blacklistType, agent);
    } catch (IOException ioe) {
        /* Do not re-encapsulate an IOException in another IOException */
        throw ioe;
    } catch (final Throwable e) {
        throw new IOException(e);
    } finally {
        // release the semaphore anyway
        check = this.loaderSteering.remove(request.url());
        if (check != null) {
            check.release(1000); // don't block any other
        }
    }

    return stream;
}
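
The loaderSteering map above coalesces concurrent loads of the same URL: a second caller parks on a Semaphore for up to five seconds so it can then profit from the first caller's freshly written cache entry. A compact standalone sketch of that idea (all names illustrative):

import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Semaphore;
import java.util.concurrent.TimeUnit;

public class LoadCoalescer {
    private final ConcurrentHashMap<String, Semaphore> inFlight = new ConcurrentHashMap<>();

    byte[] load(String url) throws Exception {
        Semaphore pending = inFlight.get(url);
        if (pending != null) {
            // someone is already loading this url: wait briefly, then proceed (ideally via cache)
            pending.tryAcquire(5, TimeUnit.SECONDS);
        }
        inFlight.put(url, new Semaphore(0));
        try {
            return fetchAndCache(url);
        } finally {
            Semaphore done = inFlight.remove(url);
            if (done != null) {
                done.release(1000); // wake every parked waiter
            }
        }
    }

    private byte[] fetchAndCache(String url) {
        return new byte[0]; // placeholder for the real network fetch + cache write
    }
}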

public Document[] loadDocuments(final Request request, final CacheStrategy cacheStrategy, final int maxFileSize, BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException, Parser.Failure {

@@ -8,6 +8,9 @@ import java.util.List;
import java.util.Map;
import java.util.TreeMap;

import javax.imageio.ImageIO;
import javax.imageio.stream.ImageInputStream;

import net.yacy.cora.util.ConcurrentLog;
import net.yacy.peers.graphics.EncodedImage;
import net.yacy.server.serverObjects;

@@ -75,8 +78,9 @@ public class ViewImagePerfTest extends ViewImageTest {
}

/**
 * Process inFile image, update processedFiles list and failures map, and append measurements to results_perfs.txt. All
 * parameters must not be null.
 * Process inFile image, update processedFiles list and failures map, and
 * append measurements to results_perfs.txt. All parameters must not be
 * null.
 *
 * @param ext
 *            output encoding image format

@@ -92,7 +96,7 @@ public class ViewImagePerfTest extends ViewImageTest {
 *             when a read/write error occurred
 */
@Override
protected void processFile(String ext, File outDir, serverObjects post, Map<String, Exception> failures,
protected void processFile(String ext, File outDir, serverObjects post, Map<String, Throwable> failures,
        File inFile) throws IOException {
/* Delete eventual previous result file */
System.out

@@ -102,43 +106,43 @@ public class ViewImagePerfTest extends ViewImageTest {
outFile.delete();
}

byte[] resourceb = getBytes(inFile);
String urlString = inFile.getAbsolutePath();
EncodedImage img = null;
Exception error = null;
long beginTime = System.nanoTime(), time, minTime = Long.MAX_VALUE, maxTime = 0, meanTime = 0, totalTime = 0;
int step = 0;
for (step = 0; (totalTime / 1000000000) < this.minMeasureTime; step++) {
beginTime = System.nanoTime();
ImageInputStream inStream = ImageIO.createImageInputStream(inFile);
try {
img = ViewImage.parseAndScale(post, true, urlString, ext, inStream);
} catch (Exception e) {
error = e;
}
time = System.nanoTime() - beginTime;
minTime = Math.min(minTime, time);
maxTime = Math.max(maxTime, time);
totalTime += time;
}
if (step > 0) {
meanTime = totalTime / step;
} else {
meanTime = totalTime;
}
PrintWriter resultsWriter = new PrintWriter(new FileWriter(new File(outDir, "results_perfs.txt"), true));
try {
long beginTime, time, minTime = Long.MAX_VALUE, maxTime = 0, meanTime = 0, totalTime = 0;
int step = 0;
for (step = 0; (totalTime / 1000000000) < this.minMeasureTime; step++) {
beginTime = System.nanoTime();
img = ViewImage.parseAndScale(post, true, urlString, ext, false, resourceb);
time = System.nanoTime() - beginTime;
if (img == null) {
break;
}
minTime = Math.min(minTime, time);
maxTime = Math.max(maxTime, time);
totalTime += time;
writeMessage("Measured ViewImage render with file : " + inFile.getAbsolutePath() + " encoded To : " + ext,
        resultsWriter);
if (img == null) {
writeMessage("Image could not be rendered! Measurements show the time needed to read and parse image data until error detection.", resultsWriter);
}
if (img == null) {
System.out.println("Image could not be rendered!");
} else {
meanTime = totalTime / step;
PrintWriter resultsWriter = new PrintWriter(new FileWriter(new File(outDir, "results_perfs.txt"), true));
try {
writeMessage("Measured ViewImage render with file : " + inFile.getAbsolutePath() + " encoded To : "
        + ext, resultsWriter);
writeMessage("Render total time (ms) : " + (totalTime) / 1000000 + " on " + step + " steps.",
        resultsWriter);
writeMessage("Render mean time (ms) : " + (meanTime) / 1000000, resultsWriter);
writeMessage("Render min time (ms) : " + (minTime) / 1000000, resultsWriter);
writeMessage("Render max time (ms) : " + (maxTime) / 1000000, resultsWriter);
} finally {
resultsWriter.close();
}
}
} catch (Exception e) {
error = e;
writeMessage("Render total time (ms) : " + (totalTime) / 1000000 + " on " + step + " steps.",
        resultsWriter);
writeMessage("Render mean time (ms) : " + (meanTime) / 1000000, resultsWriter);
writeMessage("Render min time (ms) : " + (minTime) / 1000000, resultsWriter);
writeMessage("Render max time (ms) : " + (maxTime) / 1000000, resultsWriter);
} finally {
resultsWriter.close();
}

if (img == null) {

@@ -218,7 +222,7 @@ public class ViewImagePerfTest extends ViewImageTest {
System.out.println("Rendered images will be written in dir : " + outDir.getAbsolutePath());

List<File> processedFiles = new ArrayList<File>();
Map<String, Exception> failures = new TreeMap<>();
Map<String, Throwable> failures = new TreeMap<>();
try {
long time = System.nanoTime();
test.processFiles(ext, recursive, outDir, post, inFiles, processedFiles, failures);
@@ -1,9 +1,7 @@
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import java.net.URL;
import java.util.ArrayList;

@@ -12,6 +10,9 @@ import java.util.Map;
import java.util.Map.Entry;
import java.util.TreeMap;

import javax.imageio.ImageIO;
import javax.imageio.stream.ImageInputStream;

import net.yacy.cora.util.ConcurrentLog;
import net.yacy.peers.graphics.EncodedImage;
import net.yacy.server.serverObjects;

@@ -52,24 +53,6 @@ public class ViewImageTest {
/** Default output encoding format */
private static final String DEFAULT_OUT_EXT = "png";

/**
 * @param testFile
 *            file to load
 * @return testFile content as a bytes array
 * @throws IOException
 *             when an error occurred while loading
 */
protected byte[] getBytes(File testFile) throws IOException {
    InputStream inStream = new FileInputStream(testFile);
    byte[] res = new byte[inStream.available()];
    try {
        inStream.read(res);
    } finally {
        inStream.close();
    }
    return res;
}

/**
 * @param args
 *            main parameters. first item may contain input file or folder

@@ -207,7 +190,7 @@ public class ViewImageTest {
 * @param processedFiles
 *            all processed image files
 * @param failures
 *            map input file url which failed with eventual cause exception
 *            map input file url which failed with eventual cause error
 * @param time
 *            total processing time in nanoseconds
 * @param outDir

@@ -215,7 +198,7 @@ public class ViewImageTest {
 * @throws IOException
 *             when a write error occurred writing the results file
 */
protected void displayResults(List<File> processedFiles, Map<String, Exception> failures, long time, File outDir)
protected void displayResults(List<File> processedFiles, Map<String, Throwable> failures, long time, File outDir)
        throws IOException {
    PrintWriter resultsWriter = new PrintWriter(new FileWriter(new File(outDir, "results.txt")));
    try {

@@ -226,7 +209,7 @@ public class ViewImageTest {
} else {
    writeMessage("Some input files could not be processed :", resultsWriter);
}
for (Entry<String, Exception> entry : failures.entrySet()) {
for (Entry<String, Throwable> entry : failures.entrySet()) {
    writeMessage(entry.getKey(), resultsWriter);
    if (entry.getValue() != null) {
        writeMessage("cause : " + entry.getValue(), resultsWriter);

@@ -266,7 +249,7 @@ public class ViewImageTest {
 *             when a read/write error occurred
 */
protected void processFiles(String ext, boolean recursive, File outDir, serverObjects post, File[] inFiles,
        List<File> processedFiles, Map<String, Exception> failures) throws IOException {
        List<File> processedFiles, Map<String, Throwable> failures) throws IOException {
    for (File inFile : inFiles) {
        if (inFile.isDirectory()) {
            if (recursive) {

@@ -291,7 +274,7 @@ public class ViewImageTest {
 * @param inFile file image to process
 * @throws IOException when a read/write error occurred
 */
protected void processFile(String ext, File outDir, serverObjects post, Map<String, Exception> failures, File inFile)
protected void processFile(String ext, File outDir, serverObjects post, Map<String, Throwable> failures, File inFile)
        throws IOException {
    /* Delete eventual previous result file */
    File outFile = new File(outDir, inFile.getName() + "." + ext);

@@ -299,13 +282,13 @@ public class ViewImageTest {
    outFile.delete();
}

byte[] resourceb = getBytes(inFile);
ImageInputStream inStream = ImageIO.createImageInputStream(inFile);
String urlString = inFile.getAbsolutePath();
EncodedImage img = null;
Exception error = null;
Throwable error = null;
try {
    img = ViewImage.parseAndScale(post, true, urlString, ext, false, resourceb);
} catch (Exception e) {
    img = ViewImage.parseAndScale(post, true, urlString, ext, inStream);
} catch (Throwable e) {
    error = e;
}

@@ -383,7 +366,7 @@ public class ViewImageTest {
System.out.println("Rendered images will be written in dir : " + outDir.getAbsolutePath());

List<File> processedFiles = new ArrayList<File>();
Map<String, Exception> failures = new TreeMap<>();
Map<String, Throwable> failures = new TreeMap<>();
try {
long time = System.nanoTime();
test.processFiles(ext, recursive, outDir, post, inFiles, processedFiles, failures);