Improved consistency between loader openInputStream and load functions

luccioman 2017-06-02 01:46:06 +02:00
parent cbccf97361
commit a9cb083fa1
8 changed files with 300 additions and 51 deletions

View File: FTPLoader.java

@ -27,6 +27,7 @@
package net.yacy.crawler.retrieval;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.PrintStream;
@ -167,6 +168,29 @@ public class FTPLoader {
return response;
}
/**
* Open a stream on the entry content from an FTP server
*
* @param request the request to process
* @param acceptOnlyParseable when true and no parser can be found to handle the detected MIME type, open a stream on the URL tokens
* @return a response with full metadata, embedding an open input stream on the content. Don't forget to close the stream.
*/
public StreamResponse openInputStream(final Request request, final boolean acceptOnlyParseable) throws IOException {
final Response response = load(request, acceptOnlyParseable);
// TODO implement a true ftp content stream instead of a simple ByteArrayInputStream encapsulation
final StreamResponse streamResponse;
if(response.getContent() != null) {
streamResponse = new StreamResponse(response,
new ByteArrayInputStream(response.getContent()));
} else {
/* content can be null when no parser can handle it: then return the URL tokens as content */
streamResponse = new StreamResponse(response,
new ByteArrayInputStream(UTF8.getBytes(request.url().toTokens())));
}
return streamResponse;
}
/**
* @param ftpClient
*/
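For illustration, a hedged call-site sketch of the new FTP entry point (not part of the commit; the FTPLoader instance 'ftpLoader' and the crawler Request 'request' are assumed to exist):

// Sketch: typical consumption of the new StreamResponse
final StreamResponse streamResponse = ftpLoader.openInputStream(request, true);
try (final InputStream content = streamResponse.getContentStream()) {
    // meta data is available on the Response even before the stream is consumed
    final Response response = streamResponse.getResponse();
    System.out.println("MIME type: " + response.getMimeType());
}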

View File: FileLoader.java

@ -24,6 +24,7 @@
package net.yacy.crawler.retrieval;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
@ -58,7 +59,31 @@ public class FileLoader {
this.maxFileSize = (int) sb.getConfigLong("crawler.file.maxFileSize", -1l);
}
/**
* Fully load the requested file into a byte buffer
*
* @param request the request to process
* @param acceptOnlyParseable when true and no parser can be found to handle the detected MIME type, the response content buffer contains only the URL tokens
* @return a response with full metadata, embedding the content in a byte buffer
*/
public Response load(final Request request, boolean acceptOnlyParseable) throws IOException {
StreamResponse streamResponse = openInputStream(request, acceptOnlyParseable);
/* Fully read the stream and update the response */
byte[] content = FileUtils.read(streamResponse.getContentStream());
Response response = streamResponse.getResponse();
response.setContent(content);
return response;
}
/**
* Open a stream on the requested file
*
* @param request the request to process
* @param acceptOnlyParseable when true and no parser can be found to handle the detected MIME type, open a stream on the URL tokens
* @return a response with full metadata, embedding an open input stream on the content. Don't forget to close the stream.
*/
public StreamResponse openInputStream(final Request request, final boolean acceptOnlyParseable) throws IOException {
DigestURL url = request.url();
if (!url.getProtocol().equals("file")) throw new IOException("wrong protocol for FileLoader: " + url.getProtocol());
@ -93,9 +118,9 @@ public class FileLoader {
responseHeader,
profile,
false,
UTF8.getBytes(content.toString()));
null);
return response;
return new StreamResponse(response, new ByteArrayInputStream(UTF8.getBytes(content.toString())));
}
// create response header
@ -133,13 +158,12 @@ public class FileLoader {
responseHeader,
profile,
false,
UTF8.getBytes(url.toTokens()));
return response;
null);
return new StreamResponse(response, new ByteArrayInputStream(UTF8.getBytes(url.toTokens())));
}
// load the resource
InputStream is = url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent);
byte[] b = FileUtils.read(is);
final InputStream is = url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent);
// create response with loaded content
final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
@ -149,7 +173,7 @@ public class FileLoader {
responseHeader,
profile,
false,
b);
return response;
null);
return new StreamResponse(response, is);
}
}
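The two FileLoader entry points now share one implementation; a hedged sketch of how a caller might pick between them (variable names assumed, not part of the commit):

// small resources: load() buffers the whole content in memory
final Response buffered = fileLoader.load(request, true);
final byte[] bytes = buffered.getContent();

// large resources: openInputStream() leaves reading to the caller
final StreamResponse streamed = fileLoader.openInputStream(request, true);
try (final InputStream in = streamed.getContentStream()) {
    // process the stream incrementally here
}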

View File: HTTPLoader.java

@ -28,6 +28,7 @@ import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import org.apache.http.HttpStatus;
import org.apache.http.StatusLine;
import net.yacy.cora.document.id.DigestURL;
@ -83,17 +84,18 @@ public final class HTTPLoader {
}
/**
* Open input stream on a requested HTTP resource. When resource is small, fully load it and returns a ByteArrayInputStream instance.
* Open an input stream on a requested HTTP resource. When the resource content size is small
* (lower than {@link Response#CRAWLER_MAX_SIZE_TO_CACHE}), fully load it and use a ByteArrayInputStream instance.
* @param request
* @param profile crawl profile
* @param retryCount remaining number of redirect retries
* @param maxFileSize max file size to load. -1 means no limit.
* @param blacklistType blacklist type to use
* @param agent agent identifier
* @return an open input stream. Don't forget to close it.
* @throws IOException when an error occured
* @return a response with full metadata, embedding an open input stream on the content. Don't forget to close the stream.
* @throws IOException when an error occurred
*/
public InputStream openInputStream(final Request request, CrawlProfile profile, final int retryCount,
public StreamResponse openInputStream(final Request request, CrawlProfile profile, final int retryCount,
final int maxFileSize, final BlacklistType blacklistType, final ClientIdentification.Agent agent)
throws IOException {
if (retryCount < 0) {
@ -200,13 +202,14 @@ public final class HTTPLoader {
FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode);
throw new IOException("REJECTED UNWANTED REDIRECTION '" + statusline
+ "' for URL '" + requestURLString + "'$");
} else if (statusCode == 200 || statusCode == 203) {
} else if (statusCode == HttpStatus.SC_OK || statusCode == HttpStatus.SC_NON_AUTHORITATIVE_INFORMATION) {
// the transfer is ok
/*
* When content is not large (less than 1MB), we have better cache it if cache is enabled and url is not local
* When the content is not large (less than Response.CRAWLER_MAX_SIZE_TO_CACHE), we had better cache it if the cache is enabled and the url is not local
*/
long contentLength = client.getHttpResponse().getEntity().getContentLength();
final InputStream contentStream;
if (profile != null && profile.storeHTCache() && contentLength > 0 && contentLength < (Response.CRAWLER_MAX_SIZE_TO_CACHE) && !url.isLocal()) {
byte[] content = null;
try {
@ -218,14 +221,17 @@ public final class HTTPLoader {
client.finish();
}
return new ByteArrayInputStream(content);
}
contentStream = new ByteArrayInputStream(content);
} else {
/*
* Returns a HTTPInputStream delegating to
* Create an HTTPInputStream delegating to
* client.getContentstream(). Close method will ensure client is
* properly closed.
*/
return new HTTPInputStream(client);
contentStream = new HTTPInputStream(client);
}
return new StreamResponse(new Response(request, requestHeader, responseHeader, profile, false, null), contentStream);
} else {
client.finish();
// if the response has not the right response type then reject file
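Whichever branch is taken, the caller sees a single closing contract. A hypothetical drain-and-close helper built only on the new API (a sketch, not part of the commit):

// Reads the whole content and closes the stream. Behaves the same whether the
// StreamResponse wraps a ByteArrayInputStream (small, cacheable response) or an
// HTTPInputStream still bound to the HTTP client (large response): one close()
// releases all underlying resources.
static byte[] readAll(final StreamResponse streamResponse) throws IOException {
    try (final InputStream in = streamResponse.getContentStream()) {
        final ByteArrayOutputStream out = new ByteArrayOutputStream();
        final byte[] buffer = new byte[8192];
        int n;
        while ((n = in.read(buffer)) != -1) {
            out.write(buffer, 0, n);
        }
        return out.toByteArray();
    }
}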

View File: Response.java

@ -226,10 +226,21 @@ public class Response {
this.status = newStatus;
}
/**
* @return the original request that produced this response
*/
public Request getRequest() {
return request;
}
public ResponseHeader getResponseHeader() {
return this.responseHeader;
}
public RequestHeader getRequestHeader() {
return this.requestHeader;
}
public boolean fromCache() {
return this.fromCache;
}

View File: SMBLoader.java

@ -27,6 +27,7 @@
package net.yacy.crawler.retrieval;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
@ -69,7 +70,31 @@ public class SMBLoader {
}
/**
* Fully load the requested file into a byte buffer
*
* @param request the request to process
* @param acceptOnlyParseable when true and no parser can be found to handle the detected MIME type, the response content buffer contains only the URL tokens
* @return a response with full metadata, embedding the content in a byte buffer
*/
public Response load(final Request request, boolean acceptOnlyParseable) throws IOException {
StreamResponse streamResponse = openInputStream(request, acceptOnlyParseable);
/* Fully read the stream and update the response */
byte[] content = FileUtils.read(streamResponse.getContentStream());
Response response = streamResponse.getResponse();
response.setContent(content);
return response;
}
/**
* Open a stream on the requested file
*
* @param request the request to process
* @param acceptOnlyParseable when true, do not open a stream on content when no parser can be found to handle the detected MIME type
* @return a response with full metadata, embedding an open input stream on the content. Don't forget to close the stream.
*/
public StreamResponse openInputStream(final Request request, final boolean acceptOnlyParseable) throws IOException {
DigestURL url = request.url();
if (!url.getProtocol().equals("smb")) throw new IOException("wrong loader for SMBLoader: " + url.getProtocol());
@ -111,9 +136,9 @@ public class SMBLoader {
responseHeader,
profile,
false,
UTF8.getBytes(content.toString()));
null);
return response;
return new StreamResponse(response, new ByteArrayInputStream(UTF8.getBytes(content.toString())));
}
// create response header
@ -151,13 +176,12 @@ public class SMBLoader {
responseHeader,
profile,
false,
url.toTokens().getBytes());
return response;
null);
return new StreamResponse(response, new ByteArrayInputStream(url.toTokens().getBytes()));
}
// load the resource
InputStream is = url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent);
byte[] b = FileUtils.read(is);
// create response with loaded content
final CrawlProfile profile = this.sb.crawler.get(request.profileHandle().getBytes());
@ -167,8 +191,8 @@ public class SMBLoader {
responseHeader,
profile,
false,
b);
return response;
null);
return new StreamResponse(response, is);
}
public static void main(String[] args) {

View File: StreamResponse.java

@ -0,0 +1,120 @@
// StreamResponse.java
// ---------------------------
// Copyright 2017 by luccioman; https://github.com/luccioman
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.crawler.retrieval;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
/**
* A crawler load response, holding content as a stream.
*/
public class StreamResponse {
/** Logger */
private final static ConcurrentLog log = new ConcurrentLog(StreamResponse.class.getSimpleName());
/**
* Content as a stream.
*/
private InputStream contentStream;
/**
* The response details, including notably the request and response headers.
*/
private Response response;
/**
* @param response
* contains the complete crawler response details
* @param contentStream
* an open input stream on the response content
* @throws IllegalArgumentException
* when response is null
*/
public StreamResponse(final Response response, final InputStream contentStream) {
if (response == null) {
throw new IllegalArgumentException("response parameter must not be null");
}
this.response = response;
this.contentStream = contentStream;
}
/**
* @return the content stream. Don't forget to close it when processing is
* terminated.
*/
public InputStream getContentStream() {
return this.contentStream;
}
/**
* @return the crawler response with complete details
*/
public Response getResponse() {
return this.response;
}
/**
* Parse and close the content stream and return the parsed documents when
* possible
*
* @return the parsed documents or null when an error occurred
* @throws Parser.Failure
* when no parser supports the content
*/
public Document[] parse() throws Parser.Failure {
final String supportError = TextParser.supports(this.response.url(),
this.response.getResponseHeader() == null ? null : this.response.getResponseHeader().getContentType());
if (supportError != null) {
throw new Parser.Failure("no parser support:" + supportError, this.response.url());
}
try {
return TextParser.parseSource(this.response.url(),
this.response.getResponseHeader() == null ? null
: this.response.getResponseHeader().getContentType(),
this.response.getResponseHeader() == null ? StandardCharsets.UTF_8.name()
: this.response.getResponseHeader().getCharacterEncoding(),
new VocabularyScraper(), this.response.getRequest().timezoneOffset(),
this.response.getRequest().depth(), this.response.size(), this.contentStream);
} catch (final Exception e) {
return null;
} finally {
if (this.contentStream != null) {
try {
this.contentStream.close();
} catch (IOException ignored) {
log.warn("Could not close content stream on url " + this.response.url());
}
}
}
}
}
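A hedged usage sketch for the class above (the loader call site is assumed; note that parse() consumes and closes the content stream itself, so no explicit close is needed on this path):

final StreamResponse streamResponse = loader.openInputStream(request, cacheStrategy, blacklistType, agent);
try {
    final Document[] documents = streamResponse.parse();
    if (documents == null) {
        // parsing failed for a reason other than missing parser support
    }
} catch (final Parser.Failure e) {
    // no parser supports the detected MIME type
}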

View File: LoaderDispatcher.java

@ -29,7 +29,6 @@ package net.yacy.repository;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.util.Arrays;
import java.util.Date;
@ -59,6 +58,7 @@ import net.yacy.crawler.retrieval.HTTPLoader;
import net.yacy.crawler.retrieval.Request;
import net.yacy.crawler.retrieval.Response;
import net.yacy.crawler.retrieval.SMBLoader;
import net.yacy.crawler.retrieval.StreamResponse;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
@ -347,7 +347,7 @@ public final class LoaderDispatcher {
* @return a StreamResponse embedding an open input stream on the content. Don't forget to close the stream once used!
* @throws IOException when url is malformed, blacklisted, or CacheStrategy is CACHEONLY and content is unavailable
*/
private InputStream openInputStreamInternal(final Request request, CacheStrategy cacheStrategy, final int maxFileSize, final BlacklistType blacklistType, ClientIdentification.Agent agent) throws IOException {
private StreamResponse openInputStreamInternal(final Request request, CacheStrategy cacheStrategy, final int maxFileSize, final BlacklistType blacklistType, ClientIdentification.Agent agent) throws IOException {
// get the protocol of the next URL
final DigestURL url = request.url();
if (url.isFile() || url.isSMB()) {
@ -367,7 +367,7 @@ public final class LoaderDispatcher {
// check if we have the page in the cache
Response cachedResponse = loadFromCache(request, cacheStrategy, agent, url, crawlProfile);
if (cachedResponse != null) {
return new ByteArrayInputStream(cachedResponse.getContent());
return new StreamResponse(cachedResponse, new ByteArrayInputStream(cachedResponse.getContent()));
}
// check case where we want results from the cache exclusively, and never from the Internet (offline mode)
@ -389,20 +389,20 @@ public final class LoaderDispatcher {
}
// load resource from the internet
InputStream inStream = null;
StreamResponse response;
if (protocol.equals("http") || protocol.equals("https")) {
inStream = this.httpLoader.openInputStream(request, crawlProfile, 1, maxFileSize, blacklistType, agent);
} else if (protocol.equals("ftp") || protocol.equals("smb") || protocol.equals("file")) {
// may also open directly stream with ftp loader
inStream = url.getInputStream(agent);
response = this.httpLoader.openInputStream(request, crawlProfile, 1, maxFileSize, blacklistType, agent);
} else if (protocol.equals("ftp")) {
response = this.ftpLoader.openInputStream(request, true);
} else if (protocol.equals("smb")) {
response = this.smbLoader.openInputStream(request, true);
} else if (protocol.equals("file")) {
response = this.fileLoader.openInputStream(request, true);
} else {
throw new IOException("Unsupported protocol '" + protocol + "' in url " + url);
}
if (inStream == null) {
throw new IOException("Unable to open content stream");
}
return inStream;
return response;
}
@ -464,18 +464,18 @@ public final class LoaderDispatcher {
}
/**
* Open url as InputStream from the web or the cache
* Open the URL as an InputStream from the web or the cache
* @param request must not be null
* @param cacheStrategy cache strategy to use
* @param blacklistType blacklist type to use
* @param agent agent identification for HTTP requests
* @return an open InputStream on content. Don't forget to close it once used.
* @return a response with full metadata, embedding an open input stream on the content. Don't forget to close the stream.
* @throws IOException when url is malformed or blacklisted
*/
public InputStream openInputStream(final Request request, final CacheStrategy cacheStrategy,
public StreamResponse openInputStream(final Request request, final CacheStrategy cacheStrategy,
BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
final int maxFileSize = protocolMaxFileSize(request.url());
InputStream stream = null;
StreamResponse response;
Semaphore check = this.loaderSteering.get(request.url());
if (check != null && cacheStrategy != CacheStrategy.NOCACHE) {
@ -493,9 +493,9 @@ public final class LoaderDispatcher {
this.loaderSteering.put(request.url(), new Semaphore(0));
try {
stream = openInputStreamInternal(request, cacheStrategy, maxFileSize, blacklistType, agent);
response = openInputStreamInternal(request, cacheStrategy, maxFileSize, blacklistType, agent);
} catch(IOException ioe) {
/* Do not re encapsulate eventual IOException in an IOException */
/* Do not re-encapsulate a possible IOException in another IOException */
throw ioe;
} catch (final Throwable e) {
throw new IOException(e);
@ -507,7 +507,7 @@ public final class LoaderDispatcher {
}
}
return stream;
return response;
}
public Document[] loadDocuments(final Request request, final CacheStrategy cacheStrategy, final int maxFileSize, BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException, Parser.Failure {
@ -555,6 +555,44 @@ public final class LoaderDispatcher {
}
}
/**
* Similar to the loadDocument method, but streams the resource content when possible instead of fully loading it into memory.
* @param location URL of the resource to load
* @param cachePolicy cache policy strategy
* @param blacklistType blacklist to use
* @param agent user agent identifier
* @return the parsed document or null when an error occurred while parsing
* @throws IOException when the content cannot be fetched or no parser supports it
*/
public Document loadDocumentAsStream(final DigestURL location, final CacheStrategy cachePolicy, BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
// load resource
Request request = request(location, true, false);
final StreamResponse streamResponse = this.openInputStream(request, cachePolicy, blacklistType, agent);
final Response response = streamResponse.getResponse();
final DigestURL url = request.url();
if (response == null) throw new IOException("no Response for url " + url);
// if it is still not available, report an error
if (streamResponse.getContentStream() == null || response.getResponseHeader() == null) {
throw new IOException("no Content available for url " + url);
}
// parse resource
try {
Document[] documents = streamResponse.parse();
Document merged = Document.mergeDocuments(location, response.getMimeType(), documents);
String x_robots_tag = response.getResponseHeader().getXRobotsTag();
if (x_robots_tag.indexOf("noindex",0) >= 0) {
merged.setIndexingDenied(true);
}
return merged;
} catch(final Parser.Failure e) {
throw new IOException(e.getMessage());
}
}
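A minimal, hedged call-site example for the new method (the LoaderDispatcher instance 'loader' is assumed; the URL, cache policy, and blacklist are placeholders):

// DigestURL(String) may throw MalformedURLException (an IOException subclass)
final DigestURL location = new DigestURL("http://example.com/page.html");
final Document document = loader.loadDocumentAsStream(location, CacheStrategy.IFFRESH,
        BlacklistType.CRAWLER, ClientIdentification.yacyInternetCrawlerAgent);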
/**
* load all links from a resource
* @param url the url that shall be loaded

View File: ImageViewer.java

@ -44,6 +44,7 @@ import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.retrieval.StreamResponse;
import net.yacy.data.InvalidURLLicenceException;
import net.yacy.data.URLLicense;
import net.yacy.http.servlets.TemplateMissingParameterException;
@ -122,8 +123,9 @@ public class ImageViewer {
String agentName = post.get("agentName", auth ? ClientIdentification.yacyIntranetCrawlerAgentName
: ClientIdentification.yacyInternetCrawlerAgentName);
ClientIdentification.Agent agent = ClientIdentification.getAgent(agentName);
inStream = loader.openInputStream(loader.request(url, false, true), CacheStrategy.IFEXIST,
final StreamResponse response = loader.openInputStream(loader.request(url, false, true), CacheStrategy.IFEXIST,
BlacklistType.SEARCH, agent);
inStream = response.getContentStream();
} catch (final IOException e) {
/** No need to log full stack trace (in most cases resource is not available because of a network error) */
ConcurrentLog.fine("ImageViewer", "cannot load image. URL : " + url.toNormalform(true));
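Since the loader now returns a StreamResponse, the caller owns the stream; a hedged sketch of the matching cleanup (variable names follow the snippet above, not part of the commit):

// ensure the content stream is always released once the image bytes are read
try (final InputStream in = response.getContentStream()) {
    final byte[] imageBytes = FileUtils.read(in);
    // decode and render imageBytes here
}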