mirror of https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00

Improved consistency between loader openInputStream and load functions

This commit is contained in:
parent cbccf97361
commit a9cb083fa1
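
In short, each protocol loader (HTTP, FTP, SMB, file) now exposes an openInputStream method returning the new StreamResponse wrapper, which carries the full response metadata together with an open content stream, instead of a bare InputStream. A minimal caller sketch, assuming an initialized LoaderDispatcher named loader; the URL and the BlacklistType.CRAWLER choice are illustrative assumptions, not part of this commit:

    // Sketch only: stream a resource instead of buffering it fully in memory.
    // "loader" is assumed to be an initialized net.yacy.repository.LoaderDispatcher.
    final DigestURL url = new DigestURL("http://example.org/page.html");
    final StreamResponse streamResponse = loader.openInputStream(
            loader.request(url, true, false), CacheStrategy.IFEXIST,
            BlacklistType.CRAWLER, ClientIdentification.yacyInternetCrawlerAgent);
    try (final InputStream in = streamResponse.getContentStream()) {
        // consume the stream incrementally; metadata stays available
        System.out.println(streamResponse.getResponse().getMimeType());
    }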
@@ -27,6 +27,7 @@

 package net.yacy.crawler.retrieval;

+import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.PrintStream;

@@ -167,6 +168,29 @@ public class FTPLoader {
         return response;
     }

+    /**
+     * Open a stream on the entry content from an FTP server.
+     *
+     * @param request the request to process
+     * @param acceptOnlyParseable when true and no parser can be found to handle the detected MIME type, open a stream on the URL tokens
+     * @return a response with full metadata and an open input stream on the content. Don't forget to close the stream.
+     */
+    public StreamResponse openInputStream(final Request request, final boolean acceptOnlyParseable) throws IOException {
+        final Response response = load(request, acceptOnlyParseable);
+        // TODO implement a true FTP content stream instead of a simple ByteArrayInputStream encapsulation
+        final StreamResponse streamResponse;
+        if (response.getContent() != null) {
+            streamResponse = new StreamResponse(response,
+                    new ByteArrayInputStream(response.getContent()));
+        } else {
+            /* Content can be null when no parser can handle it: then return the URL tokens as content */
+            streamResponse = new StreamResponse(response,
+                    new ByteArrayInputStream(UTF8.getBytes(request.url().toTokens())));
+        }
+        return streamResponse;
+    }
+
     /**
      * @param ftpClient
      */
@@ -24,6 +24,7 @@

 package net.yacy.crawler.retrieval;

+import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.ArrayList;

@@ -58,7 +59,31 @@ public class FileLoader {
         this.maxFileSize = (int) sb.getConfigLong("crawler.file.maxFileSize", -1l);
     }

+    /**
+     * Fully load the requested file into a byte buffer.
+     *
+     * @param request the request to process
+     * @param acceptOnlyParseable when true and no parser can be found to handle the detected MIME type, the response content buffer contains only URL tokens
+     * @return a response with full metadata and the content embedded as a byte buffer
+     */
     public Response load(final Request request, boolean acceptOnlyParseable) throws IOException {
+        StreamResponse streamResponse = openInputStream(request, acceptOnlyParseable);
+
+        /* Fully read the stream and update the response */
+        byte[] content = FileUtils.read(streamResponse.getContentStream());
+        Response response = streamResponse.getResponse();
+        response.setContent(content);
+        return response;
+    }
+
+    /**
+     * Open a stream on the requested file.
+     *
+     * @param request the request to process
+     * @param acceptOnlyParseable when true and no parser can be found to handle the detected MIME type, open a stream on the URL tokens
+     * @return a response with full metadata and an open input stream on the content. Don't forget to close the stream.
+     */
+    public StreamResponse openInputStream(final Request request, final boolean acceptOnlyParseable) throws IOException {
         DigestURL url = request.url();
         if (!url.getProtocol().equals("file")) throw new IOException("wrong protocol for FileLoader: " + url.getProtocol());

@@ -93,9 +118,9 @@ public class FileLoader {
                 responseHeader,
                 profile,
                 false,
-                UTF8.getBytes(content.toString()));
+                null);

-        return response;
+        return new StreamResponse(response, new ByteArrayInputStream(UTF8.getBytes(content.toString())));
     }

     // create response header

@@ -133,13 +158,12 @@ public class FileLoader {
                 responseHeader,
                 profile,
                 false,
-                UTF8.getBytes(url.toTokens()));
-        return response;
+                null);
+        return new StreamResponse(response, new ByteArrayInputStream(UTF8.getBytes(url.toTokens())));
     }

     // load the resource
-    InputStream is = url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent);
-    byte[] b = FileUtils.read(is);
+    final InputStream is = url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent);

     // create response with loaded content
     final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));

@@ -149,7 +173,7 @@ public class FileLoader {
                 responseHeader,
                 profile,
                 false,
-                b);
-        return response;
+                null);
+        return new StreamResponse(response, is);
     }
 }
@@ -28,6 +28,7 @@ import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;

+import org.apache.http.HttpStatus;
 import org.apache.http.StatusLine;

 import net.yacy.cora.document.id.DigestURL;

@@ -83,17 +84,18 @@ public final class HTTPLoader {
     }

     /**
-     * Open input stream on a requested HTTP resource. When resource is small, fully load it and returns a ByteArrayInputStream instance.
+     * Open an input stream on a requested HTTP resource. When the resource content size is small
+     * (lower than {@link Response#CRAWLER_MAX_SIZE_TO_CACHE}), fully load it and use a ByteArrayInputStream instance.
      * @param request
      * @param profile crawl profile
      * @param retryCount remaining redirect retries count
      * @param maxFileSize max file size to load. -1 means no limit.
      * @param blacklistType blacklist type to use
      * @param agent agent identifier
-     * @return an open input stream. Don't forget to close it.
-     * @throws IOException when an error occured
+     * @return a response with full metadata and an open input stream on the content. Don't forget to close the stream.
+     * @throws IOException when an error occurred
      */
-    public InputStream openInputStream(final Request request, CrawlProfile profile, final int retryCount,
+    public StreamResponse openInputStream(final Request request, CrawlProfile profile, final int retryCount,
             final int maxFileSize, final BlacklistType blacklistType, final ClientIdentification.Agent agent)
             throws IOException {
         if (retryCount < 0) {

@@ -200,13 +202,14 @@ public final class HTTPLoader {
                     FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode);
             throw new IOException("REJECTED UNWANTED REDIRECTION '" + statusline
                     + "' for URL '" + requestURLString + "'$");
-        } else if (statusCode == 200 || statusCode == 203) {
+        } else if (statusCode == HttpStatus.SC_OK || statusCode == HttpStatus.SC_NON_AUTHORITATIVE_INFORMATION) {
            // the transfer is ok

            /*
-            * When content is not large (less than 1MB), we had better cache it if cache is enabled and url is not local
+            * When content is not large (less than Response.CRAWLER_MAX_SIZE_TO_CACHE), we had better cache it if cache is enabled and url is not local
             */
            long contentLength = client.getHttpResponse().getEntity().getContentLength();
+           final InputStream contentStream;
            if (profile != null && profile.storeHTCache() && contentLength > 0 && contentLength < (Response.CRAWLER_MAX_SIZE_TO_CACHE) && !url.isLocal()) {
                byte[] content = null;
                try {

@@ -218,14 +221,17 @@ public final class HTTPLoader {
                    client.finish();
                }

-               return new ByteArrayInputStream(content);
-           }
+               contentStream = new ByteArrayInputStream(content);
+           } else {
                /*
-                * Returns a HTTPInputStream delegating to
+                * Create a HTTPInputStream delegating to
                 * client.getContentstream(). Close method will ensure client is
                 * properly closed.
                 */
-               return new HTTPInputStream(client);
+               contentStream = new HTTPInputStream(client);
            }

+           return new StreamResponse(new Response(request, requestHeader, responseHeader, profile, false, null), contentStream);
        } else {
            client.finish();
            // if the response has not the right response type then reject file
@@ -226,10 +226,21 @@ public class Response {
         this.status = newStatus;
     }

+    /**
+     * @return the original request that produced this response
+     */
+    public Request getRequest() {
+        return request;
+    }
+
     public ResponseHeader getResponseHeader() {
         return this.responseHeader;
     }

+    public RequestHeader getRequestHeader() {
+        return this.requestHeader;
+    }
+
     public boolean fromCache() {
         return this.fromCache;
     }
@@ -27,6 +27,7 @@

 package net.yacy.crawler.retrieval;

+import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.net.MalformedURLException;

@@ -69,7 +70,31 @@ public class SMBLoader {
     }

+    /**
+     * Fully load the requested file into a byte buffer.
+     *
+     * @param request the request to process
+     * @param acceptOnlyParseable when true and no parser can be found to handle the detected MIME type, the response content buffer contains only URL tokens
+     * @return a response with full metadata and the content embedded as a byte buffer
+     */
     public Response load(final Request request, boolean acceptOnlyParseable) throws IOException {
+        StreamResponse streamResponse = openInputStream(request, acceptOnlyParseable);
+
+        /* Fully read the stream and update the response */
+        byte[] content = FileUtils.read(streamResponse.getContentStream());
+        Response response = streamResponse.getResponse();
+        response.setContent(content);
+        return response;
+    }
+
+    /**
+     * Open a stream on the requested file.
+     *
+     * @param request the request to process
+     * @param acceptOnlyParseable when true, do not open a stream on content when no parser can be found to handle the detected MIME type
+     * @return a response with full metadata and an open input stream on the content. Don't forget to close the stream.
+     */
+    public StreamResponse openInputStream(final Request request, final boolean acceptOnlyParseable) throws IOException {
         DigestURL url = request.url();
         if (!url.getProtocol().equals("smb")) throw new IOException("wrong loader for SMBLoader: " + url.getProtocol());

@@ -111,9 +136,9 @@ public class SMBLoader {
                 responseHeader,
                 profile,
                 false,
-                UTF8.getBytes(content.toString()));
+                null);

-        return response;
+        return new StreamResponse(response, new ByteArrayInputStream(UTF8.getBytes(content.toString())));
     }

     // create response header

@@ -151,13 +176,12 @@ public class SMBLoader {
                 responseHeader,
                 profile,
                 false,
-                url.toTokens().getBytes());
-        return response;
+                null);
+        return new StreamResponse(response, new ByteArrayInputStream(url.toTokens().getBytes()));
     }

     // load the resource
     InputStream is = url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent);
-    byte[] b = FileUtils.read(is);

     // create response with loaded content
     final CrawlProfile profile = this.sb.crawler.get(request.profileHandle().getBytes());

@@ -167,8 +191,8 @@ public class SMBLoader {
                 responseHeader,
                 profile,
                 false,
-                b);
-        return response;
+                null);
+        return new StreamResponse(response, is);
     }

     public static void main(String[] args) {
source/net/yacy/crawler/retrieval/StreamResponse.java (new file, 120 lines)

@@ -0,0 +1,120 @@
// StreamResponse.java
// ---------------------------
// Copyright 2017 by luccioman; https://github.com/luccioman
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

package net.yacy.crawler.retrieval;

import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;

import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;

/**
 * A crawler load response, holding content as a stream.
 */
public class StreamResponse {

    /** Logger */
    private final static ConcurrentLog log = new ConcurrentLog(StreamResponse.class.getSimpleName());

    /** Content as a stream. */
    private InputStream contentStream;

    /** The response details, including notably the request and response headers. */
    private Response response;

    /**
     * @param response contains the complete crawler response details
     * @param contentStream an open input stream on the response content
     * @throws IllegalArgumentException when response is null
     */
    public StreamResponse(final Response response, final InputStream contentStream) {
        if (response == null) {
            throw new IllegalArgumentException("response parameter must not be null");
        }
        this.response = response;
        this.contentStream = contentStream;
    }

    /**
     * @return the content stream. Don't forget to close it when processing is terminated.
     */
    public InputStream getContentStream() {
        return this.contentStream;
    }

    /**
     * @return the crawler response with complete details
     */
    public Response getResponse() {
        return this.response;
    }

    /**
     * Parse and close the content stream and return the parsed documents when possible.
     *
     * @return the parsed documents or null when an error occurred
     * @throws Parser.Failure when no parser supports the content
     */
    public Document[] parse() throws Parser.Failure {
        final String supportError = TextParser.supports(this.response.url(),
                this.response.getResponseHeader() == null ? null : this.response.getResponseHeader().getContentType());
        if (supportError != null) {
            throw new Parser.Failure("no parser support:" + supportError, this.response.url());
        }
        try {
            return TextParser.parseSource(this.response.url(),
                    this.response.getResponseHeader() == null ? null
                            : this.response.getResponseHeader().getContentType(),
                    this.response.getResponseHeader() == null ? StandardCharsets.UTF_8.name()
                            : this.response.getResponseHeader().getCharacterEncoding(),
                    new VocabularyScraper(), this.response.getRequest().timezoneOffset(),
                    this.response.getRequest().depth(), this.response.size(), this.contentStream);
        } catch (final Exception e) {
            return null;
        } finally {
            if (this.contentStream != null) {
                try {
                    this.contentStream.close();
                } catch (IOException ignored) {
                    log.warn("Could not close content stream on url " + this.response.url());
                }
            }
        }
    }
}
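
A short usage sketch for the class above (hypothetical caller; streamResponse would be obtained from one of the loaders changed in this commit):

    // Sketch only: parse a streamed resource; parse() also closes the stream.
    try {
        final Document[] docs = streamResponse.parse();
        if (docs == null) {
            // an error occurred while parsing
        }
    } catch (final Parser.Failure e) {
        // no parser supports the detected MIME type
    }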
@@ -29,7 +29,6 @@ package net.yacy.repository;
 import java.io.ByteArrayInputStream;
 import java.io.File;
 import java.io.IOException;
-import java.io.InputStream;
 import java.net.MalformedURLException;
 import java.util.Arrays;
 import java.util.Date;

@@ -59,6 +58,7 @@ import net.yacy.crawler.retrieval.HTTPLoader;
 import net.yacy.crawler.retrieval.Request;
 import net.yacy.crawler.retrieval.Response;
 import net.yacy.crawler.retrieval.SMBLoader;
+import net.yacy.crawler.retrieval.StreamResponse;
 import net.yacy.document.Document;
 import net.yacy.document.Parser;
 import net.yacy.document.TextParser;

@@ -347,7 +347,7 @@ public final class LoaderDispatcher {
      * @return an open ImageInputStream. Don't forget to close it once used!
      * @throws IOException when url is malformed, blacklisted, or CacheStrategy is CACHEONLY and content is unavailable
      */
-    private InputStream openInputStreamInternal(final Request request, CacheStrategy cacheStrategy, final int maxFileSize, final BlacklistType blacklistType, ClientIdentification.Agent agent) throws IOException {
+    private StreamResponse openInputStreamInternal(final Request request, CacheStrategy cacheStrategy, final int maxFileSize, final BlacklistType blacklistType, ClientIdentification.Agent agent) throws IOException {
         // get the protocol of the next URL
         final DigestURL url = request.url();
         if (url.isFile() || url.isSMB()) {

@@ -367,7 +367,7 @@ public final class LoaderDispatcher {
         // check if we have the page in the cache
         Response cachedResponse = loadFromCache(request, cacheStrategy, agent, url, crawlProfile);
         if (cachedResponse != null) {
-            return new ByteArrayInputStream(cachedResponse.getContent());
+            return new StreamResponse(cachedResponse, new ByteArrayInputStream(cachedResponse.getContent()));
         }

         // check case where we want results from the cache exclusively, and never from the Internet (offline mode)

@@ -389,20 +389,20 @@ public final class LoaderDispatcher {
         }

         // load resource from the internet
-        InputStream inStream = null;
+        StreamResponse response;
         if (protocol.equals("http") || protocol.equals("https")) {
-            inStream = this.httpLoader.openInputStream(request, crawlProfile, 1, maxFileSize, blacklistType, agent);
-        } else if (protocol.equals("ftp") || protocol.equals("smb") || protocol.equals("file")) {
-            // may also open directly stream with ftp loader
-            inStream = url.getInputStream(agent);
+            response = this.httpLoader.openInputStream(request, crawlProfile, 1, maxFileSize, blacklistType, agent);
+        } else if (protocol.equals("ftp")) {
+            response = this.ftpLoader.openInputStream(request, true);
+        } else if (protocol.equals("smb")) {
+            response = this.smbLoader.openInputStream(request, true);
+        } else if (protocol.equals("file")) {
+            response = this.fileLoader.openInputStream(request, true);
         } else {
             throw new IOException("Unsupported protocol '" + protocol + "' in url " + url);
         }
-        if (inStream == null) {
-            throw new IOException("Unable to open content stream");
-        }

-        return inStream;
+        return response;
     }

@@ -464,18 +464,18 @@ public final class LoaderDispatcher {
     }

     /**
-     * Open url as InputStream from the web or the cache
+     * Open the URL as an InputStream from the web or the cache
      * @param request must be not null
      * @param cacheStrategy cache strategy to use
      * @param blacklistType black list
      * @param agent agent identification for HTTP requests
-     * @return an open InputStream on content. Don't forget to close it once used.
+     * @return a response with full metadata and an open input stream on the content. Don't forget to close the stream.
      * @throws IOException when url is malformed or blacklisted
      */
-    public InputStream openInputStream(final Request request, final CacheStrategy cacheStrategy,
+    public StreamResponse openInputStream(final Request request, final CacheStrategy cacheStrategy,
             BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
         final int maxFileSize = protocolMaxFileSize(request.url());
-        InputStream stream = null;
+        StreamResponse response;

         Semaphore check = this.loaderSteering.get(request.url());
         if (check != null && cacheStrategy != CacheStrategy.NOCACHE) {

@@ -493,9 +493,9 @@ public final class LoaderDispatcher {

         this.loaderSteering.put(request.url(), new Semaphore(0));
         try {
-            stream = openInputStreamInternal(request, cacheStrategy, maxFileSize, blacklistType, agent);
+            response = openInputStreamInternal(request, cacheStrategy, maxFileSize, blacklistType, agent);
         } catch(IOException ioe) {
-            /* Do not re encapsulate eventual IOException in an IOException */
+            /* Do not re-encapsulate any eventual IOException in an IOException */
            throw ioe;
         } catch (final Throwable e) {
             throw new IOException(e);

@@ -507,7 +507,7 @@ public final class LoaderDispatcher {
             }
         }

-        return stream;
+        return response;
     }

     public Document[] loadDocuments(final Request request, final CacheStrategy cacheStrategy, final int maxFileSize, BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException, Parser.Failure {

@@ -555,6 +555,44 @@ public final class LoaderDispatcher {
         }
     }

+    /**
+     * Similar to the loadDocument method, but streams the resource content when possible instead of fully loading it in memory.
+     * @param location URL of the resource to load
+     * @param cachePolicy cache policy strategy
+     * @param blacklistType blacklist to use
+     * @param agent user agent identifier
+     * @return a parsed document or null when an error occurred while parsing
+     * @throws IOException when the content can not be fetched or no parser supports it
+     */
+    public Document loadDocumentAsStream(final DigestURL location, final CacheStrategy cachePolicy, BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
+        // load resource
+        Request request = request(location, true, false);
+        final StreamResponse streamResponse = this.openInputStream(request, cachePolicy, blacklistType, agent);
+        final Response response = streamResponse.getResponse();
+        final DigestURL url = request.url();
+        if (response == null) throw new IOException("no Response for url " + url);
+
+        // if it is still not available, report an error
+        if (streamResponse.getContentStream() == null || response.getResponseHeader() == null) {
+            throw new IOException("no Content available for url " + url);
+        }
+
+        // parse resource
+        try {
+            Document[] documents = streamResponse.parse();
+            Document merged = Document.mergeDocuments(location, response.getMimeType(), documents);
+
+            String x_robots_tag = response.getResponseHeader().getXRobotsTag();
+            if (x_robots_tag.indexOf("noindex",0) >= 0) {
+                merged.setIndexingDenied(true);
+            }
+
+            return merged;
+        } catch(final Parser.Failure e) {
+            throw new IOException(e.getMessage());
+        }
+    }
+
     /**
      * load all links from a resource
      * @param url the url that shall be loaded
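
A possible call to the new loadDocumentAsStream method (sketch only; the dispatcher instance, URL, and blacklist choice are assumptions):

    // Sketch only: fetch and parse a document while streaming its content.
    final Document doc = loader.loadDocumentAsStream(
            new DigestURL("http://example.org/doc.pdf"), CacheStrategy.IFEXIST,
            BlacklistType.CRAWLER, ClientIdentification.yacyInternetCrawlerAgent);
    if (doc != null) {
        System.out.println(doc.dc_title()); // title per YaCy's Document API
    }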
@@ -44,6 +44,7 @@ import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.Domains;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.util.ConcurrentLog;
+import net.yacy.crawler.retrieval.StreamResponse;
 import net.yacy.data.InvalidURLLicenceException;
 import net.yacy.data.URLLicense;
 import net.yacy.http.servlets.TemplateMissingParameterException;

@@ -122,8 +123,9 @@ public class ImageViewer {
             String agentName = post.get("agentName", auth ? ClientIdentification.yacyIntranetCrawlerAgentName
                     : ClientIdentification.yacyInternetCrawlerAgentName);
             ClientIdentification.Agent agent = ClientIdentification.getAgent(agentName);
-            inStream = loader.openInputStream(loader.request(url, false, true), CacheStrategy.IFEXIST,
+            final StreamResponse response = loader.openInputStream(loader.request(url, false, true), CacheStrategy.IFEXIST,
                     BlacklistType.SEARCH, agent);
+            inStream = response.getContentStream();
         } catch (final IOException e) {
             /** No need to log full stack trace (in most cases resource is not available because of a network error) */
             ConcurrentLog.fine("ImageViewer", "cannot load image. URL : " + url.toNormalform(true));