Improved consistency between loader openInputStream and load functions

luccioman 2017-06-02 01:46:06 +02:00
parent cbccf97361
commit a9cb083fa1
8 changed files with 300 additions and 51 deletions

View File: FTPLoader.java

@ -27,6 +27,7 @@
package net.yacy.crawler.retrieval;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.PrintStream;
@ -167,6 +168,29 @@ public class FTPLoader {
return response;
}
/**
* Open a stream on the entry content from an FTP server
*
* @param request the request to process
* @param acceptOnlyParseable when true and no parser can be found to handle the detected MIME type, open a stream on the URL tokens
* @return a response with full metadata, embedding an open input stream on the content. Don't forget to close the stream.
*/
public StreamResponse openInputStream(final Request request, final boolean acceptOnlyParseable) throws IOException {
final Response response = load(request, acceptOnlyParseable);
// TODO implement a true ftp content stream instead of a simple ByteArrayInputStream encapsulation
final StreamResponse streamResponse;
if(response.getContent() != null) {
streamResponse = new StreamResponse(response,
new ByteArrayInputStream(response.getContent()));
} else {
/* content can be null when no parser can handle it: then return the URL tokens as content */
streamResponse = new StreamResponse(response,
new ByteArrayInputStream(UTF8.getBytes(request.url().toTokens())));
}
return streamResponse;
}
/**
* @param ftpClient
*/
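For illustration, a hedged call-site sketch of the new FTP entry point (not part of the commit; the FTPLoader instance 'ftpLoader' and the crawler Request 'request' are assumed to exist):

// Sketch: typical consumption of the new StreamResponse
final StreamResponse streamResponse = ftpLoader.openInputStream(request, true);
try (final InputStream content = streamResponse.getContentStream()) {
    // meta data is available on the Response even before the stream is consumed
    final Response response = streamResponse.getResponse();
    System.out.println("MIME type: " + response.getMimeType());
}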

View File: FileLoader.java

@ -24,6 +24,7 @@
package net.yacy.crawler.retrieval;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
@ -58,7 +59,31 @@ public class FileLoader {
this.maxFileSize = (int) sb.getConfigLong("crawler.file.maxFileSize", -1l);
}
/**
* Fully load the requested file into a byte buffer
*
* @param request the request to process
* @param acceptOnlyParseable when true and no parser can be found to handle the detected MIME type, the response content buffer contains only the URL tokens
* @return a response with full metadata, embedding the content in a byte buffer
*/
public Response load(final Request request, boolean acceptOnlyParseable) throws IOException {
StreamResponse streamResponse = openInputStream(request, acceptOnlyParseable);
/* Fully read the stream and update the response */
byte[] content = FileUtils.read(streamResponse.getContentStream());
Response response = streamResponse.getResponse();
response.setContent(content);
return response;
}
/**
* Open a stream on the requested file
*
* @param request the request to process
* @param acceptOnlyParseable when true and no parser can be found to handle the detected MIME type, open a stream on the URL tokens
* @return a response with full metadata, embedding an open input stream on the content. Don't forget to close the stream.
*/
public StreamResponse openInputStream(final Request request, final boolean acceptOnlyParseable) throws IOException {
DigestURL url = request.url();
if (!url.getProtocol().equals("file")) throw new IOException("wrong protocol for FileLoader: " + url.getProtocol());
@ -93,9 +118,9 @@ public class FileLoader {
responseHeader,
profile,
false,
UTF8.getBytes(content.toString()));
null);
return response;
return new StreamResponse(response, new ByteArrayInputStream(UTF8.getBytes(content.toString())));
}
// create response header
@ -133,13 +158,12 @@ public class FileLoader {
responseHeader,
profile,
false,
UTF8.getBytes(url.toTokens()));
return response;
null);
return new StreamResponse(response, new ByteArrayInputStream(UTF8.getBytes(url.toTokens())));
}
// load the resource
InputStream is = url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent);
byte[] b = FileUtils.read(is);
final InputStream is = url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent);
// create response with loaded content
final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
@ -149,7 +173,7 @@ public class FileLoader {
responseHeader,
profile,
false,
b);
return response;
null);
return new StreamResponse(response, is);
}
}
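The two FileLoader entry points now share one implementation; a hedged sketch of how a caller might pick between them (variable names assumed, not part of the commit):

// small resources: load() buffers the whole content in memory
final Response buffered = fileLoader.load(request, true);
final byte[] bytes = buffered.getContent();

// large resources: openInputStream() leaves reading to the caller
final StreamResponse streamed = fileLoader.openInputStream(request, true);
try (final InputStream in = streamed.getContentStream()) {
    // process the stream incrementally here
}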

View File: HTTPLoader.java

@ -28,6 +28,7 @@ import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import org.apache.http.HttpStatus;
import org.apache.http.StatusLine;
import net.yacy.cora.document.id.DigestURL;
@ -83,17 +84,18 @@ public final class HTTPLoader {
}
/**
* Open input stream on a requested HTTP resource. When resource is small, fully load it and returns a ByteArrayInputStream instance.
* Open an input stream on a requested HTTP resource. When the resource content size is small
* (lower than {@link Response#CRAWLER_MAX_SIZE_TO_CACHE}), fully load it and use a ByteArrayInputStream instance.
* @param request
* @param profile crawl profile
* @param retryCount remaining number of redirect retries
* @param maxFileSize max file size to load. -1 means no limit.
* @param blacklistType blacklist type to use
* @param agent agent identifier
* @return an open input stream. Don't forget to close it.
* @throws IOException when an error occured
* @return a response with full metadata, embedding an open input stream on the content. Don't forget to close the stream.
* @throws IOException when an error occurred
*/
public InputStream openInputStream(final Request request, CrawlProfile profile, final int retryCount,
public StreamResponse openInputStream(final Request request, CrawlProfile profile, final int retryCount,
final int maxFileSize, final BlacklistType blacklistType, final ClientIdentification.Agent agent)
throws IOException {
if (retryCount < 0) {
@ -200,13 +202,14 @@ public final class HTTPLoader {
FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode);
throw new IOException("REJECTED UNWANTED REDIRECTION '" + statusline
+ "' for URL '" + requestURLString + "'$");
} else if (statusCode == 200 || statusCode == 203) {
} else if (statusCode == HttpStatus.SC_OK || statusCode == HttpStatus.SC_NON_AUTHORITATIVE_INFORMATION) {
// the transfer is ok
/*
* When content is not large (less than 1MB), we have better cache it if cache is enabled and url is not local
* When the content is not large (less than Response.CRAWLER_MAX_SIZE_TO_CACHE), we had better cache it if the cache is enabled and the url is not local
*/
long contentLength = client.getHttpResponse().getEntity().getContentLength();
final InputStream contentStream;
if (profile != null && profile.storeHTCache() && contentLength > 0 && contentLength < (Response.CRAWLER_MAX_SIZE_TO_CACHE) && !url.isLocal()) {
byte[] content = null;
try {
@ -218,14 +221,17 @@ public final class HTTPLoader {
client.finish();
}
return new ByteArrayInputStream(content);
}
contentStream = new ByteArrayInputStream(content);
} else {
/*
* Returns a HTTPInputStream delegating to
* Create an HTTPInputStream delegating to
* client.getContentstream(). Close method will ensure client is
* properly closed.
*/
return new HTTPInputStream(client);
contentStream = new HTTPInputStream(client);
}
return new StreamResponse(new Response(request, requestHeader, responseHeader, profile, false, null), contentStream);
} else {
client.finish();
// if the response has not the right response type then reject file
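Whichever branch is taken, the caller sees a single closing contract. A hypothetical drain-and-close helper built only on the new API (a sketch, not part of the commit):

// Reads the whole content and closes the stream. Behaves the same whether the
// StreamResponse wraps a ByteArrayInputStream (small, cacheable response) or an
// HTTPInputStream still bound to the HTTP client (large response): one close()
// releases all underlying resources.
static byte[] readAll(final StreamResponse streamResponse) throws IOException {
    try (final InputStream in = streamResponse.getContentStream()) {
        final ByteArrayOutputStream out = new ByteArrayOutputStream();
        final byte[] buffer = new byte[8192];
        int n;
        while ((n = in.read(buffer)) != -1) {
            out.write(buffer, 0, n);
        }
        return out.toByteArray();
    }
}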

View File: Response.java

@ -226,10 +226,21 @@ public class Response {
this.status = newStatus;
}
/**
* @return the original request that produced this response
*/
public Request getRequest() {
return request;
}
public ResponseHeader getResponseHeader() {
return this.responseHeader;
}
public RequestHeader getRequestHeader() {
return this.requestHeader;
}
public boolean fromCache() {
return this.fromCache;
}

View File: SMBLoader.java

@ -27,6 +27,7 @@
package net.yacy.crawler.retrieval;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
@ -69,7 +70,31 @@ public class SMBLoader {
}
/**
* Fully load the requested file into a byte buffer
*
* @param request the request to process
* @param acceptOnlyParseable when true and no parser can be found to handle the detected MIME type, the response content buffer contains only the URL tokens
* @return a response with full metadata, embedding the content in a byte buffer
*/
public Response load(final Request request, boolean acceptOnlyParseable) throws IOException {
StreamResponse streamResponse = openInputStream(request, acceptOnlyParseable);
/* Fully read the stream and update the response */
byte[] content = FileUtils.read(streamResponse.getContentStream());
Response response = streamResponse.getResponse();
response.setContent(content);
return response;
}
/**
* Open a stream on the requested file
*
* @param request the request to process
* @param acceptOnlyParseable when true, do not open a stream on content when no parser can be found to handle the detected MIME type
* @return a response with full metadata, embedding an open input stream on the content. Don't forget to close the stream.
*/
public StreamResponse openInputStream(final Request request, final boolean acceptOnlyParseable) throws IOException {
DigestURL url = request.url();
if (!url.getProtocol().equals("smb")) throw new IOException("wrong loader for SMBLoader: " + url.getProtocol());
@ -111,9 +136,9 @@ public class SMBLoader {
responseHeader,
profile,
false,
UTF8.getBytes(content.toString()));
null);
return response;
return new StreamResponse(response, new ByteArrayInputStream(UTF8.getBytes(content.toString())));
}
// create response header
@ -151,13 +176,12 @@ public class SMBLoader {
responseHeader,
profile,
false,
url.toTokens().getBytes());
return response;
null);
return new StreamResponse(response, new ByteArrayInputStream(url.toTokens().getBytes()));
}
// load the resource
InputStream is = url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent);
byte[] b = FileUtils.read(is);
// create response with loaded content
final CrawlProfile profile = this.sb.crawler.get(request.profileHandle().getBytes());
@ -167,8 +191,8 @@ public class SMBLoader {
responseHeader,
profile,
false,
b);
return response;
null);
return new StreamResponse(response, is);
}
public static void main(String[] args) {

View File: StreamResponse.java

@ -0,0 +1,120 @@
// StreamResponse.java
// ---------------------------
// Copyright 2017 by luccioman; https://github.com/luccioman
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.crawler.retrieval;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
/**
* A crawler load response, holding content as a stream.
*/
public class StreamResponse {
/** Logger */
private final static ConcurrentLog log = new ConcurrentLog(StreamResponse.class.getSimpleName());
/**
* Content as a stream.
*/
private InputStream contentStream;
/**
* The response details, including notably the request and response headers.
*/
private Response response;
/**
* @param response
* contains the complete crawler response details
* @param contentStream
* an open input stream on the response content
* @throws IllegalArgumentException
* when response is null
*/
public StreamResponse(final Response response, final InputStream contentStream) {
if (response == null) {
throw new IllegalArgumentException("response parameter must not be null");
}
this.response = response;
this.contentStream = contentStream;
}
/**
* @return the content stream. Don't forget to close it when processing is
* terminated.
*/
public InputStream getContentStream() {
return this.contentStream;
}
/**
* @return the crawler response with complete details
*/
public Response getResponse() {
return this.response;
}
/**
* Parse and close the content stream and return the parsed documents when
* possible
*
* @return the parsed documents or null when an error occurred
* @throws Parser.Failure
* when no parser supports the content
*/
public Document[] parse() throws Parser.Failure {
final String supportError = TextParser.supports(this.response.url(),
this.response.getResponseHeader() == null ? null : this.response.getResponseHeader().getContentType());
if (supportError != null) {
throw new Parser.Failure("no parser support:" + supportError, this.response.url());
}
try {
return TextParser.parseSource(this.response.url(),
this.response.getResponseHeader() == null ? null
: this.response.getResponseHeader().getContentType(),
this.response.getResponseHeader() == null ? StandardCharsets.UTF_8.name()
: this.response.getResponseHeader().getCharacterEncoding(),
new VocabularyScraper(), this.response.getRequest().timezoneOffset(),
this.response.getRequest().depth(), this.response.size(), this.contentStream);
} catch (final Exception e) {
return null;
} finally {
if (this.contentStream != null) {
try {
this.contentStream.close();
} catch (IOException ignored) {
log.warn("Could not close content stream on url " + this.response.url());
}
}
}
}
}
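A hedged usage sketch for the class above (the loader call site is assumed; note that parse() consumes and closes the content stream itself, so no explicit close is needed on this path):

final StreamResponse streamResponse = loader.openInputStream(request, cacheStrategy, blacklistType, agent);
try {
    final Document[] documents = streamResponse.parse();
    if (documents == null) {
        // parsing failed for a reason other than missing parser support
    }
} catch (final Parser.Failure e) {
    // no parser supports the detected MIME type
}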

View File: LoaderDispatcher.java

@ -29,7 +29,6 @@ package net.yacy.repository;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.util.Arrays;
import java.util.Date;
@ -59,6 +58,7 @@ import net.yacy.crawler.retrieval.HTTPLoader;
import net.yacy.crawler.retrieval.Request;
import net.yacy.crawler.retrieval.Response;
import net.yacy.crawler.retrieval.SMBLoader;
import net.yacy.crawler.retrieval.StreamResponse;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
@ -347,7 +347,7 @@ public final class LoaderDispatcher {
* @return a StreamResponse embedding an open input stream on the content. Don't forget to close the stream once used!
* @throws IOException when url is malformed, blacklisted, or CacheStrategy is CACHEONLY and content is unavailable
*/
private InputStream openInputStreamInternal(final Request request, CacheStrategy cacheStrategy, final int maxFileSize, final BlacklistType blacklistType, ClientIdentification.Agent agent) throws IOException {
private StreamResponse openInputStreamInternal(final Request request, CacheStrategy cacheStrategy, final int maxFileSize, final BlacklistType blacklistType, ClientIdentification.Agent agent) throws IOException {
// get the protocol of the next URL
final DigestURL url = request.url();
if (url.isFile() || url.isSMB()) {
@ -367,7 +367,7 @@ public final class LoaderDispatcher {
// check if we have the page in the cache
Response cachedResponse = loadFromCache(request, cacheStrategy, agent, url, crawlProfile);
if (cachedResponse != null) {
return new ByteArrayInputStream(cachedResponse.getContent());
return new StreamResponse(cachedResponse, new ByteArrayInputStream(cachedResponse.getContent()));
}
// check case where we want results from the cache exclusively, and never from the Internet (offline mode)
@ -389,20 +389,20 @@ public final class LoaderDispatcher {
}
// load resource from the internet
InputStream inStream = null;
StreamResponse response;
if (protocol.equals("http") || protocol.equals("https")) {
inStream = this.httpLoader.openInputStream(request, crawlProfile, 1, maxFileSize, blacklistType, agent);
} else if (protocol.equals("ftp") || protocol.equals("smb") || protocol.equals("file")) {
// may also open directly stream with ftp loader
inStream = url.getInputStream(agent);
response = this.httpLoader.openInputStream(request, crawlProfile, 1, maxFileSize, blacklistType, agent);
} else if (protocol.equals("ftp")) {
response = this.ftpLoader.openInputStream(request, true);
} else if (protocol.equals("smb")) {
response = this.smbLoader.openInputStream(request, true);
} else if (protocol.equals("file")) {
response = this.fileLoader.openInputStream(request, true);
} else {
throw new IOException("Unsupported protocol '" + protocol + "' in url " + url);
}
if (inStream == null) {
throw new IOException("Unable to open content stream");
}
return inStream;
return response;
}
@ -464,18 +464,18 @@ public final class LoaderDispatcher {
}
/**
* Open url as InputStream from the web or the cache
* Open the URL as an InputStream from the web or the cache
* @param request must not be null
* @param cacheStrategy cache strategy to use
* @param blacklistType blacklist type to use
* @param agent agent identification for HTTP requests
* @return an open InputStream on content. Don't forget to close it once used.
* @return a response with full metadata, embedding an open input stream on the content. Don't forget to close the stream.
* @throws IOException when url is malformed or blacklisted
*/
public InputStream openInputStream(final Request request, final CacheStrategy cacheStrategy,
public StreamResponse openInputStream(final Request request, final CacheStrategy cacheStrategy,
BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
final int maxFileSize = protocolMaxFileSize(request.url());
InputStream stream = null;
StreamResponse response;
Semaphore check = this.loaderSteering.get(request.url());
if (check != null && cacheStrategy != CacheStrategy.NOCACHE) {
@ -493,9 +493,9 @@ public final class LoaderDispatcher {
this.loaderSteering.put(request.url(), new Semaphore(0));
try {
stream = openInputStreamInternal(request, cacheStrategy, maxFileSize, blacklistType, agent);
response = openInputStreamInternal(request, cacheStrategy, maxFileSize, blacklistType, agent);
} catch(IOException ioe) {
/* Do not re encapsulate eventual IOException in an IOException */
/* Do not re-encapsulate a possible IOException in another IOException */
throw ioe;
} catch (final Throwable e) {
throw new IOException(e);
@ -507,7 +507,7 @@ public final class LoaderDispatcher {
}
}
return stream;
return response;
}
public Document[] loadDocuments(final Request request, final CacheStrategy cacheStrategy, final int maxFileSize, BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException, Parser.Failure {
@ -555,6 +555,44 @@ public final class LoaderDispatcher {
}
}
/**
* Similar to the loadDocument method, but streams the resource content when possible instead of fully loading it into memory.
* @param location URL of the resource to load
* @param cachePolicy cache policy strategy
* @param blacklistType blacklist to use
* @param agent user agent identifier
* @return the parsed document or null when an error occurred while parsing
* @throws IOException when the content cannot be fetched or no parser supports it
*/
public Document loadDocumentAsStream(final DigestURL location, final CacheStrategy cachePolicy, BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
// load resource
Request request = request(location, true, false);
final StreamResponse streamResponse = this.openInputStream(request, cachePolicy, blacklistType, agent);
final Response response = streamResponse.getResponse();
final DigestURL url = request.url();
if (response == null) throw new IOException("no Response for url " + url);
// if it is still not available, report an error
if (streamResponse.getContentStream() == null || response.getResponseHeader() == null) {
throw new IOException("no Content available for url " + url);
}
// parse resource
try {
Document[] documents = streamResponse.parse();
Document merged = Document.mergeDocuments(location, response.getMimeType(), documents);
String x_robots_tag = response.getResponseHeader().getXRobotsTag();
if (x_robots_tag.indexOf("noindex",0) >= 0) {
merged.setIndexingDenied(true);
}
return merged;
} catch(final Parser.Failure e) {
throw new IOException(e.getMessage());
}
}
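A minimal, hedged call-site example for the new method (the LoaderDispatcher instance 'loader' is assumed; the URL, cache policy, and blacklist are placeholders):

// DigestURL(String) may throw MalformedURLException (an IOException subclass)
final DigestURL location = new DigestURL("http://example.com/page.html");
final Document document = loader.loadDocumentAsStream(location, CacheStrategy.IFFRESH,
        BlacklistType.CRAWLER, ClientIdentification.yacyInternetCrawlerAgent);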
/**
* load all links from a resource
* @param url the url that shall be loaded

View File: ImageViewer.java

@ -44,6 +44,7 @@ import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.retrieval.StreamResponse;
import net.yacy.data.InvalidURLLicenceException;
import net.yacy.data.URLLicense;
import net.yacy.http.servlets.TemplateMissingParameterException;
@ -122,8 +123,9 @@ public class ImageViewer {
String agentName = post.get("agentName", auth ? ClientIdentification.yacyIntranetCrawlerAgentName
: ClientIdentification.yacyInternetCrawlerAgentName);
ClientIdentification.Agent agent = ClientIdentification.getAgent(agentName);
inStream = loader.openInputStream(loader.request(url, false, true), CacheStrategy.IFEXIST,
final StreamResponse response = loader.openInputStream(loader.request(url, false, true), CacheStrategy.IFEXIST,
BlacklistType.SEARCH, agent);
inStream = response.getContentStream();
} catch (final IOException e) {
/** No need to log full stack trace (in most cases resource is not available because of a network error) */
ConcurrentLog.fine("ImageViewer", "cannot load image. URL : " + url.toNormalform(true));
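Since the loader now returns a StreamResponse, the caller owns the stream; a hedged sketch of the matching cleanup (variable names follow the snippet above, not part of the commit):

// ensure the content stream is always released once the image bytes are read
try (final InputStream in = response.getContentStream()) {
    final byte[] imageBytes = FileUtils.read(in);
    // decode and render imageBytes here
}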