mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
Support parsing gzip files from servers with redundant headers.
Some web servers provide both 'Content-Encoding: "gzip"' and 'Content-Type: "application/x-gzip"' HTTP headers on their ".gz" files. It was annoying that parsing failed on such resources, which are not so uncommon even though they are non-conforming (see RFC 7231 section 3.1.2.2 for the "Content-Encoding" header specification: https://tools.ietf.org/html/rfc7231#section-3.1.2.2).
This commit is contained in:
parent
11a7f923d4
commit
5a646540cc
|
@ -30,7 +30,6 @@ import net.yacy.cora.util.ConcurrentLog;
|
|||
import net.yacy.document.Document;
|
||||
import net.yacy.document.Parser;
|
||||
import net.yacy.document.TextParser;
|
||||
import net.yacy.document.VocabularyScraper;
|
||||
|
||||
/**
|
||||
* A crawler load response, holding content as a stream.
|
||||
|
@ -90,31 +89,7 @@ public class StreamResponse {
|
|||
* when no parser support the content
|
||||
*/
|
||||
public Document[] parse() throws Parser.Failure {
|
||||
final String supportError = TextParser.supports(this.response.url(),
|
||||
this.response.getResponseHeader() == null ? null : this.response.getResponseHeader().getContentType());
|
||||
if (supportError != null) {
|
||||
throw new Parser.Failure("no parser support:" + supportError, this.response.url());
|
||||
}
|
||||
try {
|
||||
return TextParser.parseSource(this.response.url(),
|
||||
this.response.getResponseHeader() == null ? null
|
||||
: this.response.getResponseHeader().getContentType(),
|
||||
this.response.getResponseHeader() == null ? StandardCharsets.UTF_8.name()
|
||||
: this.response.getResponseHeader().getCharacterEncoding(),
|
||||
new VocabularyScraper(), this.response.getRequest().timezoneOffset(),
|
||||
this.response.getRequest().depth(), this.response.size(), this.contentStream);
|
||||
} catch (final Exception e) {
|
||||
return null;
|
||||
} finally {
|
||||
if (this.contentStream != null) {
|
||||
try {
|
||||
this.contentStream.close();
|
||||
} catch (IOException ignored) {
|
||||
log.warn("Could not close content stream on url " + this.response.url());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return parseWithLimits(Integer.MAX_VALUE, Long.MAX_VALUE);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -151,9 +126,11 @@ public class StreamResponse {
|
|||
: this.response.getResponseHeader().getCharacterEncoding();
|
||||
|
||||
return TextParser.parseWithLimits(this.response.url(), mimeType, charsetName,
|
||||
this.response.getRequest().timezoneOffset(), this.response.size(), this.contentStream, maxLinks,
|
||||
maxBytes);
|
||||
} catch (final Exception e) {
|
||||
this.response.getRequest().timezoneOffset(), this.response.getRequest().depth(),
|
||||
this.response.size(), this.contentStream, maxLinks, maxBytes);
|
||||
} catch(Parser.Failure e) {
|
||||
throw e;
|
||||
}catch (final Exception e) {
|
||||
return null;
|
||||
} finally {
|
||||
if (this.contentStream != null) {
|
||||
|
|
|
@ -49,6 +49,7 @@ import net.yacy.document.parser.csvParser;
|
|||
import net.yacy.document.parser.docParser;
|
||||
import net.yacy.document.parser.genericParser;
|
||||
import net.yacy.document.parser.gzipParser;
|
||||
import net.yacy.document.parser.gzipParser.GZIPOpeningStreamException;
|
||||
import net.yacy.document.parser.htmlParser;
|
||||
import net.yacy.document.parser.linkScraperParser;
|
||||
import net.yacy.document.parser.mmParser;
|
||||
|
@ -296,6 +297,35 @@ public final class TextParser {
|
|||
/* Try to reset the marked stream. If the failed parser has consumed too many bytes :
|
||||
* too bad, the marks is invalid and process fails now with an IOException */
|
||||
bufferedStream.reset();
|
||||
|
||||
if(parser instanceof gzipParser && e.getCause() instanceof GZIPOpeningStreamException
|
||||
&& (idioms.size() == 1 || (idioms.size() == 2 && idioms.contains(genericIdiom)))) {
|
||||
/* The gzip parser failed directly when opening the content stream : before falling back to the generic parser,
|
||||
* let's have a chance to parse the stream as uncompressed. */
|
||||
/* Indeed, this can be a case of misconfigured web server, providing both headers "Content-Encoding" with value "gzip",
|
||||
* and "Content-type" with value such as "application/gzip".
|
||||
* In that case our HTTP client (see GzipResponseInterceptor) is already uncompressing the stream on the fly,
|
||||
* that's why the gzipparser fails opening the stream.
|
||||
* (see RFC 7231 section 3.1.2.2 for "Content-Encoding" header specification https://tools.ietf.org/html/rfc7231#section-3.1.2.2)*/
|
||||
gzipParser gzParser = (gzipParser)parser;
|
||||
|
||||
nonCloseInputStream = new CloseShieldInputStream(bufferedStream);
|
||||
|
||||
Document maindoc = gzipParser.createMainDocument(location, mimeType, charset, gzParser);
|
||||
|
||||
try {
|
||||
Document[] docs = gzParser.parseCompressedInputStream(location,
|
||||
charset, timezoneOffset, depth,
|
||||
nonCloseInputStream, maxLinks, maxBytes);
|
||||
if (docs != null) {
|
||||
maindoc.addSubDocuments(docs);
|
||||
}
|
||||
return new Document[] { maindoc };
|
||||
} catch(Exception e1) {
|
||||
/* Try again to reset the marked stream if the failed parser has not consumed too many bytes */
|
||||
bufferedStream.reset();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (IOException e) {
|
||||
|
@ -345,6 +375,7 @@ public final class TextParser {
|
|||
* @param mimeType the mime type of the source, if known
|
||||
* @param charset the charset name of the source, if known
|
||||
* @param timezoneOffset the local time zone offset
|
||||
* @param depth the current depth of the crawl
|
||||
* @param contentLength the length of the source, if known (else -1 should be used)
|
||||
* @param source a input stream
|
||||
* @param maxLinks the maximum total number of links to parse and add to the result documents
|
||||
|
@ -353,9 +384,9 @@ public final class TextParser {
|
|||
* @throws Parser.Failure when the parser processing failed
|
||||
*/
|
||||
public static Document[] parseWithLimits(final DigestURL location, String mimeType, final String charset,
|
||||
final int timezoneOffset, final long contentLength, final InputStream sourceStream, int maxLinks,
|
||||
final int timezoneOffset, final int depth, final long contentLength, final InputStream sourceStream, int maxLinks,
|
||||
long maxBytes) throws Parser.Failure{
|
||||
return parseSource(location, mimeType, charset, new VocabularyScraper(), timezoneOffset, 0, contentLength,
|
||||
return parseSource(location, mimeType, charset, new VocabularyScraper(), timezoneOffset, depth, contentLength,
|
||||
sourceStream, maxLinks, maxBytes);
|
||||
}
|
||||
|
||||
|
@ -400,6 +431,8 @@ public final class TextParser {
|
|||
docs = parser.parse(location, mimeType, documentCharset, scraper, timezoneOffset, limitedSource);
|
||||
}
|
||||
return docs;
|
||||
} catch(Parser.Failure e) {
|
||||
throw e;
|
||||
} catch (final Exception e) {
|
||||
throw new Parser.Failure("parser failed: " + parser.getName(), location);
|
||||
}
|
||||
|
@ -460,8 +493,38 @@ public final class TextParser {
|
|||
docs = parser.parse(location, mimeType, documentCharset, scraper, timezoneOffset, bis);
|
||||
}
|
||||
} catch (final Parser.Failure e) {
|
||||
failedParser.put(parser, e);
|
||||
//log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e);
|
||||
if(parser instanceof gzipParser && e.getCause() instanceof GZIPOpeningStreamException &&
|
||||
(parsers.size() == 1 || (parsers.size() == 2 && parsers.contains(genericIdiom)))) {
|
||||
/* The gzip parser failed directly when opening the content stream : before falling back to the generic parser,
|
||||
* let's have a chance to parse the stream as uncompressed. */
|
||||
/* Indeed, this can be a case of misconfigured web server, providing both headers "Content-Encoding" with value "gzip",
|
||||
* and "Content-type" with value such as "application/gzip".
|
||||
* In that case our HTTP client (see GzipResponseInterceptor) is already uncompressing the stream on the fly,
|
||||
* that's why the gzipparser fails opening the stream.
|
||||
* (see RFC 7231 section 3.1.2.2 for "Content-Encoding" header specification https://tools.ietf.org/html/rfc7231#section-3.1.2.2)*/
|
||||
gzipParser gzParser = (gzipParser)parser;
|
||||
|
||||
bis = new ByteArrayInputStream(sourceArray);
|
||||
|
||||
Document maindoc = gzipParser.createMainDocument(location, mimeType, charset, gzParser);
|
||||
|
||||
try {
|
||||
docs = gzParser.parseCompressedInputStream(location,
|
||||
charset, timezoneOffset, depth,
|
||||
bis, maxLinks, maxBytes);
|
||||
if (docs != null) {
|
||||
maindoc.addSubDocuments(docs);
|
||||
}
|
||||
docs = new Document[] { maindoc };
|
||||
break;
|
||||
} catch(Parser.Failure e1) {
|
||||
failedParser.put(parser, e1);
|
||||
} catch(Exception e2) {
|
||||
failedParser.put(parser, new Parser.Failure(e2.getMessage(), location));
|
||||
}
|
||||
} else {
|
||||
failedParser.put(parser, e);
|
||||
}
|
||||
} catch (final Exception e) {
|
||||
failedParser.put(parser, new Parser.Failure(e.getMessage(), location));
|
||||
//log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e);
|
||||
|
@ -638,8 +701,21 @@ public final class TextParser {
|
|||
return ext2mime.get(ext.toLowerCase(Locale.ROOT));
|
||||
}
|
||||
|
||||
private static String normalizeMimeType(String mimeType) {
|
||||
if (mimeType == null) return "application/octet-stream";
|
||||
/**
|
||||
* Normalize a media type information string (can be a HTTP "Content-Type"
|
||||
* response header) : convert to lower case, remove any supplementary
|
||||
* parameters such as the encoding (charset name), and provide a default
|
||||
* value when null.
|
||||
*
|
||||
* @param mimeType
|
||||
* raw information about media type, eventually provided by a
|
||||
* HTTP "Content-Type" response header
|
||||
* @return a non null media type in lower case
|
||||
*/
|
||||
public static String normalizeMimeType(String mimeType) {
|
||||
if (mimeType == null) {
|
||||
return "application/octet-stream";
|
||||
}
|
||||
mimeType = mimeType.toLowerCase(Locale.ROOT);
|
||||
final int pos = mimeType.indexOf(';');
|
||||
return ((pos < 0) ? mimeType.trim() : mimeType.substring(0, pos).trim());
|
||||
|
|
|
@ -31,9 +31,12 @@ import java.io.File;
|
|||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.net.MalformedURLException;
|
||||
import java.util.Date;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
|
||||
import org.apache.commons.compress.compressors.gzip.GzipUtils;
|
||||
|
||||
import net.yacy.cora.document.id.DigestURL;
|
||||
import net.yacy.cora.document.id.MultiProtocolURL;
|
||||
import net.yacy.document.AbstractParser;
|
||||
|
@ -42,13 +45,14 @@ import net.yacy.document.Parser;
|
|||
import net.yacy.document.TextParser;
|
||||
import net.yacy.document.VocabularyScraper;
|
||||
import net.yacy.kelondro.util.FileUtils;
|
||||
import org.apache.commons.compress.compressors.gzip.GzipUtils;
|
||||
|
||||
/**
|
||||
* Parses a gz archive.
|
||||
* Unzips and parses the content and adds it to the created main document
|
||||
*/
|
||||
public class gzipParser extends AbstractParser implements Parser {
|
||||
|
||||
private static final int DEFAULT_DEPTH = 999;
|
||||
|
||||
public gzipParser() {
|
||||
super("GNU Zip Compressed Archive Parser");
|
||||
|
@ -75,12 +79,18 @@ public class gzipParser extends AbstractParser implements Parser {
|
|||
Document maindoc = null;
|
||||
GZIPInputStream zippedContent = null;
|
||||
FileOutputStream out = null;
|
||||
try {
|
||||
zippedContent = new GZIPInputStream(source);
|
||||
} catch(IOException e) {
|
||||
/* Use a GZIPOpeningStreamException to signal the caller the error occurred directly on stream opening
|
||||
* and eventually apply special error handling */
|
||||
throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(), location,
|
||||
new GZIPOpeningStreamException());
|
||||
}
|
||||
try {
|
||||
int read = 0;
|
||||
final byte[] data = new byte[1024];
|
||||
|
||||
zippedContent = new GZIPInputStream(source);
|
||||
|
||||
tempFile = File.createTempFile("gunzip","tmp");
|
||||
|
||||
// creating a temp file to store the uncompressed data
|
||||
|
@ -112,11 +122,11 @@ public class gzipParser extends AbstractParser implements Parser {
|
|||
}
|
||||
}
|
||||
try {
|
||||
maindoc = createMainDocument(location, mimeType, charset);
|
||||
maindoc = createMainDocument(location, mimeType, charset, this);
|
||||
// creating a new parser class to parse the unzipped content
|
||||
final String contentfilename = GzipUtils.getUncompressedFilename(location.getFileName());
|
||||
final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename));
|
||||
Document[] docs = TextParser.parseSource(location, mime, null, scraper, timezoneOffset, 999, tempFile);
|
||||
Document[] docs = TextParser.parseSource(location, mime, null, scraper, timezoneOffset, DEFAULT_DEPTH, tempFile);
|
||||
if (docs != null) maindoc.addSubDocuments(docs);
|
||||
} catch (final Exception e) {
|
||||
if (e instanceof InterruptedException) throw (InterruptedException) e;
|
||||
|
@ -134,15 +144,16 @@ public class gzipParser extends AbstractParser implements Parser {
|
|||
* @param location the parsed resource URL
|
||||
* @param mimeType the media type of the resource
|
||||
* @param charset the charset name if known
|
||||
* @param an instance of gzipParser that is registered as the parser origin of the document
|
||||
* @return a Document instance
|
||||
*/
|
||||
private Document createMainDocument(final DigestURL location, final String mimeType, final String charset) {
|
||||
public static Document createMainDocument(final DigestURL location, final String mimeType, final String charset, final gzipParser parser) {
|
||||
final String filename = location.getFileName();
|
||||
Document maindoc = new Document(
|
||||
location,
|
||||
mimeType,
|
||||
charset,
|
||||
this,
|
||||
parser,
|
||||
null,
|
||||
null,
|
||||
AbstractParser.singleList(filename.isEmpty() ? location.toTokens() : MultiProtocolURL.unescape(filename)), // title
|
||||
|
@ -159,6 +170,41 @@ public class gzipParser extends AbstractParser implements Parser {
|
|||
new Date());
|
||||
return maindoc;
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse content in an open stream uncompressing on the fly a gzipped resource.
|
||||
* @param location the URL of the gzipped resource
|
||||
* @param charset the charset name if known
|
||||
* @param timezoneOffset the local time zone offset
|
||||
* @param compressedInStream an open stream uncompressing on the fly the compressed content
|
||||
* @param maxLinks
|
||||
* the maximum total number of links to parse and add to the
|
||||
* result documents
|
||||
* @param maxBytes
|
||||
* the maximum number of content bytes to process
|
||||
* @return a list of documents that result from parsing the source, with
|
||||
* empty or null text.
|
||||
* @throws Parser.Failure
|
||||
* when the parser processing failed
|
||||
*/
|
||||
public Document[] parseCompressedInputStream(final DigestURL location, final String charset, final int timezoneOffset, final int depth,
|
||||
final InputStream compressedInStream, final int maxLinks, final long maxBytes) throws Failure {
|
||||
// creating a new parser class to parse the unzipped content
|
||||
final String compressedFileName = location.getFileName();
|
||||
final String contentfilename = GzipUtils.getUncompressedFilename(compressedFileName);
|
||||
final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename));
|
||||
try {
|
||||
/* Use the uncompressed file name for sub parsers to not unnecessarily use again the gzipparser */
|
||||
final String locationPath = location.getPath();
|
||||
final String contentPath = locationPath.substring(0, locationPath.length() - compressedFileName.length()) + contentfilename;
|
||||
final DigestURL contentLocation = new DigestURL(location.getProtocol(), location.getHost(), location.getPort(), contentPath);
|
||||
|
||||
/* Rely on the supporting parsers to respect the maxLinks and maxBytes limits on compressed content */
|
||||
return TextParser.parseWithLimits(contentLocation, mime, charset, timezoneOffset, depth, -1, compressedInStream, maxLinks, maxBytes);
|
||||
} catch (MalformedURLException e) {
|
||||
throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(), location);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isParseWithLimitsSupported() {
|
||||
|
@ -177,21 +223,38 @@ public class gzipParser extends AbstractParser implements Parser {
|
|||
* before an eventual OutOfMemory occurs */
|
||||
zippedContent = new GZIPInputStream(source);
|
||||
} catch(IOException e) {
|
||||
throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(), location);
|
||||
/* Use a GZIPOpeningStreamException to signal the caller the error occurred directly on stream opening
|
||||
* and eventually apply special error handling */
|
||||
throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(), location,
|
||||
new GZIPOpeningStreamException());
|
||||
}
|
||||
try {
|
||||
maindoc = createMainDocument(location, mimeType, charset);
|
||||
// creating a new parser class to parse the unzipped content
|
||||
final String contentfilename = GzipUtils.getUncompressedFilename(location.getFileName());
|
||||
final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename));
|
||||
maindoc = createMainDocument(location, mimeType, charset, this);
|
||||
|
||||
/* Rely on the supporting parsers to respect the maxLinks and maxBytes limits on compressed content */
|
||||
Document[] docs = TextParser.parseWithLimits(location, mime, charset, timezoneOffset, -1, zippedContent, maxLinks, maxBytes);
|
||||
if (docs != null) maindoc.addSubDocuments(docs);
|
||||
Document[] docs = parseCompressedInputStream(location, charset, timezoneOffset, DEFAULT_DEPTH, zippedContent, maxLinks, maxBytes);
|
||||
if (docs != null) {
|
||||
maindoc.addSubDocuments(docs);
|
||||
}
|
||||
} catch (final Exception e) {
|
||||
throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(),location);
|
||||
}
|
||||
return maindoc == null ? null : new Document[]{maindoc};
|
||||
}
|
||||
|
||||
/**
|
||||
* Used to signal an error occurred when opening a gzipped input stream.
|
||||
*/
|
||||
public class GZIPOpeningStreamException extends Exception {
|
||||
|
||||
/** The serialization ID */
|
||||
private static final long serialVersionUID = 2824038185373304636L;
|
||||
|
||||
public GZIPOpeningStreamException() {
|
||||
super();
|
||||
}
|
||||
|
||||
public GZIPOpeningStreamException(final String message) {
|
||||
super(message);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue
Block a user