Support parsing gzip files from servers with redundant headers.

Some web servers provide both 'Content-Encoding : "gzip"' and
'Content-Type : "application/x-gzip"' HTTP headers on their ".gz" files.
It was annoying to fail on such resources, which are not so uncommon
even though they are non-conforming (see RFC 7231 section 3.1.2.2 for
the "Content-Encoding" header specification:
https://tools.ietf.org/html/rfc7231#section-3.1.2.2)
This commit is contained in:
luccioman 2017-07-16 14:46:46 +02:00
parent 11a7f923d4
commit 5a646540cc
3 changed files with 166 additions and 50 deletions

View File

@ -30,7 +30,6 @@ import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
import net.yacy.document.TextParser; import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
/** /**
* A crawler load response, holding content as a stream. * A crawler load response, holding content as a stream.
@ -90,31 +89,7 @@ public class StreamResponse {
* when no parser support the content * when no parser support the content
*/ */
public Document[] parse() throws Parser.Failure { public Document[] parse() throws Parser.Failure {
final String supportError = TextParser.supports(this.response.url(), return parseWithLimits(Integer.MAX_VALUE, Long.MAX_VALUE);
this.response.getResponseHeader() == null ? null : this.response.getResponseHeader().getContentType());
if (supportError != null) {
throw new Parser.Failure("no parser support:" + supportError, this.response.url());
}
try {
return TextParser.parseSource(this.response.url(),
this.response.getResponseHeader() == null ? null
: this.response.getResponseHeader().getContentType(),
this.response.getResponseHeader() == null ? StandardCharsets.UTF_8.name()
: this.response.getResponseHeader().getCharacterEncoding(),
new VocabularyScraper(), this.response.getRequest().timezoneOffset(),
this.response.getRequest().depth(), this.response.size(), this.contentStream);
} catch (final Exception e) {
return null;
} finally {
if (this.contentStream != null) {
try {
this.contentStream.close();
} catch (IOException ignored) {
log.warn("Could not close content stream on url " + this.response.url());
}
}
}
} }
/** /**
@ -151,8 +126,10 @@ public class StreamResponse {
: this.response.getResponseHeader().getCharacterEncoding(); : this.response.getResponseHeader().getCharacterEncoding();
return TextParser.parseWithLimits(this.response.url(), mimeType, charsetName, return TextParser.parseWithLimits(this.response.url(), mimeType, charsetName,
this.response.getRequest().timezoneOffset(), this.response.size(), this.contentStream, maxLinks, this.response.getRequest().timezoneOffset(), this.response.getRequest().depth(),
maxBytes); this.response.size(), this.contentStream, maxLinks, maxBytes);
} catch(Parser.Failure e) {
throw e;
}catch (final Exception e) { }catch (final Exception e) {
return null; return null;
} finally { } finally {

View File

@ -49,6 +49,7 @@ import net.yacy.document.parser.csvParser;
import net.yacy.document.parser.docParser; import net.yacy.document.parser.docParser;
import net.yacy.document.parser.genericParser; import net.yacy.document.parser.genericParser;
import net.yacy.document.parser.gzipParser; import net.yacy.document.parser.gzipParser;
import net.yacy.document.parser.gzipParser.GZIPOpeningStreamException;
import net.yacy.document.parser.htmlParser; import net.yacy.document.parser.htmlParser;
import net.yacy.document.parser.linkScraperParser; import net.yacy.document.parser.linkScraperParser;
import net.yacy.document.parser.mmParser; import net.yacy.document.parser.mmParser;
@ -296,6 +297,35 @@ public final class TextParser {
/* Try to reset the marked stream. If the failed parser has consumed too many bytes : /* Try to reset the marked stream. If the failed parser has consumed too many bytes :
* too bad, the marks is invalid and process fails now with an IOException */ * too bad, the marks is invalid and process fails now with an IOException */
bufferedStream.reset(); bufferedStream.reset();
if(parser instanceof gzipParser && e.getCause() instanceof GZIPOpeningStreamException
&& (idioms.size() == 1 || (idioms.size() == 2 && idioms.contains(genericIdiom)))) {
/* The gzip parser failed directly when opening the content stream : before falling back to the generic parser,
* let's have a chance to parse the stream as uncompressed. */
/* Indeed, this can be a case of misconfigured web server, providing both headers "Content-Encoding" with value "gzip",
* and "Content-type" with value such as "application/gzip".
* In that case our HTTP client (see GzipResponseInterceptor) is already uncompressing the stream on the fly,
* that's why the gzipparser fails opening the stream.
* (see RFC 7231 section 3.1.2.2 for "Content-Encoding" header specification https://tools.ietf.org/html/rfc7231#section-3.1.2.2)*/
gzipParser gzParser = (gzipParser)parser;
nonCloseInputStream = new CloseShieldInputStream(bufferedStream);
Document maindoc = gzipParser.createMainDocument(location, mimeType, charset, gzParser);
try {
Document[] docs = gzParser.parseCompressedInputStream(location,
charset, timezoneOffset, depth,
nonCloseInputStream, maxLinks, maxBytes);
if (docs != null) {
maindoc.addSubDocuments(docs);
}
return new Document[] { maindoc };
} catch(Exception e1) {
/* Try again to reset the marked stream if the failed parser has not consumed too many bytes */
bufferedStream.reset();
}
}
} }
} }
} catch (IOException e) { } catch (IOException e) {
@ -345,6 +375,7 @@ public final class TextParser {
* @param mimeType the mime type of the source, if known * @param mimeType the mime type of the source, if known
* @param charset the charset name of the source, if known * @param charset the charset name of the source, if known
* @param timezoneOffset the local time zone offset * @param timezoneOffset the local time zone offset
* @param depth the current depth of the crawl
* @param contentLength the length of the source, if known (else -1 should be used) * @param contentLength the length of the source, if known (else -1 should be used)
* @param source a input stream * @param source a input stream
* @param maxLinks the maximum total number of links to parse and add to the result documents * @param maxLinks the maximum total number of links to parse and add to the result documents
@ -353,9 +384,9 @@ public final class TextParser {
* @throws Parser.Failure when the parser processing failed * @throws Parser.Failure when the parser processing failed
*/ */
public static Document[] parseWithLimits(final DigestURL location, String mimeType, final String charset, public static Document[] parseWithLimits(final DigestURL location, String mimeType, final String charset,
final int timezoneOffset, final long contentLength, final InputStream sourceStream, int maxLinks, final int timezoneOffset, final int depth, final long contentLength, final InputStream sourceStream, int maxLinks,
long maxBytes) throws Parser.Failure{ long maxBytes) throws Parser.Failure{
return parseSource(location, mimeType, charset, new VocabularyScraper(), timezoneOffset, 0, contentLength, return parseSource(location, mimeType, charset, new VocabularyScraper(), timezoneOffset, depth, contentLength,
sourceStream, maxLinks, maxBytes); sourceStream, maxLinks, maxBytes);
} }
@ -400,6 +431,8 @@ public final class TextParser {
docs = parser.parse(location, mimeType, documentCharset, scraper, timezoneOffset, limitedSource); docs = parser.parse(location, mimeType, documentCharset, scraper, timezoneOffset, limitedSource);
} }
return docs; return docs;
} catch(Parser.Failure e) {
throw e;
} catch (final Exception e) { } catch (final Exception e) {
throw new Parser.Failure("parser failed: " + parser.getName(), location); throw new Parser.Failure("parser failed: " + parser.getName(), location);
} }
@ -460,8 +493,38 @@ public final class TextParser {
docs = parser.parse(location, mimeType, documentCharset, scraper, timezoneOffset, bis); docs = parser.parse(location, mimeType, documentCharset, scraper, timezoneOffset, bis);
} }
} catch (final Parser.Failure e) { } catch (final Parser.Failure e) {
if(parser instanceof gzipParser && e.getCause() instanceof GZIPOpeningStreamException &&
(parsers.size() == 1 || (parsers.size() == 2 && parsers.contains(genericIdiom)))) {
/* The gzip parser failed directly when opening the content stream : before falling back to the generic parser,
* let's have a chance to parse the stream as uncompressed. */
/* Indeed, this can be a case of misconfigured web server, providing both headers "Content-Encoding" with value "gzip",
* and "Content-type" with value such as "application/gzip".
* In that case our HTTP client (see GzipResponseInterceptor) is already uncompressing the stream on the fly,
* that's why the gzipparser fails opening the stream.
* (see RFC 7231 section 3.1.2.2 for "Content-Encoding" header specification https://tools.ietf.org/html/rfc7231#section-3.1.2.2)*/
gzipParser gzParser = (gzipParser)parser;
bis = new ByteArrayInputStream(sourceArray);
Document maindoc = gzipParser.createMainDocument(location, mimeType, charset, gzParser);
try {
docs = gzParser.parseCompressedInputStream(location,
charset, timezoneOffset, depth,
bis, maxLinks, maxBytes);
if (docs != null) {
maindoc.addSubDocuments(docs);
}
docs = new Document[] { maindoc };
break;
} catch(Parser.Failure e1) {
failedParser.put(parser, e1);
} catch(Exception e2) {
failedParser.put(parser, new Parser.Failure(e2.getMessage(), location));
}
} else {
failedParser.put(parser, e); failedParser.put(parser, e);
//log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e); }
} catch (final Exception e) { } catch (final Exception e) {
failedParser.put(parser, new Parser.Failure(e.getMessage(), location)); failedParser.put(parser, new Parser.Failure(e.getMessage(), location));
//log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e); //log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e);
@ -638,8 +701,21 @@ public final class TextParser {
return ext2mime.get(ext.toLowerCase(Locale.ROOT)); return ext2mime.get(ext.toLowerCase(Locale.ROOT));
} }
private static String normalizeMimeType(String mimeType) { /**
if (mimeType == null) return "application/octet-stream"; * Normalize a media type information string (can be a HTTP "Content-Type"
* response header) : convert to lower case, remove any supplementary
* parameters such as the encoding (charset name), and provide a default
* value when null.
*
* @param mimeType
* raw information about media type, eventually provided by a
* HTTP "Content-Type" response header
* @return a non null media type in lower case
*/
public static String normalizeMimeType(String mimeType) {
if (mimeType == null) {
return "application/octet-stream";
}
mimeType = mimeType.toLowerCase(Locale.ROOT); mimeType = mimeType.toLowerCase(Locale.ROOT);
final int pos = mimeType.indexOf(';'); final int pos = mimeType.indexOf(';');
return ((pos < 0) ? mimeType.trim() : mimeType.substring(0, pos).trim()); return ((pos < 0) ? mimeType.trim() : mimeType.substring(0, pos).trim());

View File

@ -31,9 +31,12 @@ import java.io.File;
import java.io.FileOutputStream; import java.io.FileOutputStream;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.net.MalformedURLException;
import java.util.Date; import java.util.Date;
import java.util.zip.GZIPInputStream; import java.util.zip.GZIPInputStream;
import org.apache.commons.compress.compressors.gzip.GzipUtils;
import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.document.AbstractParser; import net.yacy.document.AbstractParser;
@ -42,7 +45,6 @@ import net.yacy.document.Parser;
import net.yacy.document.TextParser; import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper; import net.yacy.document.VocabularyScraper;
import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.FileUtils;
import org.apache.commons.compress.compressors.gzip.GzipUtils;
/** /**
* Parses a gz archive. * Parses a gz archive.
@ -50,6 +52,8 @@ import org.apache.commons.compress.compressors.gzip.GzipUtils;
*/ */
public class gzipParser extends AbstractParser implements Parser { public class gzipParser extends AbstractParser implements Parser {
private static final int DEFAULT_DEPTH = 999;
public gzipParser() { public gzipParser() {
super("GNU Zip Compressed Archive Parser"); super("GNU Zip Compressed Archive Parser");
this.SUPPORTED_EXTENSIONS.add("gz"); this.SUPPORTED_EXTENSIONS.add("gz");
@ -75,12 +79,18 @@ public class gzipParser extends AbstractParser implements Parser {
Document maindoc = null; Document maindoc = null;
GZIPInputStream zippedContent = null; GZIPInputStream zippedContent = null;
FileOutputStream out = null; FileOutputStream out = null;
try {
zippedContent = new GZIPInputStream(source);
} catch(IOException e) {
/* Use a GZIPOpeningStreamException to signal the caller the error occurred directly on stream opening
* and eventually apply special error handling */
throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(), location,
new GZIPOpeningStreamException());
}
try { try {
int read = 0; int read = 0;
final byte[] data = new byte[1024]; final byte[] data = new byte[1024];
zippedContent = new GZIPInputStream(source);
tempFile = File.createTempFile("gunzip","tmp"); tempFile = File.createTempFile("gunzip","tmp");
// creating a temp file to store the uncompressed data // creating a temp file to store the uncompressed data
@ -112,11 +122,11 @@ public class gzipParser extends AbstractParser implements Parser {
} }
} }
try { try {
maindoc = createMainDocument(location, mimeType, charset); maindoc = createMainDocument(location, mimeType, charset, this);
// creating a new parser class to parse the unzipped content // creating a new parser class to parse the unzipped content
final String contentfilename = GzipUtils.getUncompressedFilename(location.getFileName()); final String contentfilename = GzipUtils.getUncompressedFilename(location.getFileName());
final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename)); final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename));
Document[] docs = TextParser.parseSource(location, mime, null, scraper, timezoneOffset, 999, tempFile); Document[] docs = TextParser.parseSource(location, mime, null, scraper, timezoneOffset, DEFAULT_DEPTH, tempFile);
if (docs != null) maindoc.addSubDocuments(docs); if (docs != null) maindoc.addSubDocuments(docs);
} catch (final Exception e) { } catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof InterruptedException) throw (InterruptedException) e;
@ -134,15 +144,16 @@ public class gzipParser extends AbstractParser implements Parser {
* @param location the parsed resource URL * @param location the parsed resource URL
* @param mimeType the media type of the resource * @param mimeType the media type of the resource
* @param charset the charset name if known * @param charset the charset name if known
* @param an instance of gzipParser that is registered as the parser origin of the document
* @return a Document instance * @return a Document instance
*/ */
private Document createMainDocument(final DigestURL location, final String mimeType, final String charset) { public static Document createMainDocument(final DigestURL location, final String mimeType, final String charset, final gzipParser parser) {
final String filename = location.getFileName(); final String filename = location.getFileName();
Document maindoc = new Document( Document maindoc = new Document(
location, location,
mimeType, mimeType,
charset, charset,
this, parser,
null, null,
null, null,
AbstractParser.singleList(filename.isEmpty() ? location.toTokens() : MultiProtocolURL.unescape(filename)), // title AbstractParser.singleList(filename.isEmpty() ? location.toTokens() : MultiProtocolURL.unescape(filename)), // title
@ -160,6 +171,41 @@ public class gzipParser extends AbstractParser implements Parser {
return maindoc; return maindoc;
} }
/**
 * Parse content in an open stream uncompressing on the fly a gzipped resource.
 * @param location the URL of the gzipped resource
 * @param charset the charset name if known
 * @param timezoneOffset the local time zone offset
 * @param depth the current crawl depth, propagated to the sub parsers
 * @param compressedInStream an open stream uncompressing on the fly the compressed content
 * @param maxLinks
 *            the maximum total number of links to parse and add to the
 *            result documents
 * @param maxBytes
 *            the maximum number of content bytes to process
 * @return a list of documents that result from parsing the source, with
 *         empty or null text.
 * @throws Parser.Failure
 *             when the parser processing failed
 */
public Document[] parseCompressedInputStream(final DigestURL location, final String charset, final int timezoneOffset, final int depth,
		final InputStream compressedInStream, final int maxLinks, final long maxBytes) throws Failure {
	// creating a new parser class to parse the unzipped content
	final String compressedFileName = location.getFileName();
	final String contentfilename = GzipUtils.getUncompressedFilename(compressedFileName);
	final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename));
	try {
		/* Use the uncompressed file name for sub parsers to not unnecessarily use again the gzipparser */
		final String locationPath = location.getPath();
		// rewrite the URL path so it ends with the uncompressed file name (e.g. "file.txt" instead of "file.txt.gz")
		final String contentPath = locationPath.substring(0, locationPath.length() - compressedFileName.length()) + contentfilename;
		final DigestURL contentLocation = new DigestURL(location.getProtocol(), location.getHost(), location.getPort(), contentPath);
		/* Rely on the supporting parsers to respect the maxLinks and maxBytes limits on compressed content */
		return TextParser.parseWithLimits(contentLocation, mime, charset, timezoneOffset, depth, -1, compressedInStream, maxLinks, maxBytes);
	} catch (MalformedURLException e) {
		throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(), location);
	}
}
@Override @Override
public boolean isParseWithLimitsSupported() { public boolean isParseWithLimitsSupported() {
return true; return true;
@ -177,21 +223,38 @@ public class gzipParser extends AbstractParser implements Parser {
* before an eventual OutOfMemory occurs */ * before an eventual OutOfMemory occurs */
zippedContent = new GZIPInputStream(source); zippedContent = new GZIPInputStream(source);
} catch(IOException e) { } catch(IOException e) {
throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(), location); /* Use a GZIPOpeningStreamException to signal the caller the error occurred directly on stream opening
* and eventually apply special error handling */
throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(), location,
new GZIPOpeningStreamException());
} }
try { try {
maindoc = createMainDocument(location, mimeType, charset); maindoc = createMainDocument(location, mimeType, charset, this);
// creating a new parser class to parse the unzipped content
final String contentfilename = GzipUtils.getUncompressedFilename(location.getFileName());
final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename));
/* Rely on the supporting parsers to respect the maxLinks and maxBytes limits on compressed content */ Document[] docs = parseCompressedInputStream(location, charset, timezoneOffset, DEFAULT_DEPTH, zippedContent, maxLinks, maxBytes);
Document[] docs = TextParser.parseWithLimits(location, mime, charset, timezoneOffset, -1, zippedContent, maxLinks, maxBytes); if (docs != null) {
if (docs != null) maindoc.addSubDocuments(docs); maindoc.addSubDocuments(docs);
}
} catch (final Exception e) { } catch (final Exception e) {
throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(),location); throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(),location);
} }
return maindoc == null ? null : new Document[]{maindoc}; return maindoc == null ? null : new Document[]{maindoc};
} }
/**
 * Used to signal an error occurred when opening a gzipped input stream.
 * Attached as the cause of a {@link Parser.Failure} so callers can detect
 * that the failure happened directly at stream opening (for example when
 * the HTTP client already uncompressed the content on the fly) and apply
 * a specific fallback handling.
 */
public class GZIPOpeningStreamException extends Exception {

	/** The serialization ID */
	private static final long serialVersionUID = 2824038185373304636L;

	/** Create an exception with no detail message and no cause. */
	public GZIPOpeningStreamException() {
		super();
	}

	/**
	 * @param message the detail message
	 */
	public GZIPOpeningStreamException(final String message) {
		super(message);
	}

	/**
	 * Preserve the underlying cause (typically the IOException raised by
	 * GZIPInputStream) so diagnostic information is not lost at throw sites.
	 * @param message the detail message
	 * @param cause the original exception that triggered this one
	 */
	public GZIPOpeningStreamException(final String message, final Throwable cause) {
		super(message, cause);
	}

	/**
	 * @param cause the original exception that triggered this one
	 */
	public GZIPOpeningStreamException(final Throwable cause) {
		super(cause);
	}
}
} }