mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
Support parsing gzip files from servers with redundant headers.
Some web servers provide both 'Content-Encoding: "gzip"' and 'Content-Type: "application/x-gzip"' HTTP headers on their ".gz" files. Although this is non-conforming (see RFC 7231 section 3.1.2.2 for the "Content-Encoding" header specification: https://tools.ietf.org/html/rfc7231#section-3.1.2.2), such resources are not uncommon, and it was annoying for parsing to fail on them.
This commit is contained in:
parent
11a7f923d4
commit
5a646540cc
|
@ -30,7 +30,6 @@ import net.yacy.cora.util.ConcurrentLog;
|
||||||
import net.yacy.document.Document;
|
import net.yacy.document.Document;
|
||||||
import net.yacy.document.Parser;
|
import net.yacy.document.Parser;
|
||||||
import net.yacy.document.TextParser;
|
import net.yacy.document.TextParser;
|
||||||
import net.yacy.document.VocabularyScraper;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A crawler load response, holding content as a stream.
|
* A crawler load response, holding content as a stream.
|
||||||
|
@ -90,31 +89,7 @@ public class StreamResponse {
|
||||||
* when no parser support the content
|
* when no parser support the content
|
||||||
*/
|
*/
|
||||||
public Document[] parse() throws Parser.Failure {
|
public Document[] parse() throws Parser.Failure {
|
||||||
final String supportError = TextParser.supports(this.response.url(),
|
return parseWithLimits(Integer.MAX_VALUE, Long.MAX_VALUE);
|
||||||
this.response.getResponseHeader() == null ? null : this.response.getResponseHeader().getContentType());
|
|
||||||
if (supportError != null) {
|
|
||||||
throw new Parser.Failure("no parser support:" + supportError, this.response.url());
|
|
||||||
}
|
|
||||||
try {
|
|
||||||
return TextParser.parseSource(this.response.url(),
|
|
||||||
this.response.getResponseHeader() == null ? null
|
|
||||||
: this.response.getResponseHeader().getContentType(),
|
|
||||||
this.response.getResponseHeader() == null ? StandardCharsets.UTF_8.name()
|
|
||||||
: this.response.getResponseHeader().getCharacterEncoding(),
|
|
||||||
new VocabularyScraper(), this.response.getRequest().timezoneOffset(),
|
|
||||||
this.response.getRequest().depth(), this.response.size(), this.contentStream);
|
|
||||||
} catch (final Exception e) {
|
|
||||||
return null;
|
|
||||||
} finally {
|
|
||||||
if (this.contentStream != null) {
|
|
||||||
try {
|
|
||||||
this.contentStream.close();
|
|
||||||
} catch (IOException ignored) {
|
|
||||||
log.warn("Could not close content stream on url " + this.response.url());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -151,8 +126,10 @@ public class StreamResponse {
|
||||||
: this.response.getResponseHeader().getCharacterEncoding();
|
: this.response.getResponseHeader().getCharacterEncoding();
|
||||||
|
|
||||||
return TextParser.parseWithLimits(this.response.url(), mimeType, charsetName,
|
return TextParser.parseWithLimits(this.response.url(), mimeType, charsetName,
|
||||||
this.response.getRequest().timezoneOffset(), this.response.size(), this.contentStream, maxLinks,
|
this.response.getRequest().timezoneOffset(), this.response.getRequest().depth(),
|
||||||
maxBytes);
|
this.response.size(), this.contentStream, maxLinks, maxBytes);
|
||||||
|
} catch(Parser.Failure e) {
|
||||||
|
throw e;
|
||||||
}catch (final Exception e) {
|
}catch (final Exception e) {
|
||||||
return null;
|
return null;
|
||||||
} finally {
|
} finally {
|
||||||
|
|
|
@ -49,6 +49,7 @@ import net.yacy.document.parser.csvParser;
|
||||||
import net.yacy.document.parser.docParser;
|
import net.yacy.document.parser.docParser;
|
||||||
import net.yacy.document.parser.genericParser;
|
import net.yacy.document.parser.genericParser;
|
||||||
import net.yacy.document.parser.gzipParser;
|
import net.yacy.document.parser.gzipParser;
|
||||||
|
import net.yacy.document.parser.gzipParser.GZIPOpeningStreamException;
|
||||||
import net.yacy.document.parser.htmlParser;
|
import net.yacy.document.parser.htmlParser;
|
||||||
import net.yacy.document.parser.linkScraperParser;
|
import net.yacy.document.parser.linkScraperParser;
|
||||||
import net.yacy.document.parser.mmParser;
|
import net.yacy.document.parser.mmParser;
|
||||||
|
@ -296,6 +297,35 @@ public final class TextParser {
|
||||||
/* Try to reset the marked stream. If the failed parser has consumed too many bytes :
|
/* Try to reset the marked stream. If the failed parser has consumed too many bytes :
|
||||||
* too bad, the marks is invalid and process fails now with an IOException */
|
* too bad, the marks is invalid and process fails now with an IOException */
|
||||||
bufferedStream.reset();
|
bufferedStream.reset();
|
||||||
|
|
||||||
|
if(parser instanceof gzipParser && e.getCause() instanceof GZIPOpeningStreamException
|
||||||
|
&& (idioms.size() == 1 || (idioms.size() == 2 && idioms.contains(genericIdiom)))) {
|
||||||
|
/* The gzip parser failed directly when opening the content stream : before falling back to the generic parser,
|
||||||
|
* let's have a chance to parse the stream as uncompressed. */
|
||||||
|
/* Indeed, this can be a case of misconfigured web server, providing both headers "Content-Encoding" with value "gzip",
|
||||||
|
* and "Content-type" with value such as "application/gzip".
|
||||||
|
* In that case our HTTP client (see GzipResponseInterceptor) is already uncompressing the stream on the fly,
|
||||||
|
* that's why the gzipparser fails opening the stream.
|
||||||
|
* (see RFC 7231 section 3.1.2.2 for "Content-Encoding" header specification https://tools.ietf.org/html/rfc7231#section-3.1.2.2)*/
|
||||||
|
gzipParser gzParser = (gzipParser)parser;
|
||||||
|
|
||||||
|
nonCloseInputStream = new CloseShieldInputStream(bufferedStream);
|
||||||
|
|
||||||
|
Document maindoc = gzipParser.createMainDocument(location, mimeType, charset, gzParser);
|
||||||
|
|
||||||
|
try {
|
||||||
|
Document[] docs = gzParser.parseCompressedInputStream(location,
|
||||||
|
charset, timezoneOffset, depth,
|
||||||
|
nonCloseInputStream, maxLinks, maxBytes);
|
||||||
|
if (docs != null) {
|
||||||
|
maindoc.addSubDocuments(docs);
|
||||||
|
}
|
||||||
|
return new Document[] { maindoc };
|
||||||
|
} catch(Exception e1) {
|
||||||
|
/* Try again to reset the marked stream if the failed parser has not consumed too many bytes */
|
||||||
|
bufferedStream.reset();
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
|
@ -345,6 +375,7 @@ public final class TextParser {
|
||||||
* @param mimeType the mime type of the source, if known
|
* @param mimeType the mime type of the source, if known
|
||||||
* @param charset the charset name of the source, if known
|
* @param charset the charset name of the source, if known
|
||||||
* @param timezoneOffset the local time zone offset
|
* @param timezoneOffset the local time zone offset
|
||||||
|
* @param depth the current depth of the crawl
|
||||||
* @param contentLength the length of the source, if known (else -1 should be used)
|
* @param contentLength the length of the source, if known (else -1 should be used)
|
||||||
* @param source a input stream
|
* @param source a input stream
|
||||||
* @param maxLinks the maximum total number of links to parse and add to the result documents
|
* @param maxLinks the maximum total number of links to parse and add to the result documents
|
||||||
|
@ -353,9 +384,9 @@ public final class TextParser {
|
||||||
* @throws Parser.Failure when the parser processing failed
|
* @throws Parser.Failure when the parser processing failed
|
||||||
*/
|
*/
|
||||||
public static Document[] parseWithLimits(final DigestURL location, String mimeType, final String charset,
|
public static Document[] parseWithLimits(final DigestURL location, String mimeType, final String charset,
|
||||||
final int timezoneOffset, final long contentLength, final InputStream sourceStream, int maxLinks,
|
final int timezoneOffset, final int depth, final long contentLength, final InputStream sourceStream, int maxLinks,
|
||||||
long maxBytes) throws Parser.Failure{
|
long maxBytes) throws Parser.Failure{
|
||||||
return parseSource(location, mimeType, charset, new VocabularyScraper(), timezoneOffset, 0, contentLength,
|
return parseSource(location, mimeType, charset, new VocabularyScraper(), timezoneOffset, depth, contentLength,
|
||||||
sourceStream, maxLinks, maxBytes);
|
sourceStream, maxLinks, maxBytes);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -400,6 +431,8 @@ public final class TextParser {
|
||||||
docs = parser.parse(location, mimeType, documentCharset, scraper, timezoneOffset, limitedSource);
|
docs = parser.parse(location, mimeType, documentCharset, scraper, timezoneOffset, limitedSource);
|
||||||
}
|
}
|
||||||
return docs;
|
return docs;
|
||||||
|
} catch(Parser.Failure e) {
|
||||||
|
throw e;
|
||||||
} catch (final Exception e) {
|
} catch (final Exception e) {
|
||||||
throw new Parser.Failure("parser failed: " + parser.getName(), location);
|
throw new Parser.Failure("parser failed: " + parser.getName(), location);
|
||||||
}
|
}
|
||||||
|
@ -460,8 +493,38 @@ public final class TextParser {
|
||||||
docs = parser.parse(location, mimeType, documentCharset, scraper, timezoneOffset, bis);
|
docs = parser.parse(location, mimeType, documentCharset, scraper, timezoneOffset, bis);
|
||||||
}
|
}
|
||||||
} catch (final Parser.Failure e) {
|
} catch (final Parser.Failure e) {
|
||||||
|
if(parser instanceof gzipParser && e.getCause() instanceof GZIPOpeningStreamException &&
|
||||||
|
(parsers.size() == 1 || (parsers.size() == 2 && parsers.contains(genericIdiom)))) {
|
||||||
|
/* The gzip parser failed directly when opening the content stream : before falling back to the generic parser,
|
||||||
|
* let's have a chance to parse the stream as uncompressed. */
|
||||||
|
/* Indeed, this can be a case of misconfigured web server, providing both headers "Content-Encoding" with value "gzip",
|
||||||
|
* and "Content-type" with value such as "application/gzip".
|
||||||
|
* In that case our HTTP client (see GzipResponseInterceptor) is already uncompressing the stream on the fly,
|
||||||
|
* that's why the gzipparser fails opening the stream.
|
||||||
|
* (see RFC 7231 section 3.1.2.2 for "Content-Encoding" header specification https://tools.ietf.org/html/rfc7231#section-3.1.2.2)*/
|
||||||
|
gzipParser gzParser = (gzipParser)parser;
|
||||||
|
|
||||||
|
bis = new ByteArrayInputStream(sourceArray);
|
||||||
|
|
||||||
|
Document maindoc = gzipParser.createMainDocument(location, mimeType, charset, gzParser);
|
||||||
|
|
||||||
|
try {
|
||||||
|
docs = gzParser.parseCompressedInputStream(location,
|
||||||
|
charset, timezoneOffset, depth,
|
||||||
|
bis, maxLinks, maxBytes);
|
||||||
|
if (docs != null) {
|
||||||
|
maindoc.addSubDocuments(docs);
|
||||||
|
}
|
||||||
|
docs = new Document[] { maindoc };
|
||||||
|
break;
|
||||||
|
} catch(Parser.Failure e1) {
|
||||||
|
failedParser.put(parser, e1);
|
||||||
|
} catch(Exception e2) {
|
||||||
|
failedParser.put(parser, new Parser.Failure(e2.getMessage(), location));
|
||||||
|
}
|
||||||
|
} else {
|
||||||
failedParser.put(parser, e);
|
failedParser.put(parser, e);
|
||||||
//log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e);
|
}
|
||||||
} catch (final Exception e) {
|
} catch (final Exception e) {
|
||||||
failedParser.put(parser, new Parser.Failure(e.getMessage(), location));
|
failedParser.put(parser, new Parser.Failure(e.getMessage(), location));
|
||||||
//log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e);
|
//log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e);
|
||||||
|
@ -638,8 +701,21 @@ public final class TextParser {
|
||||||
return ext2mime.get(ext.toLowerCase(Locale.ROOT));
|
return ext2mime.get(ext.toLowerCase(Locale.ROOT));
|
||||||
}
|
}
|
||||||
|
|
||||||
private static String normalizeMimeType(String mimeType) {
|
/**
|
||||||
if (mimeType == null) return "application/octet-stream";
|
* Normalize a media type information string (can be a HTTP "Content-Type"
|
||||||
|
* response header) : convert to lower case, remove any supplementary
|
||||||
|
* parameters such as the encoding (charset name), and provide a default
|
||||||
|
* value when null.
|
||||||
|
*
|
||||||
|
* @param mimeType
|
||||||
|
* raw information about media type, eventually provided by a
|
||||||
|
* HTTP "Content-Type" response header
|
||||||
|
* @return a non null media type in lower case
|
||||||
|
*/
|
||||||
|
public static String normalizeMimeType(String mimeType) {
|
||||||
|
if (mimeType == null) {
|
||||||
|
return "application/octet-stream";
|
||||||
|
}
|
||||||
mimeType = mimeType.toLowerCase(Locale.ROOT);
|
mimeType = mimeType.toLowerCase(Locale.ROOT);
|
||||||
final int pos = mimeType.indexOf(';');
|
final int pos = mimeType.indexOf(';');
|
||||||
return ((pos < 0) ? mimeType.trim() : mimeType.substring(0, pos).trim());
|
return ((pos < 0) ? mimeType.trim() : mimeType.substring(0, pos).trim());
|
||||||
|
|
|
@ -31,9 +31,12 @@ import java.io.File;
|
||||||
import java.io.FileOutputStream;
|
import java.io.FileOutputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
|
import java.net.MalformedURLException;
|
||||||
import java.util.Date;
|
import java.util.Date;
|
||||||
import java.util.zip.GZIPInputStream;
|
import java.util.zip.GZIPInputStream;
|
||||||
|
|
||||||
|
import org.apache.commons.compress.compressors.gzip.GzipUtils;
|
||||||
|
|
||||||
import net.yacy.cora.document.id.DigestURL;
|
import net.yacy.cora.document.id.DigestURL;
|
||||||
import net.yacy.cora.document.id.MultiProtocolURL;
|
import net.yacy.cora.document.id.MultiProtocolURL;
|
||||||
import net.yacy.document.AbstractParser;
|
import net.yacy.document.AbstractParser;
|
||||||
|
@ -42,7 +45,6 @@ import net.yacy.document.Parser;
|
||||||
import net.yacy.document.TextParser;
|
import net.yacy.document.TextParser;
|
||||||
import net.yacy.document.VocabularyScraper;
|
import net.yacy.document.VocabularyScraper;
|
||||||
import net.yacy.kelondro.util.FileUtils;
|
import net.yacy.kelondro.util.FileUtils;
|
||||||
import org.apache.commons.compress.compressors.gzip.GzipUtils;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Parses a gz archive.
|
* Parses a gz archive.
|
||||||
|
@ -50,6 +52,8 @@ import org.apache.commons.compress.compressors.gzip.GzipUtils;
|
||||||
*/
|
*/
|
||||||
public class gzipParser extends AbstractParser implements Parser {
|
public class gzipParser extends AbstractParser implements Parser {
|
||||||
|
|
||||||
|
private static final int DEFAULT_DEPTH = 999;
|
||||||
|
|
||||||
public gzipParser() {
|
public gzipParser() {
|
||||||
super("GNU Zip Compressed Archive Parser");
|
super("GNU Zip Compressed Archive Parser");
|
||||||
this.SUPPORTED_EXTENSIONS.add("gz");
|
this.SUPPORTED_EXTENSIONS.add("gz");
|
||||||
|
@ -75,12 +79,18 @@ public class gzipParser extends AbstractParser implements Parser {
|
||||||
Document maindoc = null;
|
Document maindoc = null;
|
||||||
GZIPInputStream zippedContent = null;
|
GZIPInputStream zippedContent = null;
|
||||||
FileOutputStream out = null;
|
FileOutputStream out = null;
|
||||||
|
try {
|
||||||
|
zippedContent = new GZIPInputStream(source);
|
||||||
|
} catch(IOException e) {
|
||||||
|
/* Use a GZIPOpeningStreamException to signal the caller the error occurred directly on stream opening
|
||||||
|
* and eventually apply special error handling */
|
||||||
|
throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(), location,
|
||||||
|
new GZIPOpeningStreamException());
|
||||||
|
}
|
||||||
try {
|
try {
|
||||||
int read = 0;
|
int read = 0;
|
||||||
final byte[] data = new byte[1024];
|
final byte[] data = new byte[1024];
|
||||||
|
|
||||||
zippedContent = new GZIPInputStream(source);
|
|
||||||
|
|
||||||
tempFile = File.createTempFile("gunzip","tmp");
|
tempFile = File.createTempFile("gunzip","tmp");
|
||||||
|
|
||||||
// creating a temp file to store the uncompressed data
|
// creating a temp file to store the uncompressed data
|
||||||
|
@ -112,11 +122,11 @@ public class gzipParser extends AbstractParser implements Parser {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
try {
|
try {
|
||||||
maindoc = createMainDocument(location, mimeType, charset);
|
maindoc = createMainDocument(location, mimeType, charset, this);
|
||||||
// creating a new parser class to parse the unzipped content
|
// creating a new parser class to parse the unzipped content
|
||||||
final String contentfilename = GzipUtils.getUncompressedFilename(location.getFileName());
|
final String contentfilename = GzipUtils.getUncompressedFilename(location.getFileName());
|
||||||
final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename));
|
final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename));
|
||||||
Document[] docs = TextParser.parseSource(location, mime, null, scraper, timezoneOffset, 999, tempFile);
|
Document[] docs = TextParser.parseSource(location, mime, null, scraper, timezoneOffset, DEFAULT_DEPTH, tempFile);
|
||||||
if (docs != null) maindoc.addSubDocuments(docs);
|
if (docs != null) maindoc.addSubDocuments(docs);
|
||||||
} catch (final Exception e) {
|
} catch (final Exception e) {
|
||||||
if (e instanceof InterruptedException) throw (InterruptedException) e;
|
if (e instanceof InterruptedException) throw (InterruptedException) e;
|
||||||
|
@ -134,15 +144,16 @@ public class gzipParser extends AbstractParser implements Parser {
|
||||||
* @param location the parsed resource URL
|
* @param location the parsed resource URL
|
||||||
* @param mimeType the media type of the resource
|
* @param mimeType the media type of the resource
|
||||||
* @param charset the charset name if known
|
* @param charset the charset name if known
|
||||||
|
* @param an instance of gzipParser that is registered as the parser origin of the document
|
||||||
* @return a Document instance
|
* @return a Document instance
|
||||||
*/
|
*/
|
||||||
private Document createMainDocument(final DigestURL location, final String mimeType, final String charset) {
|
public static Document createMainDocument(final DigestURL location, final String mimeType, final String charset, final gzipParser parser) {
|
||||||
final String filename = location.getFileName();
|
final String filename = location.getFileName();
|
||||||
Document maindoc = new Document(
|
Document maindoc = new Document(
|
||||||
location,
|
location,
|
||||||
mimeType,
|
mimeType,
|
||||||
charset,
|
charset,
|
||||||
this,
|
parser,
|
||||||
null,
|
null,
|
||||||
null,
|
null,
|
||||||
AbstractParser.singleList(filename.isEmpty() ? location.toTokens() : MultiProtocolURL.unescape(filename)), // title
|
AbstractParser.singleList(filename.isEmpty() ? location.toTokens() : MultiProtocolURL.unescape(filename)), // title
|
||||||
|
@ -160,6 +171,41 @@ public class gzipParser extends AbstractParser implements Parser {
|
||||||
return maindoc;
|
return maindoc;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Parse content in an open stream uncompressing on the fly a gzipped resource.
|
||||||
|
* @param location the URL of the gzipped resource
|
||||||
|
* @param charset the charset name if known
|
||||||
|
* @param timezoneOffset the local time zone offset
|
||||||
|
* @param compressedInStream an open stream uncompressing on the fly the compressed content
|
||||||
|
* @param maxLinks
|
||||||
|
* the maximum total number of links to parse and add to the
|
||||||
|
* result documents
|
||||||
|
* @param maxBytes
|
||||||
|
* the maximum number of content bytes to process
|
||||||
|
* @return a list of documents that result from parsing the source, with
|
||||||
|
* empty or null text.
|
||||||
|
* @throws Parser.Failure
|
||||||
|
* when the parser processing failed
|
||||||
|
*/
|
||||||
|
public Document[] parseCompressedInputStream(final DigestURL location, final String charset, final int timezoneOffset, final int depth,
|
||||||
|
final InputStream compressedInStream, final int maxLinks, final long maxBytes) throws Failure {
|
||||||
|
// creating a new parser class to parse the unzipped content
|
||||||
|
final String compressedFileName = location.getFileName();
|
||||||
|
final String contentfilename = GzipUtils.getUncompressedFilename(compressedFileName);
|
||||||
|
final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename));
|
||||||
|
try {
|
||||||
|
/* Use the uncompressed file name for sub parsers to not unnecessarily use again the gzipparser */
|
||||||
|
final String locationPath = location.getPath();
|
||||||
|
final String contentPath = locationPath.substring(0, locationPath.length() - compressedFileName.length()) + contentfilename;
|
||||||
|
final DigestURL contentLocation = new DigestURL(location.getProtocol(), location.getHost(), location.getPort(), contentPath);
|
||||||
|
|
||||||
|
/* Rely on the supporting parsers to respect the maxLinks and maxBytes limits on compressed content */
|
||||||
|
return TextParser.parseWithLimits(contentLocation, mime, charset, timezoneOffset, depth, -1, compressedInStream, maxLinks, maxBytes);
|
||||||
|
} catch (MalformedURLException e) {
|
||||||
|
throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(), location);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean isParseWithLimitsSupported() {
|
public boolean isParseWithLimitsSupported() {
|
||||||
return true;
|
return true;
|
||||||
|
@ -177,21 +223,38 @@ public class gzipParser extends AbstractParser implements Parser {
|
||||||
* before an eventual OutOfMemory occurs */
|
* before an eventual OutOfMemory occurs */
|
||||||
zippedContent = new GZIPInputStream(source);
|
zippedContent = new GZIPInputStream(source);
|
||||||
} catch(IOException e) {
|
} catch(IOException e) {
|
||||||
throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(), location);
|
/* Use a GZIPOpeningStreamException to signal the caller the error occurred directly on stream opening
|
||||||
|
* and eventually apply special error handling */
|
||||||
|
throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(), location,
|
||||||
|
new GZIPOpeningStreamException());
|
||||||
}
|
}
|
||||||
try {
|
try {
|
||||||
maindoc = createMainDocument(location, mimeType, charset);
|
maindoc = createMainDocument(location, mimeType, charset, this);
|
||||||
// creating a new parser class to parse the unzipped content
|
|
||||||
final String contentfilename = GzipUtils.getUncompressedFilename(location.getFileName());
|
|
||||||
final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename));
|
|
||||||
|
|
||||||
/* Rely on the supporting parsers to respect the maxLinks and maxBytes limits on compressed content */
|
Document[] docs = parseCompressedInputStream(location, charset, timezoneOffset, DEFAULT_DEPTH, zippedContent, maxLinks, maxBytes);
|
||||||
Document[] docs = TextParser.parseWithLimits(location, mime, charset, timezoneOffset, -1, zippedContent, maxLinks, maxBytes);
|
if (docs != null) {
|
||||||
if (docs != null) maindoc.addSubDocuments(docs);
|
maindoc.addSubDocuments(docs);
|
||||||
|
}
|
||||||
} catch (final Exception e) {
|
} catch (final Exception e) {
|
||||||
throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(),location);
|
throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(),location);
|
||||||
}
|
}
|
||||||
return maindoc == null ? null : new Document[]{maindoc};
|
return maindoc == null ? null : new Document[]{maindoc};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Used to signal an error occurred when opening a gzipped input stream.
|
||||||
|
*/
|
||||||
|
public class GZIPOpeningStreamException extends Exception {
|
||||||
|
|
||||||
|
/** The serialization ID */
|
||||||
|
private static final long serialVersionUID = 2824038185373304636L;
|
||||||
|
|
||||||
|
public GZIPOpeningStreamException() {
|
||||||
|
super();
|
||||||
|
}
|
||||||
|
|
||||||
|
public GZIPOpeningStreamException(final String message) {
|
||||||
|
super(message);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue
Block a user