added iso,apk,dmg to extension-deny list

see also https://github.com/yacy/yacy_search_server/issues/510
zip is not on the list because it can be parsed
Michael Peter Christen 2022-10-05 16:28:50 +02:00
parent 761dbdf06d
commit d49f937b98
2 changed files with 237 additions and 237 deletions
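
For context, a minimal sketch of how the new deny list behaves at runtime. It uses only TextParser methods visible in the diff below; the wrapper class, the main() method and the import path are illustrative assumptions, not part of the commit:

    import net.yacy.document.TextParser;

    public class DenyListSketch {
        public static void main(final String[] args) {
            // The value of parser.extensions.deny is handed to TextParser at startup;
            // entries are comma-separated and (since this commit) trimmed, so
            // "iso, apk, dmg" works as well as "iso,apk,dmg".
            TextParser.setDenyExtension("iso,apk,dmg");

            // A later parse attempt on e.g. http://example.org/image.iso is rejected
            // inside TextParser.parsers() with
            //   Parser.Failure("file extension 'iso' is denied (1)", url)
            // unless a supplied Content-Type maps the resource to a supported parser
            // (see the denyExtensionx check in the diff below).
            System.out.println(TextParser.getDenyExtension()); // presumably echoes the configured list
        }
    }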

@@ -328,7 +328,7 @@ releases = DATA/RELEASE
 # the following mime-types are a blacklist for indexing:
 # parser.mime.deny: specifies mime-types that shall not be indexed
 parser.mime.deny=
-parser.extensions.deny=
+parser.extensions.deny=iso,apk,dmg
 # The audioTagParser is disabled by default as it needs to create a temporary file each time an audio resource is parsed
 # Audio file extensions and media types can be enabled in the ConfigParser_p.html page if this is not a problem with your install
 parser.enableAudioTags=false

@@ -80,16 +80,16 @@ public final class TextParser {
     private static final Object v = new Object();
     private static final Parser genericIdiom = new genericParser();
     /** A generic XML parser instance */
     private static final Parser genericXMLIdiom = new GenericXMLParser();
     //use LinkedHashSet for parser collection to use (init) order to prefered parser for same ext or mime
-    private static final Map<String, LinkedHashSet<Parser>> mime2parser = new ConcurrentHashMap<String, LinkedHashSet<Parser>>();
-    private static final ConcurrentHashMap<String, LinkedHashSet<Parser>> ext2parser = new ConcurrentHashMap<String, LinkedHashSet<Parser>>();
-    private static final Map<String, String> ext2mime = new ConcurrentHashMap<String, String>();
-    private static final Map<String, Object> denyMime = new ConcurrentHashMap<String, Object>();
-    private static final Map<String, Object> denyExtensionx = new ConcurrentHashMap<String, Object>();
+    private static final Map<String, LinkedHashSet<Parser>> mime2parser = new ConcurrentHashMap<>();
+    private static final ConcurrentHashMap<String, LinkedHashSet<Parser>> ext2parser = new ConcurrentHashMap<>();
+    private static final Map<String, String> ext2mime = new ConcurrentHashMap<>();
+    private static final Map<String, Object> denyMime = new ConcurrentHashMap<>();
+    private static final Map<String, Object> denyExtensionx = new ConcurrentHashMap<>();

     static {
         initParser(new apkParser());
@@ -130,19 +130,19 @@ public final class TextParser {
     }

     public static Set<Parser> parsers() {
-        final Set<Parser> c = new HashSet<Parser>();
-        for (Set<Parser> pl: ext2parser.values()) c.addAll(pl);
-        for (Set<Parser> pl: mime2parser.values()) c.addAll(pl);
+        final Set<Parser> c = new HashSet<>();
+        for (final Set<Parser> pl: ext2parser.values()) c.addAll(pl);
+        for (final Set<Parser> pl: mime2parser.values()) c.addAll(pl);
         return c;
     }

     /**
      * @return the set of all supported mime types
      */
     public static Set<String> supportedMimeTypes() {
         final Set<String> mimeTypes = new HashSet<>();
         mimeTypes.addAll(mime2parser.keySet());
         return mimeTypes;
     }

     private static void initParser(final Parser parser) {
@@ -153,7 +153,7 @@ public final class TextParser {
             if (prototypeMime == null) prototypeMime = mimeType;
             LinkedHashSet<Parser> p0 = mime2parser.get(mimeType);
             if (p0 == null) {
-                p0 = new LinkedHashSet<Parser>();
+                p0 = new LinkedHashSet<>();
                 mime2parser.put(mimeType, p0);
             }
             p0.add(parser);
@@ -172,7 +172,7 @@ public final class TextParser {
             ext = ext.toLowerCase(Locale.ROOT);
             LinkedHashSet<Parser> p0 = ext2parser.get(ext);
             if (p0 == null) {
-                p0 = new LinkedHashSet<Parser>();
+                p0 = new LinkedHashSet<>();
                 ext2parser.put(ext, p0);
             }
             p0.add(parser);
@@ -189,7 +189,7 @@ public final class TextParser {
             final int timezoneOffset,
             final int depth,
             final File sourceFile
             ) throws InterruptedException, Parser.Failure {
         BufferedInputStream sourceStream = null;
         Document[] docs = null;
@@ -223,7 +223,7 @@ public final class TextParser {
             final int timezoneOffset,
             final int depth,
             final byte[] content
             ) throws Parser.Failure {
         if (AbstractParser.log.isFine()) AbstractParser.log.fine("Parsing '" + location + "' from byte-array");
         mimeType = normalizeMimeType(mimeType);
         Set<Parser> idioms = null;
@@ -236,11 +236,11 @@ public final class TextParser {
         }
         assert !idioms.isEmpty() : "no parsers applied for url " + location.toNormalform(true);

-        Document[] docs = parseSource(location, mimeType, idioms, charset, ignore_class_name, scraper, timezoneOffset, depth, content, Integer.MAX_VALUE, Long.MAX_VALUE);
+        final Document[] docs = parseSource(location, mimeType, idioms, charset, ignore_class_name, scraper, timezoneOffset, depth, content, Integer.MAX_VALUE, Long.MAX_VALUE);

         return docs;
     }

     /**
      * Apply only the generic parser to the given content from location.
      */
@@ -253,17 +253,17 @@ public final class TextParser {
             final int timezoneOffset,
             final int depth,
             final byte[] content
             ) throws Parser.Failure {
         if (AbstractParser.log.isFine()) {
             AbstractParser.log.fine("Parsing '" + location + "' from byte-array, applying only the generic parser");
         }
         mimeType = normalizeMimeType(mimeType);
-        Set<Parser> idioms = new HashSet<>();
+        final Set<Parser> idioms = new HashSet<>();
         idioms.add(TextParser.genericIdiom);

         return parseSource(location, mimeType, idioms, charset, ignoreClassNames, scraper, timezoneOffset, depth, content, Integer.MAX_VALUE, Long.MAX_VALUE);
     }

     private static Document[] parseSource(
             final DigestURL location,
             String mimeType,
@@ -276,7 +276,7 @@ public final class TextParser {
             final InputStream sourceStream,
             final int maxLinks,
             final long maxBytes
             ) throws Parser.Failure {
         if (AbstractParser.log.isFine()) AbstractParser.log.fine("Parsing '" + location + "' from stream");
         mimeType = normalizeMimeType(mimeType);
         Set<Parser> idioms = null;
@@ -291,126 +291,126 @@ public final class TextParser {
         boolean canStream = false;
         if(idioms.size() == 1) {
             canStream = true;
         } else if(idioms.size() == 2) {
             /* When there are only 2 available parsers, stream oriented parsing can still be applied when one of the 2 parsers is the generic one */
-            for(Parser idiom : idioms) {
+            for(final Parser idiom : idioms) {
                 if(idiom instanceof genericParser) {
                     canStream = true;
                 }
             }
         } else if(sourceStream instanceof ByteArrayInputStream) {
             /* Also check if we have a ByteArrayInputStream as source to prevent useless bytes duplication in a new byte array */
             canStream = true;
         }

         // if we do not have more than one non generic parser, or the content size is over MaxInt (2GB), or is over the totally available memory,
         // or stream is already in memory as a ByteArrayInputStream
         // then we use only stream-oriented parser.
         if (canStream || contentLength > Integer.MAX_VALUE || contentLength > MemoryControl.available()) {
             try {
                 /* The size of the buffer on the stream must be large enough to allow parser implementations to start parsing the resource
                  * and eventually fail, but must also be larger than eventual parsers internal buffers such as BufferedInputStream.DEFAULT_BUFFER_SIZE (8192 bytes) */
-                int rewindSize = 10 * 1024;
+                final int rewindSize = 10 * 1024;
                 final InputStream markableStream;
                 if(sourceStream instanceof ByteArrayInputStream) {
                     /* No nead to use a wrapping buffered stream when the source is already entirely in memory.
                      * What's more, ByteArrayInputStream has no read limit when marking.*/
                     markableStream = sourceStream;
                 } else {
                     markableStream = new BufferedInputStream(sourceStream, rewindSize);
                 }
                 /* Mark now to allow resetting the buffered stream to the beginning of the stream */
                 markableStream.mark(rewindSize);

                 /* Loop on parser : they are supposed to be sorted in order to start with the most specific and end with the most generic */
-                for(Parser parser : idioms) {
+                for(final Parser parser : idioms) {
                     /* Wrap in a CloseShieldInputStream to prevent SAX parsers closing the sourceStream
                      * and so let us eventually reuse the same opened stream with other parsers on parser failure */
                     CloseShieldInputStream nonCloseInputStream = new CloseShieldInputStream(markableStream);

                     try {
                         return parseSource(location, mimeType, parser, charset, ignore_class_name, scraper, timezoneOffset,
                                 nonCloseInputStream, maxLinks, maxBytes);
-                    } catch (Parser.Failure e) {
+                    } catch (final Parser.Failure e) {
                         /* Try to reset the marked stream. If the failed parser has consumed too many bytes :
                          * too bad, the marks is invalid and process fails now with an IOException */
                         markableStream.reset();

                         if(parser instanceof gzipParser && e.getCause() instanceof GZIPOpeningStreamException
                                 && (idioms.size() == 1 || (idioms.size() == 2 && idioms.contains(genericIdiom)))) {
                             /* The gzip parser failed directly when opening the content stream : before falling back to the generic parser,
                              * let's have a chance to parse the stream as uncompressed. */
                             /* Indeed, this can be a case of misconfigured web server, providing both headers "Content-Encoding" with value "gzip",
                              * and "Content-type" with value such as "application/gzip".
                              * In that case our HTTP client (see GzipResponseInterceptor) is already uncompressing the stream on the fly,
                              * that's why the gzipparser fails opening the stream.
                              * (see RFC 7231 section 3.1.2.2 for "Content-Encoding" header specification https://tools.ietf.org/html/rfc7231#section-3.1.2.2)*/
-                            gzipParser gzParser = (gzipParser)parser;
+                            final gzipParser gzParser = (gzipParser)parser;
                             nonCloseInputStream = new CloseShieldInputStream(markableStream);
-                            Document maindoc = gzipParser.createMainDocument(location, mimeType, charset, gzParser);
+                            final Document maindoc = gzipParser.createMainDocument(location, mimeType, charset, gzParser);
                             try {
-                                Document[] docs = gzParser.parseCompressedInputStream(location,
+                                final Document[] docs = gzParser.parseCompressedInputStream(location,
                                         charset, timezoneOffset, depth,
                                         nonCloseInputStream, maxLinks, maxBytes);
                                 if (docs != null) {
                                     maindoc.addSubDocuments(docs);
                                 }
                                 return new Document[] { maindoc };
-                            } catch(Exception e1) {
+                            } catch(final Exception e1) {
                                 /* Try again to reset the marked stream if the failed parser has not consumed too many bytes */
                                 markableStream.reset();
                             }
                         }
                     }
                 }
-            } catch (IOException e) {
+            } catch (final IOException e) {
                 throw new Parser.Failure("Error reading source", location);
             }
         }

         // in case that we know more parsers we first transform the content into a byte[] and use that as base
         // for a number of different parse attempts.

         int maxBytesToRead = -1;
         if(maxBytes < Integer.MAX_VALUE) {
             /* Load at most maxBytes + 1 :
                - to let parsers not supporting Parser.parseWithLimits detect the maxBytes size is exceeded and end with a Parser.Failure
                - but let parsers supporting Parser.parseWithLimits perform partial parsing of maxBytes content */
             maxBytesToRead = (int)maxBytes + 1;
         }
         if(contentLength >= 0 && contentLength < maxBytesToRead) {
             maxBytesToRead = (int)contentLength;
         }

         byte[] b = null;
         try {
             b = FileUtils.read(sourceStream, maxBytesToRead);
         } catch (final IOException e) {
             throw new Parser.Failure(e.getMessage(), location);
         }

-        Document[] docs = parseSource(location, mimeType, idioms, charset, ignore_class_name, scraper, timezoneOffset, depth, b, maxLinks, maxBytes);
+        final Document[] docs = parseSource(location, mimeType, idioms, charset, ignore_class_name, scraper, timezoneOffset, depth, b, maxLinks, maxBytes);

         return docs;
     }

     public static Document[] parseSource(final DigestURL location, String mimeType, final String charset,
             final Set<String> ignore_class_name,
             final VocabularyScraper scraper, final int timezoneOffset, final int depth, final long contentLength,
             final InputStream sourceStream) throws Parser.Failure {
         return parseSource(location, mimeType, charset, ignore_class_name, scraper, timezoneOffset, depth, contentLength, sourceStream,
                 Integer.MAX_VALUE, Long.MAX_VALUE);
     }

     /**
      * Try to limit the parser processing with a maximum total number of links detection (anchors, images links, media links...)
      * or a maximum amount of content bytes to parse. Limits apply only when the available parsers for the resource media type support parsing within limits
      * (see {@link Parser#isParseWithLimitsSupported()}. When available parsers do
      * not support parsing within limits, an exception is thrown when
      * content size is beyond maxBytes.
      * @param location the URL of the source
      * @param mimeType the mime type of the source, if known
      * @param charset the charset name of the source, if known
@@ -424,19 +424,19 @@ public final class TextParser {
      * @return a list of documents that result from parsing the source, with empty or null text.
      * @throws Parser.Failure when the parser processing failed
      */
     public static Document[] parseWithLimits(final DigestURL location, String mimeType, final String charset, final Set<String> ignoreClassNames,
             final int timezoneOffset, final int depth, final long contentLength, final InputStream sourceStream, int maxLinks,
             long maxBytes) throws Parser.Failure{
         return parseSource(location, mimeType, charset, ignoreClassNames, new VocabularyScraper(), timezoneOffset, depth, contentLength,
                 sourceStream, maxLinks, maxBytes);
     }

     /**
      * Try to limit the parser processing with a maximum total number of links detection (anchors, images links, media links...)
      * or a maximum amount of content bytes to parse. Limits apply only when the available parsers for the resource media type support parsing within limits
      * (see {@link Parser#isParseWithLimitsSupported()}. When available parsers do
      * not support parsing within limits, an exception is thrown when
      * content size is beyond maxBytes.
      * @param location the URL of the source
      * @param mimeType the mime type of the source, if known
      * @param charset the charset name of the source, if known
@@ -449,15 +449,15 @@ public final class TextParser {
      * @return a list of documents that result from parsing the source, with empty or null text.
      * @throws Parser.Failure when the parser processing failed
      */
     public static Document[] parseWithLimits(final DigestURL location, String mimeType, final String charset,
             final int timezoneOffset, final int depth, final long contentLength, final InputStream sourceStream, int maxLinks,
             long maxBytes) throws Parser.Failure{
         return parseSource(location, mimeType, charset, new HashSet<String>(), new VocabularyScraper(), timezoneOffset, depth, contentLength,
                 sourceStream, maxLinks, maxBytes);
     }

     /**
      *
      * @param location the URL of the source
      * @param mimeType the mime type of the source, if known
      * @param parser a parser supporting the resource at location
@@ -481,7 +481,7 @@ public final class TextParser {
             final InputStream sourceStream,
             final int maxLinks,
             final long maxBytes
             ) throws Parser.Failure {
         if (AbstractParser.log.isFine()) AbstractParser.log.fine("Parsing '" + location + "' from stream");
         final String fileExt = MultiProtocolURL.getFileExtension(location.getFileName());
         final String documentCharset = htmlParser.patchCharsetEncoding(charset);
@@ -491,15 +491,15 @@ public final class TextParser {
         try {
             final Document[] docs;
             if(parser.isParseWithLimitsSupported()) {
                 docs = parser.parseWithLimits(location, mimeType, documentCharset, ignore_class_name, scraper, timezoneOffset, sourceStream, maxLinks, maxBytes);
             } else {
                 /* Parser do not support partial parsing within limits : let's control it here*/
-                InputStream limitedSource = new StrictLimitInputStream(sourceStream, maxBytes);
+                final InputStream limitedSource = new StrictLimitInputStream(sourceStream, maxBytes);
                 docs = parser.parse(location, mimeType, documentCharset, ignore_class_name, scraper, timezoneOffset, limitedSource);
             }
             return docs;
-        } catch(Parser.Failure e) {
+        } catch(final Parser.Failure e) {
             throw e;
         } catch (final Exception e) {
             throw new Parser.Failure("parser failed: " + parser.getName(), location);
         }
@@ -531,77 +531,77 @@ public final class TextParser {
             final byte[] sourceArray,
             final int maxLinks,
             final long maxBytes
             ) throws Parser.Failure {
         final String fileExt = MultiProtocolURL.getFileExtension(location.getFileName());
         if (AbstractParser.log.isFine()) AbstractParser.log.fine("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + fileExt + "' from byte[]");
         final String documentCharset = htmlParser.patchCharsetEncoding(charset);
         assert !parsers.isEmpty();

         Document[] docs = null;
-        final Map<Parser, Parser.Failure> failedParser = new HashMap<Parser, Parser.Failure>();
-        String origName = Thread.currentThread().getName();
+        final Map<Parser, Parser.Failure> failedParser = new HashMap<>();
+        final String origName = Thread.currentThread().getName();
         Thread.currentThread().setName("parsing + " + location.toString()); // set a name to get the address in Thread Dump
         for (final Parser parser: parsers) {
             if (MemoryControl.request(sourceArray.length * 6, false)) {
                 ByteArrayInputStream bis;
                 if (mimeType.equals("text/plain") && parser.getName().equals("HTML Parser")) {
                     // a hack to simulate html files .. is needed for NOLOAD queues. This throws their data into virtual text/plain messages.
                     bis = new ByteArrayInputStream(UTF8.getBytes("<html><head></head><body><h1>" + UTF8.String(sourceArray) + "</h1></body><html>"));
                 } else {
                     bis = new ByteArrayInputStream(sourceArray);
                 }
                 try {
                     if(parser.isParseWithLimitsSupported()) {
                         docs = parser.parseWithLimits(location, mimeType, documentCharset, ignore_class_name, scraper, timezoneOffset, bis, maxLinks, maxBytes);
                     } else {
                         /* Partial parsing is not supported by this parser : check content length now */
                         if(sourceArray.length > maxBytes) {
                             throw new Parser.Failure("Content size is over maximum size of " + maxBytes + "", location);
                         }
                         docs = parser.parse(location, mimeType, documentCharset, ignore_class_name, scraper, timezoneOffset, bis);
                     }
                 } catch (final Parser.Failure e) {
                     if(parser instanceof gzipParser && e.getCause() instanceof GZIPOpeningStreamException &&
                             (parsers.size() == 1 || (parsers.size() == 2 && parsers.contains(genericIdiom)))) {
                         /* The gzip parser failed directly when opening the content stream : before falling back to the generic parser,
                          * let's have a chance to parse the stream as uncompressed. */
                         /* Indeed, this can be a case of misconfigured web server, providing both headers "Content-Encoding" with value "gzip",
                          * and "Content-type" with value such as "application/gzip".
                          * In that case our HTTP client (see GzipResponseInterceptor) is already uncompressing the stream on the fly,
                          * that's why the gzipparser fails opening the stream.
                          * (see RFC 7231 section 3.1.2.2 for "Content-Encoding" header specification https://tools.ietf.org/html/rfc7231#section-3.1.2.2)*/
-                        gzipParser gzParser = (gzipParser)parser;
+                        final gzipParser gzParser = (gzipParser)parser;

                         bis = new ByteArrayInputStream(sourceArray);

-                        Document maindoc = gzipParser.createMainDocument(location, mimeType, charset, gzParser);
+                        final Document maindoc = gzipParser.createMainDocument(location, mimeType, charset, gzParser);

                         try {
                             docs = gzParser.parseCompressedInputStream(location,
                                     charset, timezoneOffset, depth,
                                     bis, maxLinks, maxBytes);
                             if (docs != null) {
                                 maindoc.addSubDocuments(docs);
                             }
                             docs = new Document[] { maindoc };
                             break;
-                        } catch(Parser.Failure e1) {
+                        } catch(final Parser.Failure e1) {
                             failedParser.put(parser, e1);
-                        } catch(Exception e2) {
+                        } catch(final Exception e2) {
                             failedParser.put(parser, new Parser.Failure(e2.getMessage(), location));
                         }
                     } else {
                         failedParser.put(parser, e);
                     }
                 } catch (final Exception e) {
                     failedParser.put(parser, new Parser.Failure(e.getMessage(), location));
                     //log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e);
                 } finally {
                     try {
                         bis.close();
-                    } catch(IOException ioe) {
+                    } catch(final IOException ioe) {
                         // Ignore.
                     }
                 }
                 if (docs != null) break;
             }
@@ -616,22 +616,22 @@ public final class TextParser {
             }
             String failedParsers = "";
             for (final Map.Entry<Parser, Parser.Failure> error: failedParser.entrySet()) {
                 AbstractParser.log.warn("tried parser '" + error.getKey().getName() + "' to parse " + location.toNormalform(true) + " but failed: " + error.getValue().getMessage(), error.getValue());
                 failedParsers += error.getKey().getName() + " ";
             }
             throw new Parser.Failure("All parser failed: " + failedParsers, location);
         }
         for (final Document d: docs) {
-            InputStream textStream = d.getTextStream();
+            final InputStream textStream = d.getTextStream();
             assert textStream != null : "mimeType = " + mimeType;
             try {
                 if(textStream != null) {
                     /* textStream can be a FileInputStream : we must close it to ensure releasing system resource */
                     textStream.close();
                 }
-            } catch (IOException e) {
+            } catch (final IOException e) {
                 AbstractParser.log.warn("Could not close text input stream");
             }
             d.setDepth(depth);
         } // verify docs
@@ -670,7 +670,7 @@ public final class TextParser {
      * @throws Parser.Failure when the file extension or the MIME type is denied
      */
     private static Set<Parser> parsers(final MultiProtocolURL url, String mimeType1) throws Parser.Failure {
-        final Set<Parser> idioms = new LinkedHashSet<Parser>(2); // LinkedSet to maintain order (genericParser should be last)
+        final Set<Parser> idioms = new LinkedHashSet<>(2); // LinkedSet to maintain order (genericParser should be last)

         // check given mime type, place this first because this is the most likely to work and the best fit to the supplied mime
         Set<Parser> idiom;
@@ -682,13 +682,13 @@ public final class TextParser {
         }

         // check extension and add as backup (in case no, wrong or unknown/unsupported mime was supplied)
-        String ext = MultiProtocolURL.getFileExtension(url.getFileName());
+        final String ext = MultiProtocolURL.getFileExtension(url.getFileName());
         if (ext != null && ext.length() > 0) {
             /* We do not throw here an exception when the media type is provided and inconsistent with the extension (if it is not supported an exception has already beeen thrown).
              * Otherwise we would reject URLs with an apparently unsupported extension but whose actual Media Type is supported (for example text/html).
              * Notable example : wikimedia commons pages, such as https://commons.wikimedia.org/wiki/File:YaCy_logo.png */
             if (denyExtensionx.containsKey(ext) && (mimeType1 == null || mimeType1.equals(mimeOf(ext)))) {
                 throw new Parser.Failure("file extension '" + ext + "' is denied (1)", url);
             }
             idiom = ext2parser.get(ext);
             if (idiom != null && !idioms.containsAll(idiom)) { // use containsAll -> idiom is a Set of parser
@@ -701,11 +701,11 @@ public final class TextParser {
         if (mimeType2 != null && (idiom = mime2parser.get(mimeType2)) != null && !idioms.containsAll(idiom)) { // use containsAll -> idiom is a Set of parser
             idioms.addAll(idiom);
         }

         /* No matching idiom has been found : let's check if the media type ends with the "+xml" suffix so we can handle it with a generic XML parser
          * (see RFC 7303 - Using '+xml' when Registering XML-Based Media Types : https://tools.ietf.org/html/rfc7303#section-4.2) */
         if(idioms.isEmpty() && mimeType1 != null && mimeType1.endsWith("+xml")) {
             idioms.add(genericXMLIdiom);
         }

         // always add the generic parser (make sure it is the last in access order)
@@ -723,18 +723,18 @@ public final class TextParser {
      */
     public static String supportsMime(String mimeType) {
         if (mimeType == null) {
             return null;
         }
         mimeType = normalizeMimeType(mimeType);
         if (denyMime.containsKey(mimeType)) {
             return "mime type '" + mimeType + "' is denied (2)";
         }
         if (mime2parser.get(mimeType) == null) {
             /* No matching idiom has been found : let's check if the media type ends with the "+xml" suffix as can handle it with a generic XML parser
              * (see RFC 7303 - Using '+xml' when Registering XML-Based Media Types : https://tools.ietf.org/html/rfc7303#section-4.2) */
             if(!mimeType.endsWith("+xml")) {
                 return "no parser for mime '" + mimeType + "' available";
             }
         }
         return null;
     }
@@ -774,20 +774,20 @@ public final class TextParser {
         return ext2mime.get(ext.toLowerCase(Locale.ROOT));
     }

     /**
      * Normalize a media type information string (can be a HTTP "Content-Type"
      * response header) : convert to lower case, remove any supplementary
      * parameters such as the encoding (charset name), and provide a default
      * value when null.
      *
      * @param mimeType
      *            raw information about media type, eventually provided by a
      *            HTTP "Content-Type" response header
      * @return a non null media type in lower case
      */
     public static String normalizeMimeType(String mimeType) {
         if (mimeType == null) {
             return "application/octet-stream";
         }
         mimeType = mimeType.toLowerCase(Locale.ROOT);
         final int pos = mimeType.indexOf(';');
@@ -818,7 +818,7 @@ public final class TextParser {
     public static void setDenyExtension(final String denyList) {
         denyExtensionx.clear();
-        for (final String s: CommonPattern.COMMA.split(denyList)) denyExtensionx.put(s, v);
+        for (final String s: CommonPattern.COMMA.split(denyList)) denyExtensionx.put(s.trim(), v);
     }

     public static String getDenyExtension() {