mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
added iso,apk,dmg to extension-deny list
see also https://github.com/yacy/yacy_search_server/issues/510 (zip is not on the list because it can be parsed)
This commit is contained in:
parent
761dbdf06d
commit
d49f937b98
|
@ -328,7 +328,7 @@ releases = DATA/RELEASE
|
||||||
# the following mime-types are a blacklist for indexing:
|
# the following mime-types are a blacklist for indexing:
|
||||||
# parser.mime.deny: specifies mime-types that shall not be indexed
|
# parser.mime.deny: specifies mime-types that shall not be indexed
|
||||||
parser.mime.deny=
|
parser.mime.deny=
|
||||||
parser.extensions.deny=
|
parser.extensions.deny=iso,apk,dmg
|
||||||
# The audioTagParser is disabled by default as it needs to create a temporary file each time an audio resource is parsed
|
# The audioTagParser is disabled by default as it needs to create a temporary file each time an audio resource is parsed
|
||||||
# Audio file extensions and media types can be enabled in the ConfigParser_p.html page if this is not a problem with your install
|
# Audio file extensions and media types can be enabled in the ConfigParser_p.html page if this is not a problem with your install
|
||||||
parser.enableAudioTags=false
|
parser.enableAudioTags=false
|
||||||
|
|
|
@ -80,16 +80,16 @@ public final class TextParser {
|
||||||
private static final Object v = new Object();
|
private static final Object v = new Object();
|
||||||
|
|
||||||
private static final Parser genericIdiom = new genericParser();
|
private static final Parser genericIdiom = new genericParser();
|
||||||
|
|
||||||
/** A generic XML parser instance */
|
/** A generic XML parser instance */
|
||||||
private static final Parser genericXMLIdiom = new GenericXMLParser();
|
private static final Parser genericXMLIdiom = new GenericXMLParser();
|
||||||
|
|
||||||
//use LinkedHashSet for parser collection so that (init) order selects the preferred parser for same ext or mime
|
//use LinkedHashSet for parser collection so that (init) order selects the preferred parser for same ext or mime
|
||||||
private static final Map<String, LinkedHashSet<Parser>> mime2parser = new ConcurrentHashMap<String, LinkedHashSet<Parser>>();
|
private static final Map<String, LinkedHashSet<Parser>> mime2parser = new ConcurrentHashMap<>();
|
||||||
private static final ConcurrentHashMap<String, LinkedHashSet<Parser>> ext2parser = new ConcurrentHashMap<String, LinkedHashSet<Parser>>();
|
private static final ConcurrentHashMap<String, LinkedHashSet<Parser>> ext2parser = new ConcurrentHashMap<>();
|
||||||
private static final Map<String, String> ext2mime = new ConcurrentHashMap<String, String>();
|
private static final Map<String, String> ext2mime = new ConcurrentHashMap<>();
|
||||||
private static final Map<String, Object> denyMime = new ConcurrentHashMap<String, Object>();
|
private static final Map<String, Object> denyMime = new ConcurrentHashMap<>();
|
||||||
private static final Map<String, Object> denyExtensionx = new ConcurrentHashMap<String, Object>();
|
private static final Map<String, Object> denyExtensionx = new ConcurrentHashMap<>();
|
||||||
|
|
||||||
static {
|
static {
|
||||||
initParser(new apkParser());
|
initParser(new apkParser());
|
||||||
|
@ -130,19 +130,19 @@ public final class TextParser {
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Set<Parser> parsers() {
|
public static Set<Parser> parsers() {
|
||||||
final Set<Parser> c = new HashSet<Parser>();
|
final Set<Parser> c = new HashSet<>();
|
||||||
for (Set<Parser> pl: ext2parser.values()) c.addAll(pl);
|
for (final Set<Parser> pl: ext2parser.values()) c.addAll(pl);
|
||||||
for (Set<Parser> pl: mime2parser.values()) c.addAll(pl);
|
for (final Set<Parser> pl: mime2parser.values()) c.addAll(pl);
|
||||||
return c;
|
return c;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @return the set of all supported mime types
|
* @return the set of all supported mime types
|
||||||
*/
|
*/
|
||||||
public static Set<String> supportedMimeTypes() {
|
public static Set<String> supportedMimeTypes() {
|
||||||
final Set<String> mimeTypes = new HashSet<>();
|
final Set<String> mimeTypes = new HashSet<>();
|
||||||
mimeTypes.addAll(mime2parser.keySet());
|
mimeTypes.addAll(mime2parser.keySet());
|
||||||
return mimeTypes;
|
return mimeTypes;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void initParser(final Parser parser) {
|
private static void initParser(final Parser parser) {
|
||||||
|
@ -153,7 +153,7 @@ public final class TextParser {
|
||||||
if (prototypeMime == null) prototypeMime = mimeType;
|
if (prototypeMime == null) prototypeMime = mimeType;
|
||||||
LinkedHashSet<Parser> p0 = mime2parser.get(mimeType);
|
LinkedHashSet<Parser> p0 = mime2parser.get(mimeType);
|
||||||
if (p0 == null) {
|
if (p0 == null) {
|
||||||
p0 = new LinkedHashSet<Parser>();
|
p0 = new LinkedHashSet<>();
|
||||||
mime2parser.put(mimeType, p0);
|
mime2parser.put(mimeType, p0);
|
||||||
}
|
}
|
||||||
p0.add(parser);
|
p0.add(parser);
|
||||||
|
@ -172,7 +172,7 @@ public final class TextParser {
|
||||||
ext = ext.toLowerCase(Locale.ROOT);
|
ext = ext.toLowerCase(Locale.ROOT);
|
||||||
LinkedHashSet<Parser> p0 = ext2parser.get(ext);
|
LinkedHashSet<Parser> p0 = ext2parser.get(ext);
|
||||||
if (p0 == null) {
|
if (p0 == null) {
|
||||||
p0 = new LinkedHashSet<Parser>();
|
p0 = new LinkedHashSet<>();
|
||||||
ext2parser.put(ext, p0);
|
ext2parser.put(ext, p0);
|
||||||
}
|
}
|
||||||
p0.add(parser);
|
p0.add(parser);
|
||||||
|
@ -189,7 +189,7 @@ public final class TextParser {
|
||||||
final int timezoneOffset,
|
final int timezoneOffset,
|
||||||
final int depth,
|
final int depth,
|
||||||
final File sourceFile
|
final File sourceFile
|
||||||
) throws InterruptedException, Parser.Failure {
|
) throws InterruptedException, Parser.Failure {
|
||||||
|
|
||||||
BufferedInputStream sourceStream = null;
|
BufferedInputStream sourceStream = null;
|
||||||
Document[] docs = null;
|
Document[] docs = null;
|
||||||
|
@ -223,7 +223,7 @@ public final class TextParser {
|
||||||
final int timezoneOffset,
|
final int timezoneOffset,
|
||||||
final int depth,
|
final int depth,
|
||||||
final byte[] content
|
final byte[] content
|
||||||
) throws Parser.Failure {
|
) throws Parser.Failure {
|
||||||
if (AbstractParser.log.isFine()) AbstractParser.log.fine("Parsing '" + location + "' from byte-array");
|
if (AbstractParser.log.isFine()) AbstractParser.log.fine("Parsing '" + location + "' from byte-array");
|
||||||
mimeType = normalizeMimeType(mimeType);
|
mimeType = normalizeMimeType(mimeType);
|
||||||
Set<Parser> idioms = null;
|
Set<Parser> idioms = null;
|
||||||
|
@ -236,11 +236,11 @@ public final class TextParser {
|
||||||
}
|
}
|
||||||
assert !idioms.isEmpty() : "no parsers applied for url " + location.toNormalform(true);
|
assert !idioms.isEmpty() : "no parsers applied for url " + location.toNormalform(true);
|
||||||
|
|
||||||
Document[] docs = parseSource(location, mimeType, idioms, charset, ignore_class_name, scraper, timezoneOffset, depth, content, Integer.MAX_VALUE, Long.MAX_VALUE);
|
final Document[] docs = parseSource(location, mimeType, idioms, charset, ignore_class_name, scraper, timezoneOffset, depth, content, Integer.MAX_VALUE, Long.MAX_VALUE);
|
||||||
|
|
||||||
return docs;
|
return docs;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Apply only the generic parser to the given content from location.
|
* Apply only the generic parser to the given content from location.
|
||||||
*/
|
*/
|
||||||
|
@ -253,17 +253,17 @@ public final class TextParser {
|
||||||
final int timezoneOffset,
|
final int timezoneOffset,
|
||||||
final int depth,
|
final int depth,
|
||||||
final byte[] content
|
final byte[] content
|
||||||
) throws Parser.Failure {
|
) throws Parser.Failure {
|
||||||
if (AbstractParser.log.isFine()) {
|
if (AbstractParser.log.isFine()) {
|
||||||
AbstractParser.log.fine("Parsing '" + location + "' from byte-array, applying only the generic parser");
|
AbstractParser.log.fine("Parsing '" + location + "' from byte-array, applying only the generic parser");
|
||||||
}
|
}
|
||||||
mimeType = normalizeMimeType(mimeType);
|
mimeType = normalizeMimeType(mimeType);
|
||||||
Set<Parser> idioms = new HashSet<>();
|
final Set<Parser> idioms = new HashSet<>();
|
||||||
idioms.add(TextParser.genericIdiom);
|
idioms.add(TextParser.genericIdiom);
|
||||||
|
|
||||||
return parseSource(location, mimeType, idioms, charset, ignoreClassNames, scraper, timezoneOffset, depth, content, Integer.MAX_VALUE, Long.MAX_VALUE);
|
return parseSource(location, mimeType, idioms, charset, ignoreClassNames, scraper, timezoneOffset, depth, content, Integer.MAX_VALUE, Long.MAX_VALUE);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Document[] parseSource(
|
private static Document[] parseSource(
|
||||||
final DigestURL location,
|
final DigestURL location,
|
||||||
String mimeType,
|
String mimeType,
|
||||||
|
@ -276,7 +276,7 @@ public final class TextParser {
|
||||||
final InputStream sourceStream,
|
final InputStream sourceStream,
|
||||||
final int maxLinks,
|
final int maxLinks,
|
||||||
final long maxBytes
|
final long maxBytes
|
||||||
) throws Parser.Failure {
|
) throws Parser.Failure {
|
||||||
if (AbstractParser.log.isFine()) AbstractParser.log.fine("Parsing '" + location + "' from stream");
|
if (AbstractParser.log.isFine()) AbstractParser.log.fine("Parsing '" + location + "' from stream");
|
||||||
mimeType = normalizeMimeType(mimeType);
|
mimeType = normalizeMimeType(mimeType);
|
||||||
Set<Parser> idioms = null;
|
Set<Parser> idioms = null;
|
||||||
|
@ -291,126 +291,126 @@ public final class TextParser {
|
||||||
|
|
||||||
boolean canStream = false;
|
boolean canStream = false;
|
||||||
if(idioms.size() == 1) {
|
if(idioms.size() == 1) {
|
||||||
canStream = true;
|
canStream = true;
|
||||||
} else if(idioms.size() == 2) {
|
} else if(idioms.size() == 2) {
|
||||||
/* When there are only 2 available parsers, stream oriented parsing can still be applied when one of the 2 parsers is the generic one */
|
/* When there are only 2 available parsers, stream oriented parsing can still be applied when one of the 2 parsers is the generic one */
|
||||||
for(Parser idiom : idioms) {
|
for(final Parser idiom : idioms) {
|
||||||
if(idiom instanceof genericParser) {
|
if(idiom instanceof genericParser) {
|
||||||
canStream = true;
|
canStream = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else if(sourceStream instanceof ByteArrayInputStream) {
|
} else if(sourceStream instanceof ByteArrayInputStream) {
|
||||||
/* Also check if we have a ByteArrayInputStream as source to prevent useless bytes duplication in a new byte array */
|
/* Also check if we have a ByteArrayInputStream as source to prevent useless bytes duplication in a new byte array */
|
||||||
canStream = true;
|
canStream = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// if we do not have more than one non generic parser, or the content size is over MaxInt (2GB), or is over the totally available memory,
|
// if we do not have more than one non generic parser, or the content size is over MaxInt (2GB), or is over the totally available memory,
|
||||||
// or stream is already in memory as a ByteArrayInputStream
|
// or stream is already in memory as a ByteArrayInputStream
|
||||||
// then we use only stream-oriented parser.
|
// then we use only stream-oriented parser.
|
||||||
if (canStream || contentLength > Integer.MAX_VALUE || contentLength > MemoryControl.available()) {
|
if (canStream || contentLength > Integer.MAX_VALUE || contentLength > MemoryControl.available()) {
|
||||||
try {
|
try {
|
||||||
/* The size of the buffer on the stream must be large enough to allow parser implementations to start parsing the resource
|
/* The size of the buffer on the stream must be large enough to allow parser implementations to start parsing the resource
|
||||||
* and eventually fail, but must also be larger than eventual parsers internal buffers such as BufferedInputStream.DEFAULT_BUFFER_SIZE (8192 bytes) */
|
* and eventually fail, but must also be larger than eventual parsers internal buffers such as BufferedInputStream.DEFAULT_BUFFER_SIZE (8192 bytes) */
|
||||||
int rewindSize = 10 * 1024;
|
final int rewindSize = 10 * 1024;
|
||||||
final InputStream markableStream;
|
final InputStream markableStream;
|
||||||
if(sourceStream instanceof ByteArrayInputStream) {
|
if(sourceStream instanceof ByteArrayInputStream) {
|
||||||
/* No need to use a wrapping buffered stream when the source is already entirely in memory.
|
/* No need to use a wrapping buffered stream when the source is already entirely in memory.
|
||||||
* What's more, ByteArrayInputStream has no read limit when marking.*/
|
* What's more, ByteArrayInputStream has no read limit when marking.*/
|
||||||
markableStream = sourceStream;
|
markableStream = sourceStream;
|
||||||
} else {
|
} else {
|
||||||
markableStream = new BufferedInputStream(sourceStream, rewindSize);
|
markableStream = new BufferedInputStream(sourceStream, rewindSize);
|
||||||
}
|
}
|
||||||
/* Mark now to allow resetting the buffered stream to the beginning of the stream */
|
/* Mark now to allow resetting the buffered stream to the beginning of the stream */
|
||||||
markableStream.mark(rewindSize);
|
markableStream.mark(rewindSize);
|
||||||
|
|
||||||
/* Loop on parser : they are supposed to be sorted in order to start with the most specific and end with the most generic */
|
|
||||||
for(Parser parser : idioms) {
|
|
||||||
/* Wrap in a CloseShieldInputStream to prevent SAX parsers closing the sourceStream
|
|
||||||
* and so let us eventually reuse the same opened stream with other parsers on parser failure */
|
|
||||||
CloseShieldInputStream nonCloseInputStream = new CloseShieldInputStream(markableStream);
|
|
||||||
|
|
||||||
try {
|
|
||||||
return parseSource(location, mimeType, parser, charset, ignore_class_name, scraper, timezoneOffset,
|
|
||||||
nonCloseInputStream, maxLinks, maxBytes);
|
|
||||||
} catch (Parser.Failure e) {
|
|
||||||
/* Try to reset the marked stream. If the failed parser has consumed too many bytes :
|
|
||||||
* too bad, the marks is invalid and process fails now with an IOException */
|
|
||||||
markableStream.reset();
|
|
||||||
|
|
||||||
if(parser instanceof gzipParser && e.getCause() instanceof GZIPOpeningStreamException
|
|
||||||
&& (idioms.size() == 1 || (idioms.size() == 2 && idioms.contains(genericIdiom)))) {
|
|
||||||
/* The gzip parser failed directly when opening the content stream : before falling back to the generic parser,
|
|
||||||
* let's have a chance to parse the stream as uncompressed. */
|
|
||||||
/* Indeed, this can be a case of misconfigured web server, providing both headers "Content-Encoding" with value "gzip",
|
|
||||||
* and "Content-type" with value such as "application/gzip".
|
|
||||||
* In that case our HTTP client (see GzipResponseInterceptor) is already uncompressing the stream on the fly,
|
|
||||||
* that's why the gzipparser fails opening the stream.
|
|
||||||
* (see RFC 7231 section 3.1.2.2 for "Content-Encoding" header specification https://tools.ietf.org/html/rfc7231#section-3.1.2.2)*/
|
|
||||||
gzipParser gzParser = (gzipParser)parser;
|
|
||||||
|
|
||||||
nonCloseInputStream = new CloseShieldInputStream(markableStream);
|
|
||||||
|
|
||||||
Document maindoc = gzipParser.createMainDocument(location, mimeType, charset, gzParser);
|
|
||||||
|
|
||||||
try {
|
/* Loop on parser : they are supposed to be sorted in order to start with the most specific and end with the most generic */
|
||||||
Document[] docs = gzParser.parseCompressedInputStream(location,
|
for(final Parser parser : idioms) {
|
||||||
charset, timezoneOffset, depth,
|
/* Wrap in a CloseShieldInputStream to prevent SAX parsers closing the sourceStream
|
||||||
nonCloseInputStream, maxLinks, maxBytes);
|
* and so let us eventually reuse the same opened stream with other parsers on parser failure */
|
||||||
if (docs != null) {
|
CloseShieldInputStream nonCloseInputStream = new CloseShieldInputStream(markableStream);
|
||||||
maindoc.addSubDocuments(docs);
|
|
||||||
}
|
try {
|
||||||
return new Document[] { maindoc };
|
return parseSource(location, mimeType, parser, charset, ignore_class_name, scraper, timezoneOffset,
|
||||||
} catch(Exception e1) {
|
nonCloseInputStream, maxLinks, maxBytes);
|
||||||
/* Try again to reset the marked stream if the failed parser has not consumed too many bytes */
|
} catch (final Parser.Failure e) {
|
||||||
markableStream.reset();
|
/* Try to reset the marked stream. If the failed parser has consumed too many bytes :
|
||||||
}
|
* too bad, the marks is invalid and process fails now with an IOException */
|
||||||
}
|
markableStream.reset();
|
||||||
}
|
|
||||||
}
|
if(parser instanceof gzipParser && e.getCause() instanceof GZIPOpeningStreamException
|
||||||
} catch (IOException e) {
|
&& (idioms.size() == 1 || (idioms.size() == 2 && idioms.contains(genericIdiom)))) {
|
||||||
throw new Parser.Failure("Error reading source", location);
|
/* The gzip parser failed directly when opening the content stream : before falling back to the generic parser,
|
||||||
}
|
* let's have a chance to parse the stream as uncompressed. */
|
||||||
}
|
/* Indeed, this can be a case of misconfigured web server, providing both headers "Content-Encoding" with value "gzip",
|
||||||
|
* and "Content-type" with value such as "application/gzip".
|
||||||
|
* In that case our HTTP client (see GzipResponseInterceptor) is already uncompressing the stream on the fly,
|
||||||
|
* that's why the gzipparser fails opening the stream.
|
||||||
|
* (see RFC 7231 section 3.1.2.2 for "Content-Encoding" header specification https://tools.ietf.org/html/rfc7231#section-3.1.2.2)*/
|
||||||
|
final gzipParser gzParser = (gzipParser)parser;
|
||||||
|
|
||||||
|
nonCloseInputStream = new CloseShieldInputStream(markableStream);
|
||||||
|
|
||||||
|
final Document maindoc = gzipParser.createMainDocument(location, mimeType, charset, gzParser);
|
||||||
|
|
||||||
|
try {
|
||||||
|
final Document[] docs = gzParser.parseCompressedInputStream(location,
|
||||||
|
charset, timezoneOffset, depth,
|
||||||
|
nonCloseInputStream, maxLinks, maxBytes);
|
||||||
|
if (docs != null) {
|
||||||
|
maindoc.addSubDocuments(docs);
|
||||||
|
}
|
||||||
|
return new Document[] { maindoc };
|
||||||
|
} catch(final Exception e1) {
|
||||||
|
/* Try again to reset the marked stream if the failed parser has not consumed too many bytes */
|
||||||
|
markableStream.reset();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (final IOException e) {
|
||||||
|
throw new Parser.Failure("Error reading source", location);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// in case that we know more parsers we first transform the content into a byte[] and use that as base
|
// in case that we know more parsers we first transform the content into a byte[] and use that as base
|
||||||
// for a number of different parse attempts.
|
// for a number of different parse attempts.
|
||||||
|
|
||||||
int maxBytesToRead = -1;
|
int maxBytesToRead = -1;
|
||||||
if(maxBytes < Integer.MAX_VALUE) {
|
if(maxBytes < Integer.MAX_VALUE) {
|
||||||
/* Load at most maxBytes + 1 :
|
/* Load at most maxBytes + 1 :
|
||||||
- to let parsers not supporting Parser.parseWithLimits detect the maxBytes size is exceeded and end with a Parser.Failure
|
- to let parsers not supporting Parser.parseWithLimits detect the maxBytes size is exceeded and end with a Parser.Failure
|
||||||
- but let parsers supporting Parser.parseWithLimits perform partial parsing of maxBytes content */
|
- but let parsers supporting Parser.parseWithLimits perform partial parsing of maxBytes content */
|
||||||
maxBytesToRead = (int)maxBytes + 1;
|
maxBytesToRead = (int)maxBytes + 1;
|
||||||
}
|
}
|
||||||
if(contentLength >= 0 && contentLength < maxBytesToRead) {
|
if(contentLength >= 0 && contentLength < maxBytesToRead) {
|
||||||
maxBytesToRead = (int)contentLength;
|
maxBytesToRead = (int)contentLength;
|
||||||
}
|
}
|
||||||
|
|
||||||
byte[] b = null;
|
byte[] b = null;
|
||||||
try {
|
try {
|
||||||
b = FileUtils.read(sourceStream, maxBytesToRead);
|
b = FileUtils.read(sourceStream, maxBytesToRead);
|
||||||
} catch (final IOException e) {
|
} catch (final IOException e) {
|
||||||
throw new Parser.Failure(e.getMessage(), location);
|
throw new Parser.Failure(e.getMessage(), location);
|
||||||
}
|
}
|
||||||
Document[] docs = parseSource(location, mimeType, idioms, charset, ignore_class_name, scraper, timezoneOffset, depth, b, maxLinks, maxBytes);
|
final Document[] docs = parseSource(location, mimeType, idioms, charset, ignore_class_name, scraper, timezoneOffset, depth, b, maxLinks, maxBytes);
|
||||||
|
|
||||||
return docs;
|
return docs;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Document[] parseSource(final DigestURL location, String mimeType, final String charset,
|
public static Document[] parseSource(final DigestURL location, String mimeType, final String charset,
|
||||||
final Set<String> ignore_class_name,
|
final Set<String> ignore_class_name,
|
||||||
final VocabularyScraper scraper, final int timezoneOffset, final int depth, final long contentLength,
|
final VocabularyScraper scraper, final int timezoneOffset, final int depth, final long contentLength,
|
||||||
final InputStream sourceStream) throws Parser.Failure {
|
final InputStream sourceStream) throws Parser.Failure {
|
||||||
return parseSource(location, mimeType, charset, ignore_class_name, scraper, timezoneOffset, depth, contentLength, sourceStream,
|
return parseSource(location, mimeType, charset, ignore_class_name, scraper, timezoneOffset, depth, contentLength, sourceStream,
|
||||||
Integer.MAX_VALUE, Long.MAX_VALUE);
|
Integer.MAX_VALUE, Long.MAX_VALUE);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Try to limit the parser processing with a maximum total number of links detection (anchors, images links, media links...)
|
* Try to limit the parser processing with a maximum total number of links detection (anchors, images links, media links...)
|
||||||
* or a maximum amount of content bytes to parse. Limits apply only when the available parsers for the resource media type support parsing within limits
|
* or a maximum amount of content bytes to parse. Limits apply only when the available parsers for the resource media type support parsing within limits
|
||||||
* (see {@link Parser#isParseWithLimitsSupported()}). When available parsers do
|
* (see {@link Parser#isParseWithLimitsSupported()}). When available parsers do
|
||||||
* not support parsing within limits, an exception is thrown when
|
* not support parsing within limits, an exception is thrown when
|
||||||
* content size is beyond maxBytes.
|
* content size is beyond maxBytes.
|
||||||
* @param location the URL of the source
|
* @param location the URL of the source
|
||||||
* @param mimeType the mime type of the source, if known
|
* @param mimeType the mime type of the source, if known
|
||||||
* @param charset the charset name of the source, if known
|
* @param charset the charset name of the source, if known
|
||||||
|
@ -424,19 +424,19 @@ public final class TextParser {
|
||||||
* @return a list of documents that result from parsing the source, with empty or null text.
|
* @return a list of documents that result from parsing the source, with empty or null text.
|
||||||
* @throws Parser.Failure when the parser processing failed
|
* @throws Parser.Failure when the parser processing failed
|
||||||
*/
|
*/
|
||||||
public static Document[] parseWithLimits(final DigestURL location, String mimeType, final String charset, final Set<String> ignoreClassNames,
|
public static Document[] parseWithLimits(final DigestURL location, String mimeType, final String charset, final Set<String> ignoreClassNames,
|
||||||
final int timezoneOffset, final int depth, final long contentLength, final InputStream sourceStream, int maxLinks,
|
final int timezoneOffset, final int depth, final long contentLength, final InputStream sourceStream, int maxLinks,
|
||||||
long maxBytes) throws Parser.Failure{
|
long maxBytes) throws Parser.Failure{
|
||||||
return parseSource(location, mimeType, charset, ignoreClassNames, new VocabularyScraper(), timezoneOffset, depth, contentLength,
|
return parseSource(location, mimeType, charset, ignoreClassNames, new VocabularyScraper(), timezoneOffset, depth, contentLength,
|
||||||
sourceStream, maxLinks, maxBytes);
|
sourceStream, maxLinks, maxBytes);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Try to limit the parser processing with a maximum total number of links detection (anchors, images links, media links...)
|
* Try to limit the parser processing with a maximum total number of links detection (anchors, images links, media links...)
|
||||||
* or a maximum amount of content bytes to parse. Limits apply only when the available parsers for the resource media type support parsing within limits
|
* or a maximum amount of content bytes to parse. Limits apply only when the available parsers for the resource media type support parsing within limits
|
||||||
* (see {@link Parser#isParseWithLimitsSupported()}). When available parsers do
|
* (see {@link Parser#isParseWithLimitsSupported()}). When available parsers do
|
||||||
* not support parsing within limits, an exception is thrown when
|
* not support parsing within limits, an exception is thrown when
|
||||||
* content size is beyond maxBytes.
|
* content size is beyond maxBytes.
|
||||||
* @param location the URL of the source
|
* @param location the URL of the source
|
||||||
* @param mimeType the mime type of the source, if known
|
* @param mimeType the mime type of the source, if known
|
||||||
* @param charset the charset name of the source, if known
|
* @param charset the charset name of the source, if known
|
||||||
|
@ -449,15 +449,15 @@ public final class TextParser {
|
||||||
* @return a list of documents that result from parsing the source, with empty or null text.
|
* @return a list of documents that result from parsing the source, with empty or null text.
|
||||||
* @throws Parser.Failure when the parser processing failed
|
* @throws Parser.Failure when the parser processing failed
|
||||||
*/
|
*/
|
||||||
public static Document[] parseWithLimits(final DigestURL location, String mimeType, final String charset,
|
public static Document[] parseWithLimits(final DigestURL location, String mimeType, final String charset,
|
||||||
final int timezoneOffset, final int depth, final long contentLength, final InputStream sourceStream, int maxLinks,
|
final int timezoneOffset, final int depth, final long contentLength, final InputStream sourceStream, int maxLinks,
|
||||||
long maxBytes) throws Parser.Failure{
|
long maxBytes) throws Parser.Failure{
|
||||||
return parseSource(location, mimeType, charset, new HashSet<String>(), new VocabularyScraper(), timezoneOffset, depth, contentLength,
|
return parseSource(location, mimeType, charset, new HashSet<String>(), new VocabularyScraper(), timezoneOffset, depth, contentLength,
|
||||||
sourceStream, maxLinks, maxBytes);
|
sourceStream, maxLinks, maxBytes);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
*
|
||||||
* @param location the URL of the source
|
* @param location the URL of the source
|
||||||
* @param mimeType the mime type of the source, if known
|
* @param mimeType the mime type of the source, if known
|
||||||
* @param parser a parser supporting the resource at location
|
* @param parser a parser supporting the resource at location
|
||||||
|
@ -481,7 +481,7 @@ public final class TextParser {
|
||||||
final InputStream sourceStream,
|
final InputStream sourceStream,
|
||||||
final int maxLinks,
|
final int maxLinks,
|
||||||
final long maxBytes
|
final long maxBytes
|
||||||
) throws Parser.Failure {
|
) throws Parser.Failure {
|
||||||
if (AbstractParser.log.isFine()) AbstractParser.log.fine("Parsing '" + location + "' from stream");
|
if (AbstractParser.log.isFine()) AbstractParser.log.fine("Parsing '" + location + "' from stream");
|
||||||
final String fileExt = MultiProtocolURL.getFileExtension(location.getFileName());
|
final String fileExt = MultiProtocolURL.getFileExtension(location.getFileName());
|
||||||
final String documentCharset = htmlParser.patchCharsetEncoding(charset);
|
final String documentCharset = htmlParser.patchCharsetEncoding(charset);
|
||||||
|
@ -491,15 +491,15 @@ public final class TextParser {
|
||||||
try {
|
try {
|
||||||
final Document[] docs;
|
final Document[] docs;
|
||||||
if(parser.isParseWithLimitsSupported()) {
|
if(parser.isParseWithLimitsSupported()) {
|
||||||
docs = parser.parseWithLimits(location, mimeType, documentCharset, ignore_class_name, scraper, timezoneOffset, sourceStream, maxLinks, maxBytes);
|
docs = parser.parseWithLimits(location, mimeType, documentCharset, ignore_class_name, scraper, timezoneOffset, sourceStream, maxLinks, maxBytes);
|
||||||
} else {
|
} else {
|
||||||
/* Parser do not support partial parsing within limits : let's control it here*/
|
/* Parser do not support partial parsing within limits : let's control it here*/
|
||||||
InputStream limitedSource = new StrictLimitInputStream(sourceStream, maxBytes);
|
final InputStream limitedSource = new StrictLimitInputStream(sourceStream, maxBytes);
|
||||||
docs = parser.parse(location, mimeType, documentCharset, ignore_class_name, scraper, timezoneOffset, limitedSource);
|
docs = parser.parse(location, mimeType, documentCharset, ignore_class_name, scraper, timezoneOffset, limitedSource);
|
||||||
}
|
}
|
||||||
return docs;
|
return docs;
|
||||||
} catch(Parser.Failure e) {
|
} catch(final Parser.Failure e) {
|
||||||
throw e;
|
throw e;
|
||||||
} catch (final Exception e) {
|
} catch (final Exception e) {
|
||||||
throw new Parser.Failure("parser failed: " + parser.getName(), location);
|
throw new Parser.Failure("parser failed: " + parser.getName(), location);
|
||||||
}
|
}
|
||||||
|
@ -531,77 +531,77 @@ public final class TextParser {
|
||||||
final byte[] sourceArray,
|
final byte[] sourceArray,
|
||||||
final int maxLinks,
|
final int maxLinks,
|
||||||
final long maxBytes
|
final long maxBytes
|
||||||
) throws Parser.Failure {
|
) throws Parser.Failure {
|
||||||
final String fileExt = MultiProtocolURL.getFileExtension(location.getFileName());
|
final String fileExt = MultiProtocolURL.getFileExtension(location.getFileName());
|
||||||
if (AbstractParser.log.isFine()) AbstractParser.log.fine("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + fileExt + "' from byte[]");
|
if (AbstractParser.log.isFine()) AbstractParser.log.fine("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + fileExt + "' from byte[]");
|
||||||
final String documentCharset = htmlParser.patchCharsetEncoding(charset);
|
final String documentCharset = htmlParser.patchCharsetEncoding(charset);
|
||||||
assert !parsers.isEmpty();
|
assert !parsers.isEmpty();
|
||||||
|
|
||||||
Document[] docs = null;
|
Document[] docs = null;
|
||||||
final Map<Parser, Parser.Failure> failedParser = new HashMap<Parser, Parser.Failure>();
|
final Map<Parser, Parser.Failure> failedParser = new HashMap<>();
|
||||||
String origName = Thread.currentThread().getName();
|
final String origName = Thread.currentThread().getName();
|
||||||
Thread.currentThread().setName("parsing + " + location.toString()); // set a name to get the address in Thread Dump
|
Thread.currentThread().setName("parsing + " + location.toString()); // set a name to get the address in Thread Dump
|
||||||
for (final Parser parser: parsers) {
|
for (final Parser parser: parsers) {
|
||||||
if (MemoryControl.request(sourceArray.length * 6, false)) {
|
if (MemoryControl.request(sourceArray.length * 6, false)) {
|
||||||
ByteArrayInputStream bis;
|
ByteArrayInputStream bis;
|
||||||
if (mimeType.equals("text/plain") && parser.getName().equals("HTML Parser")) {
|
if (mimeType.equals("text/plain") && parser.getName().equals("HTML Parser")) {
|
||||||
// a hack to simulate html files .. is needed for NOLOAD queues. This throws their data into virtual text/plain messages.
|
// a hack to simulate html files .. is needed for NOLOAD queues. This throws their data into virtual text/plain messages.
|
||||||
bis = new ByteArrayInputStream(UTF8.getBytes("<html><head></head><body><h1>" + UTF8.String(sourceArray) + "</h1></body><html>"));
|
bis = new ByteArrayInputStream(UTF8.getBytes("<html><head></head><body><h1>" + UTF8.String(sourceArray) + "</h1></body><html>"));
|
||||||
} else {
|
} else {
|
||||||
bis = new ByteArrayInputStream(sourceArray);
|
bis = new ByteArrayInputStream(sourceArray);
|
||||||
}
|
}
|
||||||
try {
|
try {
|
||||||
if(parser.isParseWithLimitsSupported()) {
|
if(parser.isParseWithLimitsSupported()) {
|
||||||
docs = parser.parseWithLimits(location, mimeType, documentCharset, ignore_class_name, scraper, timezoneOffset, bis, maxLinks, maxBytes);
|
docs = parser.parseWithLimits(location, mimeType, documentCharset, ignore_class_name, scraper, timezoneOffset, bis, maxLinks, maxBytes);
|
||||||
} else {
|
} else {
|
||||||
/* Partial parsing is not supported by this parser : check content length now */
|
/* Partial parsing is not supported by this parser : check content length now */
|
||||||
if(sourceArray.length > maxBytes) {
|
if(sourceArray.length > maxBytes) {
|
||||||
throw new Parser.Failure("Content size is over maximum size of " + maxBytes + "", location);
|
throw new Parser.Failure("Content size is over maximum size of " + maxBytes + "", location);
|
||||||
}
|
}
|
||||||
docs = parser.parse(location, mimeType, documentCharset, ignore_class_name, scraper, timezoneOffset, bis);
|
docs = parser.parse(location, mimeType, documentCharset, ignore_class_name, scraper, timezoneOffset, bis);
|
||||||
}
|
}
|
||||||
} catch (final Parser.Failure e) {
|
} catch (final Parser.Failure e) {
|
||||||
if(parser instanceof gzipParser && e.getCause() instanceof GZIPOpeningStreamException &&
|
if(parser instanceof gzipParser && e.getCause() instanceof GZIPOpeningStreamException &&
|
||||||
(parsers.size() == 1 || (parsers.size() == 2 && parsers.contains(genericIdiom)))) {
|
(parsers.size() == 1 || (parsers.size() == 2 && parsers.contains(genericIdiom)))) {
|
||||||
/* The gzip parser failed directly when opening the content stream : before falling back to the generic parser,
|
/* The gzip parser failed directly when opening the content stream : before falling back to the generic parser,
|
||||||
* let's have a chance to parse the stream as uncompressed. */
|
* let's have a chance to parse the stream as uncompressed. */
|
||||||
/* Indeed, this can be a case of misconfigured web server, providing both headers "Content-Encoding" with value "gzip",
|
/* Indeed, this can be a case of misconfigured web server, providing both headers "Content-Encoding" with value "gzip",
|
||||||
* and "Content-type" with value such as "application/gzip".
|
* and "Content-type" with value such as "application/gzip".
|
||||||
* In that case our HTTP client (see GzipResponseInterceptor) is already uncompressing the stream on the fly,
|
* In that case our HTTP client (see GzipResponseInterceptor) is already uncompressing the stream on the fly,
|
||||||
* that's why the gzipparser fails opening the stream.
|
* that's why the gzipparser fails opening the stream.
|
||||||
* (see RFC 7231 section 3.1.2.2 for "Content-Encoding" header specification https://tools.ietf.org/html/rfc7231#section-3.1.2.2)*/
|
* (see RFC 7231 section 3.1.2.2 for "Content-Encoding" header specification https://tools.ietf.org/html/rfc7231#section-3.1.2.2)*/
|
||||||
gzipParser gzParser = (gzipParser)parser;
|
final gzipParser gzParser = (gzipParser)parser;
|
||||||
|
|
||||||
bis = new ByteArrayInputStream(sourceArray);
|
|
||||||
|
|
||||||
Document maindoc = gzipParser.createMainDocument(location, mimeType, charset, gzParser);
|
|
||||||
|
|
||||||
try {
|
bis = new ByteArrayInputStream(sourceArray);
|
||||||
docs = gzParser.parseCompressedInputStream(location,
|
|
||||||
charset, timezoneOffset, depth,
|
final Document maindoc = gzipParser.createMainDocument(location, mimeType, charset, gzParser);
|
||||||
bis, maxLinks, maxBytes);
|
|
||||||
if (docs != null) {
|
try {
|
||||||
maindoc.addSubDocuments(docs);
|
docs = gzParser.parseCompressedInputStream(location,
|
||||||
}
|
charset, timezoneOffset, depth,
|
||||||
docs = new Document[] { maindoc };
|
bis, maxLinks, maxBytes);
|
||||||
break;
|
if (docs != null) {
|
||||||
} catch(Parser.Failure e1) {
|
maindoc.addSubDocuments(docs);
|
||||||
failedParser.put(parser, e1);
|
}
|
||||||
} catch(Exception e2) {
|
docs = new Document[] { maindoc };
|
||||||
failedParser.put(parser, new Parser.Failure(e2.getMessage(), location));
|
break;
|
||||||
}
|
} catch(final Parser.Failure e1) {
|
||||||
} else {
|
failedParser.put(parser, e1);
|
||||||
failedParser.put(parser, e);
|
} catch(final Exception e2) {
|
||||||
}
|
failedParser.put(parser, new Parser.Failure(e2.getMessage(), location));
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
failedParser.put(parser, e);
|
||||||
|
}
|
||||||
} catch (final Exception e) {
|
} catch (final Exception e) {
|
||||||
failedParser.put(parser, new Parser.Failure(e.getMessage(), location));
|
failedParser.put(parser, new Parser.Failure(e.getMessage(), location));
|
||||||
//log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e);
|
//log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e);
|
||||||
} finally {
|
} finally {
|
||||||
try {
|
try {
|
||||||
bis.close();
|
bis.close();
|
||||||
} catch(IOException ioe) {
|
} catch(final IOException ioe) {
|
||||||
// Ignore.
|
// Ignore.
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (docs != null) break;
|
if (docs != null) break;
|
||||||
}
|
}
|
||||||
|
@ -616,22 +616,22 @@ public final class TextParser {
|
||||||
}
|
}
|
||||||
String failedParsers = "";
|
String failedParsers = "";
|
||||||
for (final Map.Entry<Parser, Parser.Failure> error: failedParser.entrySet()) {
|
for (final Map.Entry<Parser, Parser.Failure> error: failedParser.entrySet()) {
|
||||||
AbstractParser.log.warn("tried parser '" + error.getKey().getName() + "' to parse " + location.toNormalform(true) + " but failed: " + error.getValue().getMessage(), error.getValue());
|
AbstractParser.log.warn("tried parser '" + error.getKey().getName() + "' to parse " + location.toNormalform(true) + " but failed: " + error.getValue().getMessage(), error.getValue());
|
||||||
failedParsers += error.getKey().getName() + " ";
|
failedParsers += error.getKey().getName() + " ";
|
||||||
}
|
}
|
||||||
throw new Parser.Failure("All parser failed: " + failedParsers, location);
|
throw new Parser.Failure("All parser failed: " + failedParsers, location);
|
||||||
}
|
}
|
||||||
for (final Document d: docs) {
|
for (final Document d: docs) {
|
||||||
InputStream textStream = d.getTextStream();
|
final InputStream textStream = d.getTextStream();
|
||||||
assert textStream != null : "mimeType = " + mimeType;
|
assert textStream != null : "mimeType = " + mimeType;
|
||||||
try {
|
try {
|
||||||
if(textStream != null) {
|
if(textStream != null) {
|
||||||
/* textStream can be a FileInputStream : we must close it to ensure releasing system resource */
|
/* textStream can be a FileInputStream : we must close it to ensure releasing system resource */
|
||||||
textStream.close();
|
textStream.close();
|
||||||
}
|
}
|
||||||
} catch (IOException e) {
|
} catch (final IOException e) {
|
||||||
AbstractParser.log.warn("Could not close text input stream");
|
AbstractParser.log.warn("Could not close text input stream");
|
||||||
}
|
}
|
||||||
d.setDepth(depth);
|
d.setDepth(depth);
|
||||||
} // verify docs
|
} // verify docs
|
||||||
|
|
||||||
|
@ -670,7 +670,7 @@ public final class TextParser {
|
||||||
* @throws Parser.Failure when the file extension or the MIME type is denied
|
* @throws Parser.Failure when the file extension or the MIME type is denied
|
||||||
*/
|
*/
|
||||||
private static Set<Parser> parsers(final MultiProtocolURL url, String mimeType1) throws Parser.Failure {
|
private static Set<Parser> parsers(final MultiProtocolURL url, String mimeType1) throws Parser.Failure {
|
||||||
final Set<Parser> idioms = new LinkedHashSet<Parser>(2); // LinkedSet to maintain order (genericParser should be last)
|
final Set<Parser> idioms = new LinkedHashSet<>(2); // LinkedSet to maintain order (genericParser should be last)
|
||||||
|
|
||||||
// check given mime type, place this first because this is the most likely to work and the best fit to the supplied mime
|
// check given mime type, place this first because this is the most likely to work and the best fit to the supplied mime
|
||||||
Set<Parser> idiom;
|
Set<Parser> idiom;
|
||||||
|
@ -682,13 +682,13 @@ public final class TextParser {
|
||||||
}
|
}
|
||||||
|
|
||||||
// check extension and add as backup (in case no, wrong or unknown/unsupported mime was supplied)
|
// check extension and add as backup (in case no, wrong or unknown/unsupported mime was supplied)
|
||||||
String ext = MultiProtocolURL.getFileExtension(url.getFileName());
|
final String ext = MultiProtocolURL.getFileExtension(url.getFileName());
|
||||||
if (ext != null && ext.length() > 0) {
|
if (ext != null && ext.length() > 0) {
|
||||||
/* We do not throw here an exception when the media type is provided and inconsistent with the extension (if it is not supported an exception has already beeen thrown).
|
/* We do not throw here an exception when the media type is provided and inconsistent with the extension (if it is not supported an exception has already beeen thrown).
|
||||||
* Otherwise we would reject URLs with an apparently unsupported extension but whose actual Media Type is supported (for example text/html).
|
* Otherwise we would reject URLs with an apparently unsupported extension but whose actual Media Type is supported (for example text/html).
|
||||||
* Notable example : wikimedia commons pages, such as https://commons.wikimedia.org/wiki/File:YaCy_logo.png */
|
* Notable example : wikimedia commons pages, such as https://commons.wikimedia.org/wiki/File:YaCy_logo.png */
|
||||||
if (denyExtensionx.containsKey(ext) && (mimeType1 == null || mimeType1.equals(mimeOf(ext)))) {
|
if (denyExtensionx.containsKey(ext) && (mimeType1 == null || mimeType1.equals(mimeOf(ext)))) {
|
||||||
throw new Parser.Failure("file extension '" + ext + "' is denied (1)", url);
|
throw new Parser.Failure("file extension '" + ext + "' is denied (1)", url);
|
||||||
}
|
}
|
||||||
idiom = ext2parser.get(ext);
|
idiom = ext2parser.get(ext);
|
||||||
if (idiom != null && !idioms.containsAll(idiom)) { // use containsAll -> idiom is a Set of parser
|
if (idiom != null && !idioms.containsAll(idiom)) { // use containsAll -> idiom is a Set of parser
|
||||||
|
@ -701,11 +701,11 @@ public final class TextParser {
|
||||||
if (mimeType2 != null && (idiom = mime2parser.get(mimeType2)) != null && !idioms.containsAll(idiom)) { // use containsAll -> idiom is a Set of parser
|
if (mimeType2 != null && (idiom = mime2parser.get(mimeType2)) != null && !idioms.containsAll(idiom)) { // use containsAll -> idiom is a Set of parser
|
||||||
idioms.addAll(idiom);
|
idioms.addAll(idiom);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* No matching idiom has been found : let's check if the media type ends with the "+xml" suffix so we can handle it with a generic XML parser
|
/* No matching idiom has been found : let's check if the media type ends with the "+xml" suffix so we can handle it with a generic XML parser
|
||||||
* (see RFC 7303 - Using '+xml' when Registering XML-Based Media Types : https://tools.ietf.org/html/rfc7303#section-4.2) */
|
* (see RFC 7303 - Using '+xml' when Registering XML-Based Media Types : https://tools.ietf.org/html/rfc7303#section-4.2) */
|
||||||
if(idioms.isEmpty() && mimeType1 != null && mimeType1.endsWith("+xml")) {
|
if(idioms.isEmpty() && mimeType1 != null && mimeType1.endsWith("+xml")) {
|
||||||
idioms.add(genericXMLIdiom);
|
idioms.add(genericXMLIdiom);
|
||||||
}
|
}
|
||||||
|
|
||||||
// always add the generic parser (make sure it is the last in access order)
|
// always add the generic parser (make sure it is the last in access order)
|
||||||
|
@ -723,18 +723,18 @@ public final class TextParser {
|
||||||
*/
|
*/
|
||||||
public static String supportsMime(String mimeType) {
|
public static String supportsMime(String mimeType) {
|
||||||
if (mimeType == null) {
|
if (mimeType == null) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
mimeType = normalizeMimeType(mimeType);
|
mimeType = normalizeMimeType(mimeType);
|
||||||
if (denyMime.containsKey(mimeType)) {
|
if (denyMime.containsKey(mimeType)) {
|
||||||
return "mime type '" + mimeType + "' is denied (2)";
|
return "mime type '" + mimeType + "' is denied (2)";
|
||||||
}
|
}
|
||||||
if (mime2parser.get(mimeType) == null) {
|
if (mime2parser.get(mimeType) == null) {
|
||||||
/* No matching idiom has been found : let's check if the media type ends with the "+xml" suffix as can handle it with a generic XML parser
|
/* No matching idiom has been found : let's check if the media type ends with the "+xml" suffix as can handle it with a generic XML parser
|
||||||
* (see RFC 7303 - Using '+xml' when Registering XML-Based Media Types : https://tools.ietf.org/html/rfc7303#section-4.2) */
|
* (see RFC 7303 - Using '+xml' when Registering XML-Based Media Types : https://tools.ietf.org/html/rfc7303#section-4.2) */
|
||||||
if(!mimeType.endsWith("+xml")) {
|
if(!mimeType.endsWith("+xml")) {
|
||||||
return "no parser for mime '" + mimeType + "' available";
|
return "no parser for mime '" + mimeType + "' available";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
@ -774,20 +774,20 @@ public final class TextParser {
|
||||||
return ext2mime.get(ext.toLowerCase(Locale.ROOT));
|
return ext2mime.get(ext.toLowerCase(Locale.ROOT));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Normalize a media type information string (can be a HTTP "Content-Type"
|
* Normalize a media type information string (can be a HTTP "Content-Type"
|
||||||
* response header) : convert to lower case, remove any supplementary
|
* response header) : convert to lower case, remove any supplementary
|
||||||
* parameters such as the encoding (charset name), and provide a default
|
* parameters such as the encoding (charset name), and provide a default
|
||||||
* value when null.
|
* value when null.
|
||||||
*
|
*
|
||||||
* @param mimeType
|
* @param mimeType
|
||||||
* raw information about media type, eventually provided by a
|
* raw information about media type, eventually provided by a
|
||||||
* HTTP "Content-Type" response header
|
* HTTP "Content-Type" response header
|
||||||
* @return a non null media type in lower case
|
* @return a non null media type in lower case
|
||||||
*/
|
*/
|
||||||
public static String normalizeMimeType(String mimeType) {
|
public static String normalizeMimeType(String mimeType) {
|
||||||
if (mimeType == null) {
|
if (mimeType == null) {
|
||||||
return "application/octet-stream";
|
return "application/octet-stream";
|
||||||
}
|
}
|
||||||
mimeType = mimeType.toLowerCase(Locale.ROOT);
|
mimeType = mimeType.toLowerCase(Locale.ROOT);
|
||||||
final int pos = mimeType.indexOf(';');
|
final int pos = mimeType.indexOf(';');
|
||||||
|
@ -818,7 +818,7 @@ public final class TextParser {
|
||||||
|
|
||||||
public static void setDenyExtension(final String denyList) {
|
public static void setDenyExtension(final String denyList) {
|
||||||
denyExtensionx.clear();
|
denyExtensionx.clear();
|
||||||
for (final String s: CommonPattern.COMMA.split(denyList)) denyExtensionx.put(s, v);
|
for (final String s: CommonPattern.COMMA.split(denyList)) denyExtensionx.put(s.trim(), v);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static String getDenyExtension() {
|
public static String getDenyExtension() {
|
||||||
|
|
Loading…
Reference in New Issue
Block a user