2010-06-29 21:20:45 +02:00
/ * *
* TextParser . java
* Copyright 2009 by Michael Peter Christen , mc @yacy.net , Frankfurt am Main , Germany
* First released 09 . 07 . 2009 at http : //yacy.net
*
* This library is free software ; you can redistribute it and / or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation ; either
* version 2 . 1 of the License , or ( at your option ) any later version .
2011-06-01 21:31:56 +02:00
*
2010-06-29 21:20:45 +02:00
* This library is distributed in the hope that it will be useful ,
* but WITHOUT ANY WARRANTY ; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the GNU
* Lesser General Public License for more details .
2011-06-01 21:31:56 +02:00
*
2010-06-29 21:20:45 +02:00
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21 . txt
* If not , see < http : //www.gnu.org/licenses/>.
* /
2009-07-10 00:25:31 +02:00
2009-10-18 02:53:43 +02:00
package net.yacy.document ;
2009-07-10 00:25:31 +02:00
import java.io.BufferedInputStream ;
import java.io.ByteArrayInputStream ;
import java.io.File ;
import java.io.FileInputStream ;
2010-03-19 14:04:42 +01:00
import java.io.IOException ;
2009-07-10 00:25:31 +02:00
import java.io.InputStream ;
2010-01-19 15:59:58 +01:00
import java.util.HashMap ;
2009-07-10 16:22:17 +02:00
import java.util.HashSet ;
2013-07-23 20:24:13 +02:00
import java.util.LinkedHashSet ;
2009-07-10 16:22:17 +02:00
import java.util.Map ;
import java.util.Set ;
2010-04-21 15:46:02 +02:00
import java.util.concurrent.ConcurrentHashMap ;
2009-07-10 00:25:31 +02:00
2013-09-15 00:30:23 +02:00
import net.yacy.cora.document.encoding.UTF8 ;
2013-09-15 23:27:04 +02:00
import net.yacy.cora.document.id.AnchorURL ;
2013-09-15 00:30:23 +02:00
import net.yacy.cora.document.id.MultiProtocolURL ;
2012-10-05 18:54:26 +02:00
import net.yacy.document.parser.audioTagParser ;
2009-10-18 02:53:43 +02:00
import net.yacy.document.parser.bzipParser ;
2009-11-03 21:10:59 +01:00
import net.yacy.document.parser.csvParser ;
2009-10-18 02:53:43 +02:00
import net.yacy.document.parser.docParser ;
2010-11-30 17:13:55 +01:00
import net.yacy.document.parser.genericParser ;
2009-10-18 02:53:43 +02:00
import net.yacy.document.parser.gzipParser ;
import net.yacy.document.parser.htmlParser ;
2011-06-01 21:31:56 +02:00
import net.yacy.document.parser.mmParser ;
2009-10-18 02:53:43 +02:00
import net.yacy.document.parser.odtParser ;
import net.yacy.document.parser.ooxmlParser ;
import net.yacy.document.parser.pdfParser ;
import net.yacy.document.parser.pptParser ;
import net.yacy.document.parser.psParser ;
2012-06-10 10:42:33 +02:00
import net.yacy.document.parser.rdfParser ;
2010-08-20 13:30:02 +02:00
import net.yacy.document.parser.rssParser ;
2009-10-18 02:53:43 +02:00
import net.yacy.document.parser.rtfParser ;
import net.yacy.document.parser.sevenzipParser ;
2011-06-01 21:31:56 +02:00
import net.yacy.document.parser.sidAudioParser ;
2009-10-18 02:53:43 +02:00
import net.yacy.document.parser.swfParser ;
import net.yacy.document.parser.tarParser ;
2010-01-04 17:07:31 +01:00
import net.yacy.document.parser.torrentParser ;
2009-10-18 02:53:43 +02:00
import net.yacy.document.parser.vcfParser ;
import net.yacy.document.parser.vsdParser ;
import net.yacy.document.parser.xlsParser ;
import net.yacy.document.parser.zipParser ;
2012-06-10 10:42:33 +02:00
import net.yacy.document.parser.augment.AugmentParser ;
2009-10-20 00:34:44 +02:00
import net.yacy.document.parser.images.genericImageParser ;
2012-06-10 10:42:33 +02:00
import net.yacy.document.parser.rdfa.impl.RDFaParser ;
2010-03-19 14:04:42 +01:00
import net.yacy.kelondro.util.FileUtils ;
2011-06-01 21:31:56 +02:00
import net.yacy.kelondro.util.MemoryControl ;
2012-06-10 12:49:36 +02:00
import net.yacy.search.Switchboard ;
2009-10-10 01:13:30 +02:00
2009-10-20 00:34:44 +02:00
public final class TextParser {
2009-07-10 00:25:31 +02:00
2010-04-22 00:35:47 +02:00
// Shared placeholder value: denyMime and denyExtensionx are used like sets, every key maps to this object.
private static final Object v = new Object ( ) ;
2009-10-21 17:12:34 +02:00
2010-11-30 17:13:55 +01:00
// Fallback parser; parsers(url, mimeType) always appends it after all other candidates.
private static final Parser genericIdiom = new genericParser ( ) ;
2013-08-14 21:12:10 +02:00
// LinkedHashSet keeps registration (init) order, so the parser registered first for an extension or mime type is the preferred one.
// mime2parser: normalized mime type -> parsers registered for it (filled by initParser).
private static final Map < String , LinkedHashSet < Parser > > mime2parser = new ConcurrentHashMap < String , LinkedHashSet < Parser > > ( ) ;
// ext2parser: lower-cased file extension -> parsers registered for it (filled by initParser).
private static final ConcurrentHashMap < String , LinkedHashSet < Parser > > ext2parser = new ConcurrentHashMap < String , LinkedHashSet < Parser > > ( ) ;
2010-04-21 15:46:02 +02:00
// ext2mime: lower-cased file extension -> first mime type of the parser that registered the extension last.
private static final Map < String , String > ext2mime = new ConcurrentHashMap < String , String > ( ) ;
// Deny list of normalized mime types (only the keys are meaningful).
private static final Map < String , Object > denyMime = new ConcurrentHashMap < String , Object > ( ) ;
// Deny list of file extensions (only the keys are meaningful); keys are stored exactly as passed in.
private static final Map < String , Object > denyExtensionx = new ConcurrentHashMap < String , Object > ( ) ;
2011-06-01 21:31:56 +02:00
2009-07-10 00:25:31 +02:00
static {
initParser ( new bzipParser ( ) ) ;
2009-11-03 21:10:59 +01:00
initParser ( new csvParser ( ) ) ;
2009-07-10 00:25:31 +02:00
initParser ( new docParser ( ) ) ;
initParser ( new gzipParser ( ) ) ;
2013-08-14 21:12:10 +02:00
// AugmentParser calls internally RDFaParser (therefore add before RDFa)
if ( Switchboard . getSwitchboard ( ) . getConfigBool ( " parserAugmentation " , true ) ) initParser ( new AugmentParser ( ) ) ;
// RDFaParser calls internally htmlParser (therefore add before html)
if ( Switchboard . getSwitchboard ( ) . getConfigBool ( " parserAugmentation.RDFa " , true ) ) initParser ( new RDFaParser ( ) ) ;
initParser ( new htmlParser ( ) ) ; // called within rdfa parser
2009-10-20 00:34:44 +02:00
initParser ( new genericImageParser ( ) ) ;
2010-12-27 21:13:31 +01:00
initParser ( new mmParser ( ) ) ;
2009-07-10 00:25:31 +02:00
initParser ( new odtParser ( ) ) ;
2009-08-08 17:34:41 +02:00
initParser ( new ooxmlParser ( ) ) ;
2009-07-10 00:25:31 +02:00
initParser ( new pdfParser ( ) ) ;
initParser ( new pptParser ( ) ) ;
initParser ( new psParser ( ) ) ;
2010-08-20 13:30:02 +02:00
initParser ( new rssParser ( ) ) ;
2009-07-10 00:25:31 +02:00
initParser ( new rtfParser ( ) ) ;
initParser ( new sevenzipParser ( ) ) ;
2010-12-28 13:06:04 +01:00
initParser ( new sidAudioParser ( ) ) ;
2009-07-10 00:25:31 +02:00
initParser ( new swfParser ( ) ) ;
initParser ( new tarParser ( ) ) ;
2010-01-04 17:07:31 +01:00
initParser ( new torrentParser ( ) ) ;
2009-07-10 00:25:31 +02:00
initParser ( new vcfParser ( ) ) ;
initParser ( new vsdParser ( ) ) ;
initParser ( new xlsParser ( ) ) ;
initParser ( new zipParser ( ) ) ;
2012-06-10 10:42:33 +02:00
initParser ( new rdfParser ( ) ) ;
2012-10-05 18:54:26 +02:00
initParser ( new audioTagParser ( ) ) ;
2009-07-10 00:25:31 +02:00
}
2011-06-01 21:31:56 +02:00
2010-06-29 21:20:45 +02:00
public static Set < Parser > parsers ( ) {
2011-06-01 21:31:56 +02:00
final Set < Parser > c = new HashSet < Parser > ( ) ;
2012-06-12 01:42:58 +02:00
for ( Set < Parser > pl : ext2parser . values ( ) ) c . addAll ( pl ) ;
for ( Set < Parser > pl : mime2parser . values ( ) ) c . addAll ( pl ) ;
2009-07-10 16:22:17 +02:00
return c ;
}
2011-06-01 21:31:56 +02:00
private static void initParser ( final Parser parser ) {
2009-07-14 13:01:05 +02:00
String prototypeMime = null ;
2011-06-01 21:31:56 +02:00
for ( final String mime : parser . supportedMimeTypes ( ) ) {
2009-07-10 16:22:17 +02:00
// process the mime types
2009-07-14 13:01:05 +02:00
final String mimeType = normalizeMimeType ( mime ) ;
if ( prototypeMime = = null ) prototypeMime = mimeType ;
2013-08-14 21:12:10 +02:00
LinkedHashSet < Parser > p0 = mime2parser . get ( mimeType ) ;
2012-06-12 01:42:58 +02:00
if ( p0 = = null ) {
2013-08-14 21:12:10 +02:00
p0 = new LinkedHashSet < Parser > ( ) ;
2012-06-12 01:42:58 +02:00
mime2parser . put ( mimeType , p0 ) ;
}
p0 . add ( parser ) ;
2013-07-09 14:28:25 +02:00
AbstractParser . log . info ( " Parser for mime type ' " + mimeType + " ': " + parser . getName ( ) ) ;
2009-07-15 16:15:51 +02:00
}
2011-06-01 21:31:56 +02:00
2009-07-15 16:15:51 +02:00
if ( prototypeMime ! = null ) for ( String ext : parser . supportedExtensions ( ) ) {
2010-04-21 15:46:02 +02:00
ext = ext . toLowerCase ( ) ;
2011-06-01 21:31:56 +02:00
final String s = ext2mime . get ( ext ) ;
2013-07-09 14:28:25 +02:00
if ( s ! = null & & ! s . equals ( prototypeMime ) ) AbstractParser . log . info ( " Parser for extension ' " + ext + " ' was set to mime ' " + s + " ', overwriting with new mime ' " + prototypeMime + " '. " ) ;
2009-07-15 16:15:51 +02:00
ext2mime . put ( ext , prototypeMime ) ;
2009-07-10 00:25:31 +02:00
}
2011-06-01 21:31:56 +02:00
2009-10-21 17:12:34 +02:00
for ( String ext : parser . supportedExtensions ( ) ) {
// process the extensions
2010-04-21 15:46:02 +02:00
ext = ext . toLowerCase ( ) ;
2013-08-14 21:12:10 +02:00
LinkedHashSet < Parser > p0 = ext2parser . get ( ext ) ;
2012-06-12 01:42:58 +02:00
if ( p0 = = null ) {
2013-08-14 21:12:10 +02:00
p0 = new LinkedHashSet < Parser > ( ) ;
2012-06-12 01:42:58 +02:00
ext2parser . put ( ext , p0 ) ;
}
p0 . add ( parser ) ;
2013-07-09 14:28:25 +02:00
AbstractParser . log . info ( " Parser for extension ' " + ext + " ': " + parser . getName ( ) ) ;
2009-10-21 17:12:34 +02:00
}
2009-07-10 00:25:31 +02:00
}
2011-06-01 21:31:56 +02:00
2010-06-29 21:20:45 +02:00
public static Document [ ] parseSource (
2013-09-15 23:27:04 +02:00
final AnchorURL location ,
2009-07-14 13:01:05 +02:00
final String mimeType ,
final String charset ,
2014-04-16 21:34:28 +02:00
final int depth ,
2012-04-24 16:07:03 +02:00
final File sourceFile
2010-06-29 21:20:45 +02:00
) throws InterruptedException , Parser . Failure {
2009-07-10 00:25:31 +02:00
BufferedInputStream sourceStream = null ;
2010-12-02 12:05:04 +01:00
Document [ ] docs = null ;
2009-07-10 00:25:31 +02:00
try {
2013-07-09 14:28:25 +02:00
if ( AbstractParser . log . isFine ( ) ) AbstractParser . log . fine ( " Parsing ' " + location + " ' from file " ) ;
2009-10-05 22:11:41 +02:00
if ( ! sourceFile . exists ( ) | | ! sourceFile . canRead ( ) | | sourceFile . length ( ) = = 0 ) {
2009-07-10 00:25:31 +02:00
final String errorMsg = sourceFile . exists ( ) ? " Empty resource file. " : " No resource content available (2). " ;
2013-07-09 14:28:25 +02:00
AbstractParser . log . info ( " Unable to parse ' " + location + " '. " + errorMsg ) ;
2010-06-29 21:20:45 +02:00
throw new Parser . Failure ( errorMsg , location ) ;
2009-07-10 00:25:31 +02:00
}
sourceStream = new BufferedInputStream ( new FileInputStream ( sourceFile ) ) ;
2014-04-16 21:34:28 +02:00
docs = parseSource ( location , mimeType , charset , depth , sourceFile . length ( ) , sourceStream ) ;
2009-07-10 00:25:31 +02:00
} catch ( final Exception e ) {
if ( e instanceof InterruptedException ) throw ( InterruptedException ) e ;
2010-06-29 21:20:45 +02:00
if ( e instanceof Parser . Failure ) throw ( Parser . Failure ) e ;
2013-07-09 14:28:25 +02:00
AbstractParser . log . severe ( " Unexpected exception in parseSource from File: " + e . getMessage ( ) , e ) ;
2010-06-29 21:20:45 +02:00
throw new Parser . Failure ( " Unexpected exception: " + e . getMessage ( ) , location ) ;
2009-07-10 00:25:31 +02:00
} finally {
2010-12-02 12:05:04 +01:00
if ( sourceStream ! = null ) try { sourceStream . close ( ) ; } catch ( final Exception ex ) { }
2009-07-10 00:25:31 +02:00
}
2012-07-05 08:44:39 +02:00
2010-12-02 12:05:04 +01:00
return docs ;
2009-07-10 00:25:31 +02:00
}
2011-06-01 21:31:56 +02:00
2010-06-29 21:20:45 +02:00
public static Document [ ] parseSource (
2013-09-15 23:27:04 +02:00
final AnchorURL location ,
2011-08-02 01:31:08 +02:00
String mimeType ,
2010-03-19 14:04:42 +01:00
final String charset ,
2014-04-16 21:34:28 +02:00
final int depth ,
2012-04-24 16:07:03 +02:00
final byte [ ] content
2010-06-29 21:20:45 +02:00
) throws Parser . Failure {
2013-07-09 14:28:25 +02:00
if ( AbstractParser . log . isFine ( ) ) AbstractParser . log . fine ( " Parsing ' " + location + " ' from byte-array " ) ;
2011-08-02 01:31:08 +02:00
mimeType = normalizeMimeType ( mimeType ) ;
2012-06-12 01:42:58 +02:00
Set < Parser > idioms = null ;
2011-08-02 01:31:08 +02:00
try {
idioms = parsers ( location , mimeType ) ;
} catch ( final Parser . Failure e ) {
2013-09-15 00:30:23 +02:00
final String errorMsg = " Parser Failure for extension ' " + MultiProtocolURL . getFileExtension ( location . getFileName ( ) ) + " ' or mimetype ' " + mimeType + " ': " + e . getMessage ( ) ;
2013-07-09 14:28:25 +02:00
AbstractParser . log . warn ( errorMsg ) ;
2011-08-02 01:31:08 +02:00
throw new Parser . Failure ( errorMsg , location ) ;
}
2012-10-10 11:46:22 +02:00
assert ! idioms . isEmpty ( ) : " no parsers applied for url " + location . toNormalform ( true ) ;
2011-08-02 01:31:08 +02:00
2014-04-16 21:34:28 +02:00
Document [ ] docs = parseSource ( location , mimeType , idioms , charset , depth , content ) ;
2011-09-07 12:08:57 +02:00
return docs ;
2010-03-19 14:04:42 +01:00
}
2011-06-01 21:31:56 +02:00
2010-06-29 21:20:45 +02:00
public static Document [ ] parseSource (
2013-09-15 23:27:04 +02:00
final AnchorURL location ,
2009-07-14 13:01:05 +02:00
String mimeType ,
final String charset ,
2014-04-16 21:34:28 +02:00
final int depth ,
2009-07-14 13:01:05 +02:00
final long contentLength ,
2012-04-24 16:07:03 +02:00
final InputStream sourceStream
2010-06-29 21:20:45 +02:00
) throws Parser . Failure {
2013-07-09 14:28:25 +02:00
if ( AbstractParser . log . isFine ( ) ) AbstractParser . log . fine ( " Parsing ' " + location + " ' from stream " ) ;
2009-10-21 17:12:34 +02:00
mimeType = normalizeMimeType ( mimeType ) ;
2012-06-12 01:42:58 +02:00
Set < Parser > idioms = null ;
2010-01-22 14:21:37 +01:00
try {
2010-06-29 21:20:45 +02:00
idioms = parsers ( location , mimeType ) ;
2011-06-01 21:31:56 +02:00
} catch ( final Parser . Failure e ) {
2013-09-15 00:30:23 +02:00
final String errorMsg = " Parser Failure for extension ' " + MultiProtocolURL . getFileExtension ( location . getFileName ( ) ) + " ' or mimetype ' " + mimeType + " ': " + e . getMessage ( ) ;
2013-07-09 14:28:25 +02:00
AbstractParser . log . warn ( errorMsg ) ;
2010-06-29 21:20:45 +02:00
throw new Parser . Failure ( errorMsg , location ) ;
2009-10-21 17:12:34 +02:00
}
2012-10-10 11:46:22 +02:00
assert ! idioms . isEmpty ( ) : " no parsers applied for url " + location . toNormalform ( true ) ;
2011-06-01 21:31:56 +02:00
2010-03-19 14:04:42 +01:00
// if we do not have more than one parser or the content size is over MaxInt
// then we use only one stream-oriented parser.
if ( idioms . size ( ) = = 1 | | contentLength > Integer . MAX_VALUE ) {
// use a specific stream-oriented parser
2012-07-05 10:23:07 +02:00
return parseSource ( location , mimeType , idioms . iterator ( ) . next ( ) , charset , sourceStream ) ;
2010-03-19 14:04:42 +01:00
}
2011-06-01 21:31:56 +02:00
2010-03-19 14:04:42 +01:00
// in case that we know more parsers we first transform the content into a byte[] and use that as base
// for a number of different parse attempts.
2010-11-30 17:13:55 +01:00
byte [ ] b = null ;
2010-03-19 14:04:42 +01:00
try {
2010-11-30 17:13:55 +01:00
b = FileUtils . read ( sourceStream , ( int ) contentLength ) ;
2011-06-01 21:31:56 +02:00
} catch ( final IOException e ) {
2010-06-29 21:20:45 +02:00
throw new Parser . Failure ( e . getMessage ( ) , location ) ;
2010-03-19 14:04:42 +01:00
}
2014-04-16 21:34:28 +02:00
Document [ ] docs = parseSource ( location , mimeType , idioms , charset , depth , b ) ;
2011-09-07 12:08:57 +02:00
return docs ;
2010-03-19 14:04:42 +01:00
}
2010-06-29 21:20:45 +02:00
private static Document [ ] parseSource (
2013-09-15 23:27:04 +02:00
final AnchorURL location ,
2011-06-01 21:31:56 +02:00
final String mimeType ,
final Parser parser ,
2010-03-19 14:04:42 +01:00
final String charset ,
final InputStream sourceStream
2010-06-29 21:20:45 +02:00
) throws Parser . Failure {
2013-07-09 14:28:25 +02:00
if ( AbstractParser . log . isFine ( ) ) AbstractParser . log . fine ( " Parsing ' " + location + " ' from stream " ) ;
2013-09-15 00:30:23 +02:00
final String fileExt = MultiProtocolURL . getFileExtension ( location . getFileName ( ) ) ;
2010-03-19 14:04:42 +01:00
final String documentCharset = htmlParser . patchCharsetEncoding ( charset ) ;
2010-06-29 21:20:45 +02:00
assert parser ! = null ;
2010-03-19 14:04:42 +01:00
2013-07-09 14:28:25 +02:00
if ( AbstractParser . log . isFine ( ) ) AbstractParser . log . fine ( " Parsing " + location + " with mimeType ' " + mimeType + " ' and file extension ' " + fileExt + " '. " ) ;
2010-03-19 14:04:42 +01:00
try {
2011-06-01 21:31:56 +02:00
final Document [ ] docs = parser . parse ( location , mimeType , documentCharset , sourceStream ) ;
2010-12-02 12:05:04 +01:00
return docs ;
2011-06-01 21:31:56 +02:00
} catch ( final Exception e ) {
2010-06-29 21:20:45 +02:00
throw new Parser . Failure ( " parser failed: " + parser . getName ( ) , location ) ;
2010-03-19 14:04:42 +01:00
}
}
2010-06-29 21:20:45 +02:00
private static Document [ ] parseSource (
2013-09-15 23:27:04 +02:00
final AnchorURL location ,
2010-06-29 21:20:45 +02:00
final String mimeType ,
2012-06-12 01:42:58 +02:00
final Set < Parser > parsers ,
2010-03-19 14:04:42 +01:00
final String charset ,
2014-04-16 21:34:28 +02:00
final int depth ,
2010-03-19 14:04:42 +01:00
final byte [ ] sourceArray
2010-06-29 21:20:45 +02:00
) throws Parser . Failure {
2013-09-15 00:30:23 +02:00
final String fileExt = MultiProtocolURL . getFileExtension ( location . getFileName ( ) ) ;
2013-07-09 14:28:25 +02:00
if ( AbstractParser . log . isFine ( ) ) AbstractParser . log . fine ( " Parsing " + location + " with mimeType ' " + mimeType + " ' and file extension ' " + fileExt + " ' from byte[] " ) ;
2010-03-19 14:04:42 +01:00
final String documentCharset = htmlParser . patchCharsetEncoding ( charset ) ;
2010-06-29 21:20:45 +02:00
assert ! parsers . isEmpty ( ) ;
2010-03-19 14:04:42 +01:00
2010-12-02 12:05:04 +01:00
Document [ ] docs = null ;
2012-07-03 06:06:38 +02:00
final Map < Parser , Parser . Failure > failedParser = new HashMap < Parser , Parser . Failure > ( ) ;
2012-12-02 16:52:12 +01:00
String origName = Thread . currentThread ( ) . getName ( ) ;
Thread . currentThread ( ) . setName ( " parsing + " + location . toString ( ) ) ; // set a name to get the address in Thread Dump
2012-07-03 06:06:38 +02:00
for ( final Parser parser : parsers ) {
if ( MemoryControl . request ( sourceArray . length * 6 , false ) ) {
2012-04-24 16:07:03 +02:00
ByteArrayInputStream bis ;
if ( mimeType . equals ( " text/plain " ) & & parser . getName ( ) . equals ( " HTML Parser " ) ) {
// a hack to simulate html files .. is needed for NOLOAD queues. This throws their data into virtual text/plain messages.
bis = new ByteArrayInputStream ( UTF8 . getBytes ( " <html><head></head><body><h1> " + UTF8 . String ( sourceArray ) + " </h1></body><html> " ) ) ;
} else {
bis = new ByteArrayInputStream ( sourceArray ) ;
}
2011-06-01 21:31:56 +02:00
try {
2011-12-01 12:11:13 +01:00
docs = parser . parse ( location , mimeType , documentCharset , bis ) ;
2011-06-01 21:31:56 +02:00
} catch ( final Parser . Failure e ) {
failedParser . put ( parser , e ) ;
//log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e);
} catch ( final Exception e ) {
failedParser . put ( parser , new Parser . Failure ( e . getMessage ( ) , location ) ) ;
//log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e);
2011-12-01 12:11:13 +01:00
} finally {
try {
bis . close ( ) ;
} catch ( IOException ioe ) {
// Ignore.
}
2011-06-01 21:31:56 +02:00
}
if ( docs ! = null ) break ;
2009-07-10 00:25:31 +02:00
}
2009-10-21 17:12:34 +02:00
}
2012-12-02 16:52:12 +01:00
Thread . currentThread ( ) . setName ( origName ) ;
2011-06-01 21:31:56 +02:00
2010-12-02 12:05:04 +01:00
if ( docs = = null ) {
2010-12-27 21:13:31 +01:00
if ( failedParser . isEmpty ( ) ) {
2013-06-25 16:27:20 +02:00
final String errorMsg = " Parsing content with file extension ' " + fileExt + " ' and mimetype ' " + mimeType + " ' failed. " ;
2010-01-19 15:59:58 +01:00
//log.logWarning("Unable to parse '" + location + "'. " + errorMsg);
2010-06-29 21:20:45 +02:00
throw new Parser . Failure ( errorMsg , location ) ;
2010-01-19 15:59:58 +01:00
}
2012-07-05 08:44:39 +02:00
String failedParsers = " " ;
for ( final Map . Entry < Parser , Parser . Failure > error : failedParser . entrySet ( ) ) {
2013-07-09 14:28:25 +02:00
AbstractParser . log . warn ( " tried parser ' " + error . getKey ( ) . getName ( ) + " ' to parse " + location . toNormalform ( true ) + " but failed: " + error . getValue ( ) . getMessage ( ) , error . getValue ( ) ) ;
2012-07-05 08:44:39 +02:00
failedParsers + = error . getKey ( ) . getName ( ) + " " ;
}
throw new Parser . Failure ( " All parser failed: " + failedParsers , location ) ;
2009-07-10 00:25:31 +02:00
}
2014-04-16 21:34:28 +02:00
for ( final Document d : docs ) {
assert d . getTextStream ( ) ! = null : " mimeType = " + mimeType ;
d . setDepth ( depth ) ;
} // verify docs
2011-09-07 12:08:57 +02:00
2010-12-02 12:05:04 +01:00
return docs ;
2009-07-10 00:25:31 +02:00
}
2011-06-01 21:31:56 +02:00
2009-07-14 13:01:05 +02:00
/ * *
* check if the parser supports the given content .
* @param url
* @param mimeType
2009-10-23 00:38:04 +02:00
* @return returns null if the content is supported . If the content is not supported , return a error string .
2009-07-14 13:01:05 +02:00
* /
2013-09-15 00:30:23 +02:00
public static String supports ( final MultiProtocolURL url , final String mimeType ) {
2009-07-14 13:01:05 +02:00
try {
// try to get a parser. If this works, we don't need the parser itself, we just return null to show that everything is ok.
2012-06-12 01:42:58 +02:00
final Set < Parser > idioms = parsers ( url , mimeType ) ;
return ( idioms = = null | | idioms . isEmpty ( ) | | ( idioms . size ( ) = = 1 & & idioms . iterator ( ) . next ( ) . getName ( ) . equals ( genericIdiom . getName ( ) ) ) ) ? " no parser found " : null ;
2011-06-01 21:31:56 +02:00
} catch ( final Parser . Failure e ) {
2009-07-14 13:01:05 +02:00
// in case that a parser is not available, return a error string describing the problem.
return e . getMessage ( ) ;
}
}
2011-06-01 21:31:56 +02:00
2009-10-21 17:12:34 +02:00
/ * *
* find a parser for a given url and mime type
* because mime types returned by web severs are sometimes wrong , we also compute the mime type again
* from the extension that can be extracted from the url path . That means that there are 3 criteria
* that can be used to select a parser :
* - the given extension
* - the given mime type
* - the mime type computed from the extension
* @param url the given url
* @param mimeType the given mime type
* @return a list of Idiom parsers that may be appropriate for the given criteria
2010-06-29 21:20:45 +02:00
* @throws Parser . Failure
2009-10-21 17:12:34 +02:00
* /
2013-09-15 00:30:23 +02:00
private static Set < Parser > parsers ( final MultiProtocolURL url , String mimeType1 ) throws Parser . Failure {
2013-07-23 20:24:13 +02:00
final Set < Parser > idioms = new LinkedHashSet < Parser > ( 2 ) ; // LinkedSet to maintain order (genericParser should be last)
2011-06-01 21:31:56 +02:00
2009-07-14 13:01:05 +02:00
// check extension
2013-09-15 00:30:23 +02:00
String ext = MultiProtocolURL . getFileExtension ( url . getFileName ( ) ) ;
2012-06-12 01:42:58 +02:00
Set < Parser > idiom ;
2009-10-21 17:12:34 +02:00
if ( ext ! = null & & ext . length ( ) > 0 ) {
2010-06-29 21:20:45 +02:00
if ( denyExtensionx . containsKey ( ext ) ) throw new Parser . Failure ( " file extension ' " + ext + " ' is denied (1) " , url ) ;
2009-10-21 17:12:34 +02:00
idiom = ext2parser . get ( ext ) ;
2012-06-12 01:42:58 +02:00
if ( idiom ! = null ) idioms . addAll ( idiom ) ;
2009-10-21 17:12:34 +02:00
}
2011-06-01 21:31:56 +02:00
2009-10-21 17:12:34 +02:00
// check given mime type
if ( mimeType1 ! = null ) {
mimeType1 = normalizeMimeType ( mimeType1 ) ;
2010-06-29 21:20:45 +02:00
if ( denyMime . containsKey ( mimeType1 ) ) throw new Parser . Failure ( " mime type ' " + mimeType1 + " ' is denied (1) " , url ) ;
2009-10-21 17:12:34 +02:00
idiom = mime2parser . get ( mimeType1 ) ;
2012-06-12 01:42:58 +02:00
if ( idiom ! = null & & ! idioms . contains ( idiom ) ) idioms . addAll ( idiom ) ;
2009-10-21 17:12:34 +02:00
}
2011-06-01 21:31:56 +02:00
2009-10-21 17:12:34 +02:00
// check mime type computed from extension
2011-06-01 21:31:56 +02:00
final String mimeType2 = ext2mime . get ( ext ) ;
2012-06-12 01:42:58 +02:00
if ( mimeType2 ! = null & & ( idiom = mime2parser . get ( mimeType2 ) ) ! = null & & ! idioms . contains ( idiom ) ) idioms . addAll ( idiom ) ;
2011-06-01 21:31:56 +02:00
2013-07-23 20:24:13 +02:00
// always add the generic parser (make sure it is the last in access order)
2010-11-30 17:13:55 +01:00
idioms . add ( genericIdiom ) ;
//if (idioms.isEmpty()) throw new Parser.Failure("no parser found for extension '" + ext + "' and mime type '" + mimeType1 + "'", url);
2011-06-01 21:31:56 +02:00
2009-10-21 17:12:34 +02:00
return idioms ;
2009-07-14 13:01:05 +02:00
}
2012-01-17 16:43:34 +01:00
/ * *
* checks if the parser supports the given mime type . It is not only checked if the parser can parse such types ,
* it is also checked if the mime type is not included in the mimetype - deny list .
* @param mimeType
* @return an error if the mime type is not supported , null otherwise
* /
2009-07-14 13:01:05 +02:00
public static String supportsMime ( String mimeType ) {
if ( mimeType = = null ) return null ;
2009-07-10 18:15:33 +02:00
mimeType = normalizeMimeType ( mimeType ) ;
2010-04-21 15:46:02 +02:00
if ( denyMime . containsKey ( mimeType ) ) return " mime type ' " + mimeType + " ' is denied (2) " ;
2009-07-14 13:01:05 +02:00
if ( mime2parser . get ( mimeType ) = = null ) return " no parser for mime ' " + mimeType + " ' available " ;
return null ;
2009-07-10 16:22:17 +02:00
}
2010-12-02 12:05:04 +01:00
2012-01-17 16:43:34 +01:00
/ * *
* checks if the parser supports the given extension . It is not only checked if the parser can parse such files ,
* it is also checked if the extension is not included in the extension - deny list .
* @param extention
* @return an error if the extension is not supported , null otherwise
* /
public static String supportsExtension ( final String ext ) {
2012-07-10 22:59:03 +02:00
if ( ext = = null | | ext . isEmpty ( ) ) return null ;
2010-04-21 15:46:02 +02:00
if ( denyExtensionx . containsKey ( ext ) ) return " file extension ' " + ext + " ' is denied (2) " ;
2011-06-01 21:31:56 +02:00
final String mimeType = ext2mime . get ( ext ) ;
2009-07-14 13:01:05 +02:00
if ( mimeType = = null ) return " no parser available " ;
2012-06-12 01:42:58 +02:00
final Set < Parser > idiom = mime2parser . get ( mimeType ) ;
2009-07-14 13:01:05 +02:00
assert idiom ! = null ;
2012-07-10 22:59:03 +02:00
if ( idiom = = null | | idiom . isEmpty ( ) ) return " no parser available (internal error!) " ;
2009-07-14 13:01:05 +02:00
return null ;
2009-07-10 16:22:17 +02:00
}
2011-06-01 21:31:56 +02:00
2012-01-17 16:43:34 +01:00
/ * *
* checks if the parser supports the given extension . It is not only checked if the parser can parse such files ,
* it is also checked if the extension is not included in the extension - deny list .
* @param extention
* @return an error if the extension is not supported , null otherwise
* /
2013-09-15 00:30:23 +02:00
public static String supportsExtension ( final MultiProtocolURL url ) {
2014-02-05 03:45:02 +01:00
return supportsExtension ( MultiProtocolURL . getFileExtension ( url . getFileName ( ) ) ) ;
2012-01-17 16:43:34 +01:00
}
2013-09-15 00:30:23 +02:00
public static String mimeOf ( final MultiProtocolURL url ) {
return mimeOf ( MultiProtocolURL . getFileExtension ( url . getFileName ( ) ) ) ;
2009-07-10 16:22:17 +02:00
}
2011-06-01 21:31:56 +02:00
public static String mimeOf ( final String ext ) {
2010-04-21 15:46:02 +02:00
return ext2mime . get ( ext . toLowerCase ( ) ) ;
2009-07-10 16:22:17 +02:00
}
2011-06-01 21:31:56 +02:00
2009-07-10 16:22:17 +02:00
private static String normalizeMimeType ( String mimeType ) {
if ( mimeType = = null ) return " application/octet-stream " ;
2010-04-21 15:46:02 +02:00
mimeType = mimeType . toLowerCase ( ) ;
2009-07-10 16:22:17 +02:00
final int pos = mimeType . indexOf ( ';' ) ;
return ( ( pos < 0 ) ? mimeType . trim ( ) : mimeType . substring ( 0 , pos ) . trim ( ) ) ;
}
2011-06-01 21:31:56 +02:00
public static void setDenyMime ( final String denyList ) {
2009-07-10 16:22:17 +02:00
denyMime . clear ( ) ;
2009-11-11 16:49:53 +01:00
String n ;
2011-06-01 21:31:56 +02:00
for ( final String s : denyList . split ( " , " ) ) {
2009-11-11 16:49:53 +01:00
n = normalizeMimeType ( s ) ;
2010-04-22 00:35:47 +02:00
if ( n ! = null & & n . length ( ) > 0 ) denyMime . put ( n , v ) ;
2009-11-11 16:49:53 +01:00
}
2009-07-10 16:22:17 +02:00
}
2011-06-01 21:31:56 +02:00
2009-07-10 16:22:17 +02:00
public static String getDenyMime ( ) {
String s = " " ;
2011-06-01 21:31:56 +02:00
for ( final String d : denyMime . keySet ( ) ) s + = d + " , " ;
2012-07-10 22:59:03 +02:00
if ( ! s . isEmpty ( ) ) s = s . substring ( 0 , s . length ( ) - 1 ) ;
2009-07-10 16:22:17 +02:00
return s ;
}
2011-06-01 21:31:56 +02:00
public static void grantMime ( final String mime , final boolean grant ) {
final String n = normalizeMimeType ( mime ) ;
2012-07-10 22:59:03 +02:00
if ( n = = null | | n . isEmpty ( ) ) return ;
2010-04-22 00:35:47 +02:00
if ( grant ) denyMime . remove ( n ) ; else denyMime . put ( n , v ) ;
2009-07-10 16:22:17 +02:00
}
2011-06-01 21:31:56 +02:00
public static void setDenyExtension ( final String denyList ) {
2010-04-21 15:46:02 +02:00
denyExtensionx . clear ( ) ;
2011-06-01 21:31:56 +02:00
for ( final String s : denyList . split ( " , " ) ) denyExtensionx . put ( s , v ) ;
2009-07-14 13:01:05 +02:00
}
2011-06-01 21:31:56 +02:00
2009-07-14 13:01:05 +02:00
public static String getDenyExtension ( ) {
String s = " " ;
2011-06-01 21:31:56 +02:00
for ( final String d : denyExtensionx . keySet ( ) ) s + = d + " , " ;
2012-10-05 18:54:26 +02:00
if ( ! s . isEmpty ( ) ) s = s . substring ( 0 , s . length ( ) - 1 ) ;
2009-07-14 13:01:05 +02:00
return s ;
}
2011-06-01 21:31:56 +02:00
public static void grantExtension ( final String ext , final boolean grant ) {
2012-07-10 22:59:03 +02:00
if ( ext = = null | | ext . isEmpty ( ) ) return ;
2010-04-22 00:35:47 +02:00
if ( grant ) denyExtensionx . remove ( ext ) ; else denyExtensionx . put ( ext , v ) ;
2009-07-14 13:01:05 +02:00
}
2010-06-29 21:20:45 +02:00
2009-07-10 00:25:31 +02:00
}