2009-07-10 00:25:31 +02:00
// Parser.java
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 09.07.2009 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2009-03-20 16:44:59 +0100 (Fr, 20 Mrz 2009) $
// $LastChangedRevision: 5736 $
// $LastChangedBy: borg-0300 $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
2009-10-18 02:53:43 +02:00
package net.yacy.document ;
2009-07-10 00:25:31 +02:00
import java.io.BufferedInputStream ;
import java.io.ByteArrayInputStream ;
import java.io.File ;
import java.io.FileInputStream ;
import java.io.InputStream ;
2009-07-10 16:22:17 +02:00
import java.text.Collator ;
2009-10-21 17:12:34 +02:00
import java.util.ArrayList ;
2010-01-19 15:59:58 +01:00
import java.util.HashMap ;
2009-07-10 16:22:17 +02:00
import java.util.HashSet ;
2009-10-21 17:12:34 +02:00
import java.util.List ;
2009-07-10 16:22:17 +02:00
import java.util.Locale ;
import java.util.Map ;
import java.util.Set ;
import java.util.TreeMap ;
import java.util.TreeSet ;
2009-07-10 00:25:31 +02:00
2009-10-18 02:53:43 +02:00
import net.yacy.document.parser.bzipParser ;
2009-11-03 21:10:59 +01:00
import net.yacy.document.parser.csvParser ;
2009-10-18 02:53:43 +02:00
import net.yacy.document.parser.docParser ;
import net.yacy.document.parser.gzipParser ;
import net.yacy.document.parser.htmlParser ;
import net.yacy.document.parser.odtParser ;
import net.yacy.document.parser.ooxmlParser ;
import net.yacy.document.parser.pdfParser ;
import net.yacy.document.parser.pptParser ;
import net.yacy.document.parser.psParser ;
import net.yacy.document.parser.rssParser ;
import net.yacy.document.parser.rtfParser ;
import net.yacy.document.parser.sevenzipParser ;
import net.yacy.document.parser.swfParser ;
import net.yacy.document.parser.tarParser ;
2010-01-04 17:07:31 +01:00
import net.yacy.document.parser.torrentParser ;
2009-10-18 02:53:43 +02:00
import net.yacy.document.parser.vcfParser ;
import net.yacy.document.parser.vsdParser ;
import net.yacy.document.parser.xlsParser ;
import net.yacy.document.parser.zipParser ;
2009-11-20 00:22:53 +01:00
import net.yacy.document.parser.images.bmpParser ;
2009-10-20 00:34:44 +02:00
import net.yacy.document.parser.images.genericImageParser ;
2009-10-11 02:12:19 +02:00
import net.yacy.kelondro.data.meta.DigestURI ;
2009-10-10 01:13:30 +02:00
import net.yacy.kelondro.logging.Log ;
2009-07-10 00:25:31 +02:00
2009-10-20 00:34:44 +02:00
public final class TextParser {
2009-07-10 00:25:31 +02:00
2009-07-10 16:22:17 +02:00
/** Logger used for all messages written by this class. */
private static final Log log = new Log(" PARSER ");

// Use a collator to relax when distinguishing between lowercase and uppercase letters.
// It is the comparator of every lookup table and deny list declared below.
private static final Collator insensitiveCollator = Collator.getInstance(Locale.US);
static {
    insensitiveCollator.setStrength(Collator.SECONDARY);
    insensitiveCollator.setDecomposition(Collator.NO_DECOMPOSITION);
}

/** Normalized mime type -> parser that was registered for it. */
private static final Map<String, Idiom> mime2parser = new TreeMap<String, Idiom>(insensitiveCollator);

/** File extension -> parser that was registered for it. */
private static final Map<String, Idiom> ext2parser = new TreeMap<String, Idiom>(insensitiveCollator);

/** File extension -> first (prototype) mime type of the parser that registered the extension. */
private static final Map<String, String> ext2mime = new TreeMap<String, String>(insensitiveCollator);

/** Mime types that must not be parsed. */
private static final Set<String> denyMime = new TreeSet<String>(insensitiveCollator);

/** File extensions that must not be parsed. */
private static final Set<String> denyExtension = new TreeSet<String>(insensitiveCollator);
2009-07-10 00:25:31 +02:00
// Register the built-in parsers. Every call enters the mime types and the file
// extensions of one parser into the lookup tables. When two parsers claim the
// same mime type or extension, initParser logs it and the later one wins, so
// the order of the calls below is significant.
static {
    // image formats and compressed containers
    initParser(new bmpParser());
    initParser(new bzipParser());

    // text and office documents
    initParser(new csvParser());
    initParser(new docParser());
    initParser(new gzipParser());
    initParser(new htmlParser());
    initParser(new genericImageParser());
    initParser(new odtParser());
    initParser(new ooxmlParser());
    initParser(new pdfParser());
    initParser(new pptParser());
    initParser(new psParser());
    initParser(new rssParser());
    initParser(new rtfParser());

    // archives and remaining formats
    initParser(new sevenzipParser());
    initParser(new swfParser());
    initParser(new tarParser());
    initParser(new torrentParser());
    initParser(new vcfParser());
    initParser(new vsdParser());
    initParser(new xlsParser());
    initParser(new zipParser());
}
2009-07-10 16:22:17 +02:00
public static Set < Idiom > idioms ( ) {
Set < Idiom > c = new HashSet < Idiom > ( ) ;
2010-01-04 17:07:31 +01:00
c . addAll ( ext2parser . values ( ) ) ;
2009-07-10 16:22:17 +02:00
c . addAll ( mime2parser . values ( ) ) ;
return c ;
}
private static void initParser ( Idiom parser ) {
2009-07-14 13:01:05 +02:00
String prototypeMime = null ;
for ( String mime : parser . supportedMimeTypes ( ) ) {
2009-07-10 16:22:17 +02:00
// process the mime types
2009-07-14 13:01:05 +02:00
final String mimeType = normalizeMimeType ( mime ) ;
if ( prototypeMime = = null ) prototypeMime = mimeType ;
2009-07-10 16:22:17 +02:00
Idiom p0 = mime2parser . get ( mimeType ) ;
2009-07-10 18:15:33 +02:00
if ( p0 ! = null ) log . logSevere ( " parser for mime ' " + mimeType + " ' was set to ' " + p0 . getName ( ) + " ', overwriting with new parser ' " + parser . getName ( ) + " '. " ) ;
2009-07-10 16:22:17 +02:00
mime2parser . put ( mimeType , parser ) ;
Log . logInfo ( " PARSER " , " Parser for mime type ' " + mimeType + " ': " + parser . getName ( ) ) ;
2009-07-15 16:15:51 +02:00
}
if ( prototypeMime ! = null ) for ( String ext : parser . supportedExtensions ( ) ) {
String s = ext2mime . get ( ext ) ;
if ( s ! = null ) log . logSevere ( " parser for extension ' " + ext + " ' was set to mime ' " + s + " ', overwriting with new mime ' " + prototypeMime + " '. " ) ;
ext2mime . put ( ext , prototypeMime ) ;
2009-07-10 00:25:31 +02:00
}
2009-10-21 17:12:34 +02:00
for ( String ext : parser . supportedExtensions ( ) ) {
// process the extensions
Idiom p0 = ext2parser . get ( ext ) ;
if ( p0 ! = null ) log . logSevere ( " parser for extension ' " + ext + " ' was set to ' " + p0 . getName ( ) + " ', overwriting with new parser ' " + parser . getName ( ) + " '. " ) ;
ext2parser . put ( ext , parser ) ;
Log . logInfo ( " PARSER " , " Parser for extension ' " + ext + " ': " + parser . getName ( ) ) ;
}
2009-07-10 00:25:31 +02:00
}
2009-10-05 22:11:41 +02:00
public static Document parseSource (
2009-10-11 02:12:19 +02:00
final DigestURI location ,
2009-10-05 22:11:41 +02:00
final String mimeType ,
final String charset ,
final byte [ ] sourceArray
) throws InterruptedException , ParserException {
2009-07-10 00:25:31 +02:00
ByteArrayInputStream byteIn = null ;
try {
2009-07-10 16:22:17 +02:00
if ( log . isFine ( ) ) log . logFine ( " Parsing ' " + location + " ' from byte-array " ) ;
2009-07-10 00:25:31 +02:00
if ( sourceArray = = null | | sourceArray . length = = 0 ) {
final String errorMsg = " No resource content available (1) " + ( ( ( sourceArray = = null ) ? " source == null " : " source.length() == 0 " ) + " , url = " + location . toNormalform ( true , false ) ) ;
2009-07-10 16:22:17 +02:00
log . logInfo ( " Unable to parse ' " + location + " '. " + errorMsg ) ;
2009-07-10 18:15:33 +02:00
throw new ParserException ( errorMsg , location ) ;
2009-07-10 00:25:31 +02:00
}
byteIn = new ByteArrayInputStream ( sourceArray ) ;
return parseSource ( location , mimeType , charset , sourceArray . length , byteIn ) ;
} catch ( final Exception e ) {
if ( e instanceof InterruptedException ) throw ( InterruptedException ) e ;
if ( e instanceof ParserException ) throw ( ParserException ) e ;
2009-07-10 16:22:17 +02:00
log . logSevere ( " Unexpected exception in parseSource from byte-array: " + e . getMessage ( ) , e ) ;
2009-07-10 18:15:33 +02:00
throw new ParserException ( " Unexpected exception: " + e . getMessage ( ) , location ) ;
2009-07-10 00:25:31 +02:00
} finally {
if ( byteIn ! = null ) try {
byteIn . close ( ) ;
} catch ( final Exception ex ) { }
}
}
2009-07-14 13:01:05 +02:00
public static Document parseSource (
2009-10-11 02:12:19 +02:00
final DigestURI location ,
2009-07-14 13:01:05 +02:00
final String mimeType ,
final String charset ,
final File sourceFile
) throws InterruptedException , ParserException {
2009-07-10 00:25:31 +02:00
BufferedInputStream sourceStream = null ;
try {
2009-07-10 16:22:17 +02:00
if ( log . isFine ( ) ) log . logFine ( " Parsing ' " + location + " ' from file " ) ;
2009-10-05 22:11:41 +02:00
if ( ! sourceFile . exists ( ) | | ! sourceFile . canRead ( ) | | sourceFile . length ( ) = = 0 ) {
2009-07-10 00:25:31 +02:00
final String errorMsg = sourceFile . exists ( ) ? " Empty resource file. " : " No resource content available (2). " ;
2009-07-10 16:22:17 +02:00
log . logInfo ( " Unable to parse ' " + location + " '. " + errorMsg ) ;
2009-07-10 18:15:33 +02:00
throw new ParserException ( errorMsg , location ) ;
2009-07-10 00:25:31 +02:00
}
sourceStream = new BufferedInputStream ( new FileInputStream ( sourceFile ) ) ;
return parseSource ( location , mimeType , charset , sourceFile . length ( ) , sourceStream ) ;
} catch ( final Exception e ) {
if ( e instanceof InterruptedException ) throw ( InterruptedException ) e ;
if ( e instanceof ParserException ) throw ( ParserException ) e ;
2009-07-10 16:22:17 +02:00
log . logSevere ( " Unexpected exception in parseSource from File: " + e . getMessage ( ) , e ) ;
2009-07-10 18:15:33 +02:00
throw new ParserException ( " Unexpected exception: " + e . getMessage ( ) , location ) ;
2009-07-10 00:25:31 +02:00
} finally {
if ( sourceStream ! = null ) try {
sourceStream . close ( ) ;
} catch ( final Exception ex ) { }
}
}
2009-07-14 13:01:05 +02:00
public static Document parseSource (
2009-10-11 02:12:19 +02:00
final DigestURI location ,
2009-07-14 13:01:05 +02:00
String mimeType ,
final String charset ,
final long contentLength ,
final InputStream sourceStream
) throws InterruptedException , ParserException {
2009-10-21 17:12:34 +02:00
if ( log . isFine ( ) ) log . logFine ( " Parsing ' " + location + " ' from stream " ) ;
mimeType = normalizeMimeType ( mimeType ) ;
final String fileExt = location . getFileExtension ( ) ;
final String documentCharset = htmlParser . patchCharsetEncoding ( charset ) ;
2010-01-22 14:21:37 +01:00
List < Idiom > idioms = null ;
try {
idioms = idiomParser ( location , mimeType ) ;
} catch ( ParserException e ) {
final String errorMsg = " Parser Failure for extension ' " + location . getFileExtension ( ) + " ' or mimetype ' " + mimeType + " ': " + e . getMessage ( ) ;
log . logWarning ( errorMsg ) ;
2009-10-21 17:12:34 +02:00
throw new ParserException ( errorMsg , location ) ;
}
2010-01-22 14:21:37 +01:00
assert ! idioms . isEmpty ( ) ;
2009-10-21 17:12:34 +02:00
if ( log . isFine ( ) ) log . logInfo ( " Parsing " + location + " with mimeType ' " + mimeType + " ' and file extension ' " + fileExt + " '. " ) ;
Document doc = null ;
2010-01-19 15:59:58 +01:00
HashMap < Idiom , ParserException > failedParser = new HashMap < Idiom , ParserException > ( ) ;
2009-10-21 17:12:34 +02:00
for ( Idiom parser : idioms ) {
2009-07-14 13:01:05 +02:00
parser . setContentLength ( contentLength ) ;
2009-10-21 17:12:34 +02:00
try {
doc = parser . parse ( location , mimeType , documentCharset , sourceStream ) ;
} catch ( ParserException e ) {
2010-01-19 15:59:58 +01:00
failedParser . put ( parser , e ) ;
//log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e);
2009-07-10 00:25:31 +02:00
}
2009-10-21 17:12:34 +02:00
if ( doc ! = null ) break ;
}
if ( doc = = null ) {
2010-01-19 15:59:58 +01:00
if ( failedParser . size ( ) = = 0 ) {
final String errorMsg = " Parsing content with file extension ' " + location . getFileExtension ( ) + " ' and mimetype ' " + mimeType + " ' failed. " ;
//log.logWarning("Unable to parse '" + location + "'. " + errorMsg);
throw new ParserException ( errorMsg , location ) ;
} else {
String failedParsers = " " ;
for ( Map . Entry < Idiom , ParserException > error : failedParser . entrySet ( ) ) {
log . logWarning ( " tried parser ' " + error . getKey ( ) . getName ( ) + " ' to parse " + location . toNormalform ( true , false ) + " but failed: " + error . getValue ( ) . getMessage ( ) , error . getValue ( ) ) ;
failedParsers + = error . getKey ( ) . getName ( ) + " " ;
}
throw new ParserException ( " All parser failed: " + failedParsers , location ) ;
}
2009-07-10 00:25:31 +02:00
}
2009-10-21 17:12:34 +02:00
return doc ;
2009-07-10 00:25:31 +02:00
}
2009-07-14 13:01:05 +02:00
/ * *
* check if the parser supports the given content .
* @param url
* @param mimeType
2009-10-23 00:38:04 +02:00
* @return returns null if the content is supported . If the content is not supported , return a error string .
2009-07-14 13:01:05 +02:00
* /
2009-10-11 02:12:19 +02:00
public static String supports ( final DigestURI url , String mimeType ) {
2009-07-14 13:01:05 +02:00
try {
// try to get a parser. If this works, we don't need the parser itself, we just return null to show that everything is ok.
2009-10-23 00:38:04 +02:00
List < Idiom > idioms = idiomParser ( url , mimeType ) ;
2009-12-02 01:37:59 +01:00
return ( idioms = = null | | idioms . isEmpty ( ) ) ? " no parser found " : null ;
2009-07-14 13:01:05 +02:00
} catch ( ParserException e ) {
// in case that a parser is not available, return a error string describing the problem.
return e . getMessage ( ) ;
}
}
2009-10-21 17:12:34 +02:00
/ * *
* find a parser for a given url and mime type
* because mime types returned by web severs are sometimes wrong , we also compute the mime type again
* from the extension that can be extracted from the url path . That means that there are 3 criteria
* that can be used to select a parser :
* - the given extension
* - the given mime type
* - the mime type computed from the extension
* @param url the given url
* @param mimeType the given mime type
* @return a list of Idiom parsers that may be appropriate for the given criteria
* @throws ParserException
* /
private static List < Idiom > idiomParser ( final DigestURI url , String mimeType1 ) throws ParserException {
List < Idiom > idioms = new ArrayList < Idiom > ( 2 ) ;
2009-07-14 13:01:05 +02:00
// check extension
String ext = url . getFileExtension ( ) ;
2009-10-21 17:12:34 +02:00
Idiom idiom ;
if ( ext ! = null & & ext . length ( ) > 0 ) {
2009-11-11 16:49:53 +01:00
if ( denyExtension . contains ( ext ) ) throw new ParserException ( " file extension ' " + ext + " ' is denied (1) " , url ) ;
2009-10-21 17:12:34 +02:00
idiom = ext2parser . get ( ext ) ;
if ( idiom ! = null ) idioms . add ( idiom ) ;
}
// check given mime type
if ( mimeType1 ! = null ) {
mimeType1 = normalizeMimeType ( mimeType1 ) ;
2009-11-11 16:49:53 +01:00
if ( denyMime . contains ( mimeType1 ) ) throw new ParserException ( " mime type ' " + mimeType1 + " ' is denied (1) " , url ) ;
2009-10-21 17:12:34 +02:00
idiom = mime2parser . get ( mimeType1 ) ;
if ( idiom ! = null & & ! idioms . contains ( idiom ) ) idioms . add ( idiom ) ;
}
// check mime type computed from extension
String mimeType2 = ext2mime . get ( ext ) ;
if ( mimeType2 = = null | | denyMime . contains ( mimeType2 ) ) return idioms ; // in this case we are a bit more lazy
idiom = mime2parser . get ( mimeType2 ) ;
if ( idiom ! = null & & ! idioms . contains ( idiom ) ) idioms . add ( idiom ) ;
2010-01-22 14:21:37 +01:00
// finall check if we found any parser
if ( idioms . isEmpty ( ) ) throw new ParserException ( " no parser found for extension ' " + ext + " ' and mime type ' " + mimeType1 + " ' " , url ) ;
2009-10-21 17:12:34 +02:00
return idioms ;
2009-07-14 13:01:05 +02:00
}
public static String supportsMime ( String mimeType ) {
if ( mimeType = = null ) return null ;
2009-07-10 18:15:33 +02:00
mimeType = normalizeMimeType ( mimeType ) ;
2009-11-11 16:49:53 +01:00
if ( denyMime . contains ( mimeType ) ) return " mime type ' " + mimeType + " ' is denied (2) " ;
2009-07-14 13:01:05 +02:00
if ( mime2parser . get ( mimeType ) = = null ) return " no parser for mime ' " + mimeType + " ' available " ;
return null ;
2009-07-10 16:22:17 +02:00
}
2009-10-11 02:12:19 +02:00
public static String supportsExtension ( final DigestURI url ) {
2009-07-10 16:22:17 +02:00
String ext = url . getFileExtension ( ) ;
2009-07-14 13:01:05 +02:00
if ( ext = = null | | ext . length ( ) = = 0 ) return null ;
2009-11-11 16:49:53 +01:00
if ( denyExtension . contains ( ext ) ) return " file extension ' " + ext + " ' is denied (2) " ;
2009-07-14 13:01:05 +02:00
String mimeType = ext2mime . get ( ext ) ;
if ( mimeType = = null ) return " no parser available " ;
Idiom idiom = mime2parser . get ( mimeType ) ;
assert idiom ! = null ;
if ( idiom = = null ) return " no parser available (internal error!) " ;
return null ;
2009-07-10 16:22:17 +02:00
}
2009-10-11 02:12:19 +02:00
public static String mimeOf ( DigestURI url ) {
2009-07-10 16:22:17 +02:00
return mimeOf ( url . getFileExtension ( ) ) ;
}
public static String mimeOf ( String ext ) {
2009-07-14 13:01:05 +02:00
return ext2mime . get ( ext ) ;
2009-07-10 16:22:17 +02:00
}
private static String normalizeMimeType ( String mimeType ) {
if ( mimeType = = null ) return " application/octet-stream " ;
final int pos = mimeType . indexOf ( ';' ) ;
return ( ( pos < 0 ) ? mimeType . trim ( ) : mimeType . substring ( 0 , pos ) . trim ( ) ) ;
}
public static void setDenyMime ( String denyList ) {
denyMime . clear ( ) ;
2009-11-11 16:49:53 +01:00
String n ;
for ( String s : denyList . split ( " , " ) ) {
n = normalizeMimeType ( s ) ;
if ( n ! = null & & n . length ( ) > 0 ) denyMime . add ( n ) ;
}
2009-07-10 16:22:17 +02:00
}
public static String getDenyMime ( ) {
String s = " " ;
for ( String d : denyMime ) s + = d + " , " ;
s = s . substring ( 0 , s . length ( ) - 1 ) ;
return s ;
}
public static void grantMime ( String mime , boolean grant ) {
2009-11-11 16:49:53 +01:00
String n = normalizeMimeType ( mime ) ;
if ( n = = null | | n . length ( ) = = 0 ) return ;
if ( grant ) denyMime . remove ( n ) ; else denyMime . add ( n ) ;
2009-07-10 16:22:17 +02:00
}
2009-07-14 13:01:05 +02:00
public static void setDenyExtension ( String denyList ) {
denyExtension . clear ( ) ;
for ( String s : denyList . split ( " , " ) ) denyExtension . add ( s ) ;
}
public static String getDenyExtension ( ) {
String s = " " ;
for ( String d : denyExtension ) s + = d + " , " ;
s = s . substring ( 0 , s . length ( ) - 1 ) ;
return s ;
}
/**
 * Grants or denies a single file extension.
 *
 * @param ext the file extension; a null or empty extension is ignored
 * @param grant if true the extension is removed from the deny list,
 *        if false it is added to the deny list
 */
public static void grantExtension(String ext, boolean grant) {
    // same guard as in grantMime; the deny set is ordered by a Collator
    // which throws a NullPointerException for a null element
    if (ext == null || ext.length() == 0) return;
    if (grant) {
        denyExtension.remove(ext);
    } else {
        denyExtension.add(ext);
    }
}
2009-07-10 00:25:31 +02:00
}