2011-06-01 21:31:56 +02:00
//pdfParser.java
2005-04-24 23:47:34 +02:00
//------------------------
//part of YaCy
2008-07-20 19:14:51 +02:00
//(C) by Michael Peter Christen; mc@yacy.net
2005-04-24 23:47:34 +02:00
//first published on http://www.anomic.de
//Frankfurt, Germany, 2005
//
//this file is contributed by Martin Thelian
2009-07-09 22:13:11 +02:00
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
2005-04-24 23:47:34 +02:00
//
//This program is free software; you can redistribute it and/or modify
//it under the terms of the GNU General Public License as published by
//the Free Software Foundation; either version 2 of the License, or
//(at your option) any later version.
//
//This program is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//GNU General Public License for more details.
//
//You should have received a copy of the GNU General Public License
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
2009-10-18 02:53:43 +02:00
package net.yacy.document.parser ;
2005-04-24 23:24:53 +02:00
2006-09-30 11:31:53 +02:00
import java.io.File ;
2010-06-29 21:20:45 +02:00
import java.io.FileInputStream ;
import java.io.FileNotFoundException ;
2008-05-08 18:55:45 +02:00
import java.io.IOException ;
2005-04-24 23:24:53 +02:00
import java.io.InputStream ;
2009-10-15 11:51:16 +02:00
2011-06-01 21:31:56 +02:00
import org.apache.pdfbox.cos.COSName ;
import org.apache.pdfbox.exceptions.CryptographyException ;
import org.apache.pdfbox.pdmodel.PDDocument ;
import org.apache.pdfbox.pdmodel.PDDocumentInformation ;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission ;
import org.apache.pdfbox.pdmodel.encryption.BadSecurityHandlerException ;
import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial ;
2013-05-29 16:09:05 +02:00
import org.apache.pdfbox.pdmodel.font.PDCIDFont ;
import org.apache.pdfbox.pdmodel.font.PDCIDFontType0Font ;
import org.apache.pdfbox.pdmodel.font.PDCIDFontType2Font ;
2011-06-01 21:31:56 +02:00
import org.apache.pdfbox.pdmodel.font.PDFont ;
2013-05-29 16:09:05 +02:00
import org.apache.pdfbox.pdmodel.font.PDMMType1Font ;
import org.apache.pdfbox.pdmodel.font.PDSimpleFont ;
import org.apache.pdfbox.pdmodel.font.PDTrueTypeFont ;
import org.apache.pdfbox.pdmodel.font.PDType0Font ;
import org.apache.pdfbox.pdmodel.font.PDType1AfmPfbFont ;
import org.apache.pdfbox.pdmodel.font.PDType1CFont ;
import org.apache.pdfbox.pdmodel.font.PDType1Font ;
import org.apache.pdfbox.pdmodel.font.PDType3Font ;
2011-06-01 21:31:56 +02:00
import org.apache.pdfbox.util.PDFTextStripper ;
2009-10-10 01:13:30 +02:00
2012-12-27 04:16:31 +01:00
import net.yacy.cora.document.MultiProtocolURI ;
2013-07-09 14:28:25 +02:00
import net.yacy.cora.util.ConcurrentLog ;
2012-12-27 04:16:31 +01:00
import net.yacy.document.AbstractParser ;
import net.yacy.document.Document ;
import net.yacy.document.Parser ;
import net.yacy.kelondro.data.meta.DigestURI ;
import net.yacy.kelondro.io.CharBuffer ;
import net.yacy.kelondro.util.FileUtils ;
import net.yacy.kelondro.util.MemoryControl ;
2005-04-24 23:24:53 +02:00
2010-12-27 16:37:11 +01:00
public class pdfParser extends AbstractParser implements Parser {
2011-06-01 21:31:56 +02:00
public pdfParser ( ) {
2010-06-29 21:20:45 +02:00
super ( " Acrobat Portable Document Parser " ) ;
2011-06-01 21:31:56 +02:00
this . SUPPORTED_EXTENSIONS . add ( " pdf " ) ;
this . SUPPORTED_MIME_TYPES . add ( " application/pdf " ) ;
this . SUPPORTED_MIME_TYPES . add ( " application/x-pdf " ) ;
this . SUPPORTED_MIME_TYPES . add ( " application/acrobat " ) ;
this . SUPPORTED_MIME_TYPES . add ( " applications/vnd.pdf " ) ;
this . SUPPORTED_MIME_TYPES . add ( " text/pdf " ) ;
this . SUPPORTED_MIME_TYPES . add ( " text/x-pdf " ) ;
2009-07-14 13:01:05 +02:00
}
2011-06-01 21:31:56 +02:00
2013-06-12 00:17:44 +02:00
static {
clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes ( ) ; // must be called here to get that into the class loader; it will block other threads otherwise;
}
2012-01-10 03:02:17 +01:00
@Override
2012-06-11 16:48:53 +02:00
public Document [ ] parse ( final DigestURI location , final String mimeType , final String charset , final InputStream source ) throws Parser . Failure , InterruptedException {
2011-06-01 21:31:56 +02:00
// check memory for parser
2012-09-26 14:03:51 +02:00
if ( ! MemoryControl . request ( 200 * 1024 * 1024 , false ) )
2011-06-01 21:31:56 +02:00
throw new Parser . Failure ( " Not enough Memory available for pdf parser: " + MemoryControl . available ( ) , location ) ;
2009-11-06 23:41:37 +01:00
// create a pdf parser
2013-05-29 16:09:05 +02:00
PDDocument pdfDoc ;
2010-09-08 09:32:47 +02:00
//final PDFParser pdfParser;
2009-11-06 20:47:24 +01:00
try {
2013-05-29 16:09:05 +02:00
Thread . currentThread ( ) . setPriority ( Thread . MIN_PRIORITY ) ; // the pdfparser is a big pain
2010-09-08 09:32:47 +02:00
pdfDoc = PDDocument . load ( source ) ;
2013-05-29 16:09:05 +02:00
//PDFParser pdfParser = new PDFParser(source);
2010-09-08 09:32:47 +02:00
//pdfParser.parse();
//pdfDoc = pdfParser.getPDDocument();
2011-06-01 21:31:56 +02:00
} catch ( final IOException e ) {
2010-06-29 21:20:45 +02:00
throw new Parser . Failure ( e . getMessage ( ) , location ) ;
2009-11-06 20:47:24 +01:00
} finally {
Thread . currentThread ( ) . setPriority ( Thread . NORM_PRIORITY ) ;
}
2011-06-01 21:31:56 +02:00
2010-06-29 21:20:45 +02:00
if ( pdfDoc . isEncrypted ( ) ) {
2009-11-06 20:47:24 +01:00
try {
2010-06-29 21:20:45 +02:00
pdfDoc . openProtection ( new StandardDecryptionMaterial ( " " ) ) ;
2011-06-01 21:31:56 +02:00
} catch ( final BadSecurityHandlerException e ) {
try { pdfDoc . close ( ) ; } catch ( final IOException ee ) { }
2010-06-29 21:20:45 +02:00
throw new Parser . Failure ( " Document is encrypted (1): " + e . getMessage ( ) , location ) ;
2011-06-01 21:31:56 +02:00
} catch ( final IOException e ) {
try { pdfDoc . close ( ) ; } catch ( final IOException ee ) { }
2010-06-29 21:20:45 +02:00
throw new Parser . Failure ( " Document is encrypted (2): " + e . getMessage ( ) , location ) ;
2011-06-01 21:31:56 +02:00
} catch ( final CryptographyException e ) {
try { pdfDoc . close ( ) ; } catch ( final IOException ee ) { }
2010-06-29 21:20:45 +02:00
throw new Parser . Failure ( " Document is encrypted (3): " + e . getMessage ( ) , location ) ;
2006-09-30 11:31:53 +02:00
}
2010-06-29 21:20:45 +02:00
final AccessPermission perm = pdfDoc . getCurrentAccessPermission ( ) ;
2009-11-06 20:47:24 +01:00
if ( perm = = null | | ! perm . canExtractContent ( ) )
2011-06-01 21:31:56 +02:00
throw new Parser . Failure ( " Document is encrypted and cannot be decrypted " , location ) ;
2009-11-06 20:47:24 +01:00
}
2011-06-01 21:31:56 +02:00
2009-11-06 20:47:24 +01:00
// extracting some metadata
2013-05-29 16:09:05 +02:00
PDDocumentInformation info = pdfDoc . getDocumentInformation ( ) ;
2010-05-11 13:14:05 +02:00
String docTitle = null , docSubject = null , docAuthor = null , docPublisher = null , docKeywordStr = null ;
2010-09-08 09:32:47 +02:00
if ( info ! = null ) {
docTitle = info . getTitle ( ) ;
docSubject = info . getSubject ( ) ;
docAuthor = info . getAuthor ( ) ;
docPublisher = info . getProducer ( ) ;
2012-07-10 22:59:03 +02:00
if ( docPublisher = = null | | docPublisher . isEmpty ( ) ) docPublisher = info . getCreator ( ) ;
2010-09-08 09:32:47 +02:00
docKeywordStr = info . getKeywords ( ) ;
// unused:
// info.getTrapped());
// info.getCreationDate());
// info.getModificationDate();
}
2013-05-29 16:09:05 +02:00
info = null ;
2011-06-01 21:31:56 +02:00
2012-07-10 22:59:03 +02:00
if ( docTitle = = null | | docTitle . isEmpty ( ) ) {
2010-12-27 16:37:11 +01:00
docTitle = MultiProtocolURI . unescape ( location . getFileName ( ) ) ;
}
2012-01-24 20:42:30 +01:00
final CharBuffer writer = new CharBuffer ( odtParser . MAX_DOCSIZE ) ;
byte [ ] contentBytes = new byte [ 0 ] ;
2009-11-06 20:47:24 +01:00
try {
2010-04-15 00:17:18 +02:00
// create a writer for output
2012-01-24 20:42:30 +01:00
final PDFTextStripper stripper = new PDFTextStripper ( ) ;
2012-01-23 00:46:02 +01:00
2012-01-23 20:58:36 +01:00
stripper . setEndPage ( 3 ) ; // get first 3 pages (always)
writer . append ( stripper . getText ( pdfDoc ) ) ;
2012-01-23 00:46:02 +01:00
contentBytes = writer . getBytes ( ) ; // remember text in case of interrupting thread
2012-01-24 20:42:30 +01:00
2012-01-23 20:58:36 +01:00
stripper . setStartPage ( 4 ) ; // continue with page 4 (terminated, resulting in no text)
stripper . setEndPage ( Integer . MAX_VALUE ) ; // set to default
2011-09-15 13:17:38 +02:00
// we start the pdf parsing in a separate thread to ensure that it can be terminated
2013-05-29 16:09:05 +02:00
final PDDocument pdfDocC = pdfDoc ;
2011-09-15 13:17:38 +02:00
final Thread t = new Thread ( ) {
2012-01-10 03:02:17 +01:00
@Override
2011-09-15 13:17:38 +02:00
public void run ( ) {
2012-07-02 09:51:43 +02:00
Thread . currentThread ( ) . setName ( " pdfParser.getText: " + location ) ;
2012-01-24 20:42:30 +01:00
try {
2013-05-29 16:09:05 +02:00
writer . append ( stripper . getText ( pdfDocC ) ) ;
2011-09-15 13:17:38 +02:00
} catch ( final Throwable e ) { }
2012-02-02 07:37:00 +01:00
}
} ;
2011-09-15 13:17:38 +02:00
t . start ( ) ;
t . join ( 3000 ) ;
2012-01-24 20:42:30 +01:00
if ( t . isAlive ( ) ) t . interrupt ( ) ;
2012-01-23 00:46:02 +01:00
pdfDoc . close ( ) ;
2012-02-02 07:37:00 +01:00
contentBytes = writer . getBytes ( ) ; // get final text before closing writer
2012-08-05 15:49:27 +02:00
} catch ( final Throwable e ) {
2011-06-05 22:04:41 +02:00
// close the writer
if ( writer ! = null ) try { writer . close ( ) ; } catch ( final Exception ex ) { }
2012-08-05 15:49:27 +02:00
try { pdfDoc . close ( ) ; } catch ( final Throwable ee ) { }
2011-06-05 22:04:41 +02:00
//throw new Parser.Failure(e.getMessage(), location);
2010-06-29 21:20:45 +02:00
} finally {
2012-08-05 15:49:27 +02:00
try { pdfDoc . close ( ) ; } catch ( final Throwable e ) { }
2012-02-02 07:37:00 +01:00
writer . close ( ) ;
2009-11-06 20:47:24 +01:00
}
2010-06-21 16:54:54 +02:00
2009-11-06 20:47:24 +01:00
String [ ] docKeywords = null ;
2010-12-27 16:37:11 +01:00
if ( docKeywordStr ! = null ) {
docKeywords = docKeywordStr . split ( " |, " ) ;
}
if ( docTitle = = null ) {
docTitle = docSubject ;
}
2012-02-02 07:37:00 +01:00
2010-09-07 19:13:47 +02:00
// clear resources in pdfbox. they say that is resolved but it's not. see:
// https://issues.apache.org/jira/browse/PDFBOX-313
// https://issues.apache.org/jira/browse/PDFBOX-351
// https://issues.apache.org/jira/browse/PDFBOX-441
// the pdfbox still generates enormeous number of object allocations and don't delete these
// the following Object are statically stored and never flushed:
// COSFloat, COSArray, COSInteger, COSObjectKey, COSObject, COSDictionary,
// COSStream, COSString, COSName, COSDocument, COSInteger[], COSNull
// the great number of these objects can easily be seen in Java Visual VM
// we try to get this shit out of the memory here by forced clear calls, hope the best the rubbish gets out.
2013-05-29 16:09:05 +02:00
pdfDoc = null ;
clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes ( ) ;
2010-06-29 21:20:45 +02:00
return new Document [ ] { new Document (
2010-06-26 12:32:47 +02:00
location ,
mimeType ,
" UTF-8 " ,
2011-04-21 15:58:49 +02:00
this ,
2010-06-26 12:32:47 +02:00
null ,
docKeywords ,
2012-08-31 10:30:43 +02:00
singleList ( docTitle ) ,
2010-06-26 12:32:47 +02:00
docAuthor ,
docPublisher ,
null ,
null ,
2011-06-01 21:31:56 +02:00
0 . 0f , 0 . 0f ,
2010-06-26 12:32:47 +02:00
contentBytes ,
null ,
null ,
2010-08-25 20:24:54 +02:00
null ,
2010-06-29 21:20:45 +02:00
false ) } ;
2005-04-24 23:24:53 +02:00
}
2010-09-08 09:32:47 +02:00
2013-05-29 16:09:05 +02:00
@SuppressWarnings ( " static-access " )
public static void clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes ( ) {
// thank you very much, PDFParser hackers, this font cache will occupy >80MB RAM for a single pdf and then stays forever
// AND I DO NOT EVEN NEED A FONT HERE TO PARSE THE TEXT!
// Don't be so ignorant, just google once "PDFParser OutOfMemoryError" to feel the pain.
PDFont . clearResources ( ) ;
COSName . clearResources ( ) ;
PDType1Font . clearResources ( ) ;
PDTrueTypeFont . clearResources ( ) ;
PDType0Font . clearResources ( ) ;
PDType1AfmPfbFont . clearResources ( ) ;
PDType3Font . clearResources ( ) ;
PDType1CFont . clearResources ( ) ;
PDCIDFont . clearResources ( ) ;
PDCIDFontType0Font . clearResources ( ) ;
PDCIDFontType2Font . clearResources ( ) ;
PDMMType1Font . clearResources ( ) ;
PDSimpleFont . clearResources ( ) ;
}
2008-05-08 18:55:45 +02:00
/ * *
* test
* @param args
* /
2008-08-02 14:12:04 +02:00
public static void main ( final String [ ] args ) {
2010-06-29 21:20:45 +02:00
if ( args . length > 0 & & args [ 0 ] . length ( ) > 0 ) {
2008-05-08 18:55:45 +02:00
// file
final File pdfFile = new File ( args [ 0 ] ) ;
if ( pdfFile . canRead ( ) ) {
2011-06-01 21:31:56 +02:00
2008-05-08 18:55:45 +02:00
System . out . println ( pdfFile . getAbsolutePath ( ) ) ;
final long startTime = System . currentTimeMillis ( ) ;
2011-06-01 21:31:56 +02:00
2008-05-08 18:55:45 +02:00
// parse
final AbstractParser parser = new pdfParser ( ) ;
2009-07-08 23:48:08 +02:00
Document document = null ;
2008-05-08 18:55:45 +02:00
try {
2010-06-29 21:20:45 +02:00
document = Document . mergeDocuments ( null , " application/pdf " , parser . parse ( null , " application/pdf " , null , new FileInputStream ( pdfFile ) ) ) ;
} catch ( final Parser . Failure e ) {
System . err . println ( " Cannot parse file " + pdfFile . getAbsolutePath ( ) ) ;
2013-07-09 14:28:25 +02:00
ConcurrentLog . logException ( e ) ;
2008-08-02 14:12:04 +02:00
} catch ( final InterruptedException e ) {
2008-05-08 18:55:45 +02:00
System . err . println ( " Interrupted while parsing! " ) ;
2013-07-09 14:28:25 +02:00
ConcurrentLog . logException ( e ) ;
2008-08-02 14:12:04 +02:00
} catch ( final NoClassDefFoundError e ) {
2008-05-08 18:55:45 +02:00
System . err . println ( " class not found: " + e . getMessage ( ) ) ;
2011-06-01 21:31:56 +02:00
} catch ( final FileNotFoundException e ) {
2013-07-09 14:28:25 +02:00
ConcurrentLog . logException ( e ) ;
2008-05-08 18:55:45 +02:00
}
2011-06-01 21:31:56 +02:00
2008-05-08 18:55:45 +02:00
// statistics
System . out . println ( " \ ttime elapsed: " + ( System . currentTimeMillis ( ) - startTime ) + " ms " ) ;
2011-06-01 21:31:56 +02:00
2008-05-08 18:55:45 +02:00
// output
2010-06-29 21:20:45 +02:00
if ( document = = null ) {
2008-05-08 18:55:45 +02:00
System . out . println ( " \ t!!!Parsing without result!!! " ) ;
} else {
System . out . println ( " \ tParsed text with " + document . getTextLength ( ) + " chars of text and " + document . getAnchors ( ) . size ( ) + " anchors " ) ;
try {
// write file
2012-07-04 21:15:10 +02:00
FileUtils . copy ( document . getTextStream ( ) , new File ( " parsedPdf.txt " ) ) ;
2008-08-02 14:12:04 +02:00
} catch ( final IOException e ) {
2008-05-08 18:55:45 +02:00
System . err . println ( " error saving parsed document " ) ;
2013-07-09 14:28:25 +02:00
ConcurrentLog . logException ( e ) ;
2008-05-08 18:55:45 +02:00
}
}
} else {
System . err . println ( " Cannot read file " + pdfFile . getAbsolutePath ( ) ) ;
}
} else {
System . out . println ( " Please give a filename as first argument. " ) ;
}
}
2005-04-24 23:24:53 +02:00
}