yacy_search_server/source/de/anomic/plasma/parser/pdf/pdfParser.java

package de.anomic.plasma.parser.pdf;

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.net.URL;
import java.util.Arrays;
import java.util.HashSet;


import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdmodel.PDDocumentInformation;
import org.pdfbox.util.PDFTextStripper;

import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.parser.Parser;
import de.anomic.plasma.parser.ParserException;

/**
 * Content parser that extracts the plain text and the document-information
 * fields (title, subject, keywords) from a PDF using the pdfbox library and
 * wraps them in a {@link plasmaParserDocument}.
 */
public class pdfParser implements Parser
{

    /**
     * a list of mime types that are supported by this parser class
     *
     * NOTE(review): this set is mutable and shared through
     * {@link #getSupportedMimeTypes()}; callers should treat it as read-only.
     */
    public static final HashSet SUPPORTED_MIME_TYPES = new HashSet(Arrays.asList(new String[] {
        "application/pdf"
    }));

    public pdfParser() {
        super();
    }

    /**
     * @return the set of mime type strings this parser handles
     */
    public HashSet getSupportedMimeTypes() {
        return SUPPORTED_MIME_TYPES;
    }

    /**
     * Parses a PDF stored in a file.
     *
     * @param location   URL the document was loaded from (passed through to the result)
     * @param mimeType   mime type of the document (passed through to the result)
     * @param sourceFile file holding the PDF content
     * @return the parsed document
     * @throws ParserException if the file cannot be opened or its content cannot be parsed
     */
    public plasmaParserDocument parse(URL location, String mimeType, File sourceFile) throws ParserException {
        BufferedInputStream contentInputStream = null;
        try {
            contentInputStream = new BufferedInputStream(new FileInputStream(sourceFile));
            return this.parse(location, mimeType, contentInputStream);
        } catch (FileNotFoundException e) {
            // report the real cause instead of continuing with a null stream
            throw new ParserException("Unable to read the pdf file. " + e.getMessage());
        } finally {
            // this method opened the stream, so it is responsible for releasing the file handle
            if (contentInputStream != null) {
                try {
                    contentInputStream.close();
                } catch (java.io.IOException ignored) {
                    // nothing useful can be done if closing fails
                }
            }
        }
    }

    /**
     * Parses a PDF held in memory.
     *
     * @param location URL the document was loaded from (passed through to the result)
     * @param mimeType mime type of the document (passed through to the result)
     * @param source   the raw PDF bytes
     * @return the parsed document
     * @throws ParserException if the content cannot be parsed
     */
    public plasmaParserDocument parse(URL location, String mimeType, byte[] source) throws ParserException {
        ByteArrayInputStream contentInputStream = new ByteArrayInputStream(source);
        return this.parse(location,mimeType,contentInputStream);
    }

    /**
     * Parses a PDF read from a stream. The stream itself is not closed here;
     * that is left to the caller.
     *
     * @param location URL the document was loaded from (passed through to the result)
     * @param mimeType mime type of the document (passed through to the result)
     * @param source   stream delivering the raw PDF bytes
     * @return the parsed document
     * @throws ParserException if anything goes wrong while reading or parsing the content
     */
    public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException {

        PDDocument theDocument = null;
        try {
            String docTitle = null, docSubject = null, docKeyWords = null;

            PDFParser parser = new PDFParser(source);
            parser.parse();

            PDFTextStripper stripper = new PDFTextStripper();
            theDocument = parser.getPDDocument();

            PDDocumentInformation theDocInfo = theDocument.getDocumentInformation();

            if (theDocInfo != null)
            {
                docTitle = theDocInfo.getTitle();
                docSubject = theDocInfo.getSubject();
                docKeyWords = theDocInfo.getKeywords();
            }

            // NOTE(review): the writer uses the platform default charset, so the
            // resulting bytes differ between hosts - confirm which encoding the
            // consumer of plasmaParserDocument expects and pass it explicitly.
            ByteArrayOutputStream out = new ByteArrayOutputStream();
            OutputStreamWriter writer = new OutputStreamWriter( out );
            stripper.writeText(theDocument, writer );

            // closing flushes the encoder so that all text ends up in 'out'
            writer.close();

            byte[] contents = out.toByteArray();

            /*
             *         public document(URL location, String mimeType,
                            String keywords, String shortTitle, String longTitle,
                            String[] sections, String abstrct,
                            byte[] text, Map anchors, Map images) {
             *
             */
            return new plasmaParserDocument(
                    location,
                    mimeType,
                    docKeyWords,
                    docSubject,
                    docTitle,
                    null,
                    null,
                    contents,
                    null,
                    null);
        }
        catch (Exception e) {
            throw new ParserException("Unable to parse the pdf content. " + e.getMessage());
        }
        finally {
            // release the document on every path, also when stripping failed
            if (theDocument != null) {
                try {
                    theDocument.close();
                } catch (Exception ignored) {
                    // the text (if any) is already extracted; a failing close must not mask the real outcome
                }
            }
        }
    }

    /**
     * Currently a no-op: this parser keeps no state between parse calls.
     */
    public void reset() {
        // nothing to reset
    }

}
) adding a new package for extra content parsers ) adding content parser for - pdf (using the pdf-box library) - doc (using the textmining.org library) ) adding an Interface for content parsers ) adding a configuration file which can be used to configure which parser is used for which mimeType ) Semaphore class was moved and renamed to serverSemaphore ) Changing yacy shutdown behaviour Busy waiting loop for shutdown was removed and replaced with a blocking call (using the semaphore class mentioned above) to the new switchboard.waitForShutdown method. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@46 6c8d7289-2bf4-0310-a012-ef5d649a1542 2005-04-24 23:24:53 +02:00			`package de.anomic.plasma.parser.pdf;`

			`import java.io.BufferedInputStream;`
			`import java.io.ByteArrayInputStream;`
			`import java.io.ByteArrayOutputStream;`
			`import java.io.File;`
			`import java.io.FileInputStream;`
			`import java.io.FileNotFoundException;`
			`import java.io.InputStream;`
			`import java.io.OutputStreamWriter;`
			`import java.net.URL;`
			`import java.util.Arrays;`
			`import java.util.HashSet;`


			`import org.pdfbox.pdfparser.PDFParser;`
			`import org.pdfbox.pdmodel.PDDocument;`
			`import org.pdfbox.pdmodel.PDDocumentInformation;`
			`import org.pdfbox.util.PDFTextStripper;`

			`import de.anomic.plasma.plasmaParserDocument;`
			`import de.anomic.plasma.parser.Parser;`
			`import de.anomic.plasma.parser.ParserException;`

			`public class pdfParser implements Parser`
			`{`

			`/**`
			`* a list of mime types that are supported by this parser class`
			`*/`
*) I've accidentally used Java 5.0 syntax for enumerations git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@47 6c8d7289-2bf4-0310-a012-ef5d649a1542 2005-04-24 23:42:02 +02:00			`public static final HashSet SUPPORTED_MIME_TYPES = new HashSet(Arrays.asList(new String[] {`
) adding a new package for extra content parsers ) adding content parser for - pdf (using the pdf-box library) - doc (using the textmining.org library) ) adding an Interface for content parsers ) adding a configuration file which can be used to configure which parser is used for which mimeType ) Semaphore class was moved and renamed to serverSemaphore ) Changing yacy shutdown behaviour Busy waiting loop for shutdown was removed and replaced with a blocking call (using the semaphore class mentioned above) to the new switchboard.waitForShutdown method. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@46 6c8d7289-2bf4-0310-a012-ef5d649a1542 2005-04-24 23:24:53 +02:00			`new String("application/pdf")`
			`}));`

			`public pdfParser() {`
			`super();`
			`}`

			`public HashSet getSupportedMimeTypes() {`
			`return SUPPORTED_MIME_TYPES;`
			`}`

			`public plasmaParserDocument parse(URL location, String mimeType, File sourceFile) throws ParserException {`
			`BufferedInputStream contentInputStream = null;`
			`try {`
			`contentInputStream = new BufferedInputStream(new FileInputStream(sourceFile));`
			`} catch (FileNotFoundException e) {`
			`e.printStackTrace();`
			`}`
			`return this.parse(location, mimeType, contentInputStream);`
			`}`

			`public plasmaParserDocument parse(URL location, String mimeType, byte[] source) throws ParserException {`
			`ByteArrayInputStream contentInputStream = new ByteArrayInputStream(source);`
			`return this.parse(location,mimeType,contentInputStream);`
			`}`

			`public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException {`

			`try {`
			`String docTitle = null, docSubject = null, docAuthor = null, docKeyWords = null;`

			`PDFParser parser = new PDFParser(source);`
			`parser.parse();`

			`PDFTextStripper stripper = new PDFTextStripper();`
			`PDDocument theDocument = parser.getPDDocument();`

			`PDDocumentInformation theDocInfo = theDocument.getDocumentInformation();`

			`if (theDocInfo != null)`
			`{`
			`docTitle = theDocInfo.getTitle();`
			`docSubject = theDocInfo.getSubject();`
			`docAuthor = theDocInfo.getAuthor();`
			`docKeyWords = theDocInfo.getKeywords();`
			`}`

			`ByteArrayOutputStream out = new ByteArrayOutputStream();`
			`OutputStreamWriter writer = new OutputStreamWriter( out );`
			`stripper.writeText(theDocument, writer );`

			`writer.close();`
			`theDocument.close();`

			`byte[] contents = out.toByteArray();`

			`/*`
			`* public document(URL location, String mimeType,`
			`String keywords, String shortTitle, String longTitle,`
			`String[] sections, String abstrct,`
			`byte[] text, Map anchors, Map images) {`
			`*`
			`*/`
			`plasmaParserDocument theDoc = new plasmaParserDocument(`
			`location,`
			`mimeType,`
			`docKeyWords,`
			`docSubject,`
			`docTitle,`
			`null,`
			`null,`
			`contents,`
			`null,`
			`null);`

			`return theDoc;`
			`}`
			`catch (Exception e) {`
			`throw new ParserException("Unable to parse the pdf content. " + e.getMessage());`
			`}`
			`}`

			`public void reset() {`
			`// TODO Auto-generated method stub`

			`}`

			`}`