package de.anomic.plasma.parser.pdf; import java.io.BufferedInputStream; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.InputStream; import java.io.OutputStreamWriter; import java.net.URL; import java.util.Arrays; import java.util.HashSet; import org.pdfbox.pdfparser.PDFParser; import org.pdfbox.pdmodel.PDDocument; import org.pdfbox.pdmodel.PDDocumentInformation; import org.pdfbox.util.PDFTextStripper; import de.anomic.plasma.plasmaParserDocument; import de.anomic.plasma.parser.Parser; import de.anomic.plasma.parser.ParserException; public class pdfParser implements Parser { /** * a list of mime types that are supported by this parser class */ public static final HashSet SUPPORTED_MIME_TYPES = new HashSet(Arrays.asList(new String[] { new String("application/pdf") })); public pdfParser() { super(); } public HashSet getSupportedMimeTypes() { return SUPPORTED_MIME_TYPES; } public plasmaParserDocument parse(URL location, String mimeType, File sourceFile) throws ParserException { BufferedInputStream contentInputStream = null; try { contentInputStream = new BufferedInputStream(new FileInputStream(sourceFile)); } catch (FileNotFoundException e) { e.printStackTrace(); } return this.parse(location, mimeType, contentInputStream); } public plasmaParserDocument parse(URL location, String mimeType, byte[] source) throws ParserException { ByteArrayInputStream contentInputStream = new ByteArrayInputStream(source); return this.parse(location,mimeType,contentInputStream); } public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException { try { String docTitle = null, docSubject = null, docAuthor = null, docKeyWords = null; PDFParser parser = new PDFParser(source); parser.parse(); PDFTextStripper stripper = new PDFTextStripper(); PDDocument theDocument = parser.getPDDocument(); PDDocumentInformation theDocInfo = theDocument.getDocumentInformation(); if (theDocInfo != null) { docTitle = theDocInfo.getTitle(); docSubject = theDocInfo.getSubject(); docAuthor = theDocInfo.getAuthor(); docKeyWords = theDocInfo.getKeywords(); } ByteArrayOutputStream out = new ByteArrayOutputStream(); OutputStreamWriter writer = new OutputStreamWriter( out ); stripper.writeText(theDocument, writer ); writer.close(); theDocument.close(); byte[] contents = out.toByteArray(); /* * public document(URL location, String mimeType, String keywords, String shortTitle, String longTitle, String[] sections, String abstrct, byte[] text, Map anchors, Map images) { * */ plasmaParserDocument theDoc = new plasmaParserDocument( location, mimeType, docKeyWords, docSubject, docTitle, null, null, contents, null, null); return theDoc; } catch (Exception e) { throw new ParserException("Unable to parse the pdf content. " + e.getMessage()); } } public void reset() { // TODO Auto-generated method stub } }