slightly changed way of pdf parser integration

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7124 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
orbiter 2010-09-08 07:32:47 +00:00
parent 6d83c7cb62
commit c0b08ac59b

View File

@ -36,7 +36,6 @@ import java.io.UnsupportedEncodingException;
import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.exceptions.CryptographyException; import org.apache.pdfbox.exceptions.CryptographyException;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation; import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission; import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
@ -71,12 +70,13 @@ public class pdfParser extends AbstractParser implements Parser {
// create a pdf parser // create a pdf parser
PDDocument pdfDoc = null; PDDocument pdfDoc = null;
final PDFParser pdfParser; //final PDFParser pdfParser;
try { try {
Thread.currentThread().setPriority(Thread.MIN_PRIORITY); Thread.currentThread().setPriority(Thread.MIN_PRIORITY);
pdfParser = new PDFParser(source); pdfDoc = PDDocument.load(source);
pdfParser.parse(); //pdfParser = new PDFParser(source);
pdfDoc = pdfParser.getPDDocument(); //pdfParser.parse();
//pdfDoc = pdfParser.getPDDocument();
} catch (IOException e) { } catch (IOException e) {
if (pdfDoc != null) try {pdfDoc.close();} catch (IOException ee) {} if (pdfDoc != null) try {pdfDoc.close();} catch (IOException ee) {}
throw new Parser.Failure(e.getMessage(), location); throw new Parser.Failure(e.getMessage(), location);
@ -103,15 +103,20 @@ public class pdfParser extends AbstractParser implements Parser {
} }
// extracting some metadata // extracting some metadata
final PDDocumentInformation theDocInfo = pdfDoc.getDocumentInformation(); final PDDocumentInformation info = pdfDoc.getDocumentInformation();
String docTitle = null, docSubject = null, docAuthor = null, docPublisher = null, docKeywordStr = null; String docTitle = null, docSubject = null, docAuthor = null, docPublisher = null, docKeywordStr = null;
if (theDocInfo != null) { if (info != null) {
docTitle = theDocInfo.getTitle(); docTitle = info.getTitle();
docSubject = theDocInfo.getSubject(); docSubject = info.getSubject();
docAuthor = theDocInfo.getAuthor(); docAuthor = info.getAuthor();
docPublisher = theDocInfo.getProducer(); docPublisher = info.getProducer();
docKeywordStr = theDocInfo.getKeywords(); if (docPublisher == null || docPublisher.length() == 0) docPublisher = info.getCreator();
} docKeywordStr = info.getKeywords();
// unused:
// info.getTrapped());
// info.getCreationDate());
// info.getModificationDate();
}
CharBuffer writer = null; CharBuffer writer = null;
try { try {
@ -175,7 +180,7 @@ public class pdfParser extends AbstractParser implements Parser {
null, null,
false)}; false)};
} }
/** /**
* test * test
* @param args * @param args