slightly changed the way the pdf parser is integrated

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7124 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
orbiter 2010-09-08 07:32:47 +00:00
parent 6d83c7cb62
commit c0b08ac59b

View File

@ -36,7 +36,6 @@ import java.io.UnsupportedEncodingException;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.exceptions.CryptographyException;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
@ -71,12 +70,13 @@ public class pdfParser extends AbstractParser implements Parser {
// create a pdf parser
PDDocument pdfDoc = null;
final PDFParser pdfParser;
//final PDFParser pdfParser;
try {
Thread.currentThread().setPriority(Thread.MIN_PRIORITY);
pdfParser = new PDFParser(source);
pdfParser.parse();
pdfDoc = pdfParser.getPDDocument();
pdfDoc = PDDocument.load(source);
//pdfParser = new PDFParser(source);
//pdfParser.parse();
//pdfDoc = pdfParser.getPDDocument();
} catch (IOException e) {
if (pdfDoc != null) try {pdfDoc.close();} catch (IOException ee) {}
throw new Parser.Failure(e.getMessage(), location);
@ -103,14 +103,19 @@ public class pdfParser extends AbstractParser implements Parser {
}
// extracting some metadata
final PDDocumentInformation theDocInfo = pdfDoc.getDocumentInformation();
final PDDocumentInformation info = pdfDoc.getDocumentInformation();
String docTitle = null, docSubject = null, docAuthor = null, docPublisher = null, docKeywordStr = null;
if (theDocInfo != null) {
docTitle = theDocInfo.getTitle();
docSubject = theDocInfo.getSubject();
docAuthor = theDocInfo.getAuthor();
docPublisher = theDocInfo.getProducer();
docKeywordStr = theDocInfo.getKeywords();
if (info != null) {
docTitle = info.getTitle();
docSubject = info.getSubject();
docAuthor = info.getAuthor();
docPublisher = info.getProducer();
if (docPublisher == null || docPublisher.length() == 0) docPublisher = info.getCreator();
docKeywordStr = info.getKeywords();
// unused:
// info.getTrapped());
// info.getCreationDate());
// info.getModificationDate();
}
CharBuffer writer = null;