mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
slightly changed way of pdf parser integration
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7124 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
parent
6d83c7cb62
commit
c0b08ac59b
|
@ -36,7 +36,6 @@ import java.io.UnsupportedEncodingException;
|
||||||
|
|
||||||
import org.apache.pdfbox.cos.COSName;
|
import org.apache.pdfbox.cos.COSName;
|
||||||
import org.apache.pdfbox.exceptions.CryptographyException;
|
import org.apache.pdfbox.exceptions.CryptographyException;
|
||||||
import org.apache.pdfbox.pdfparser.PDFParser;
|
|
||||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||||
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
|
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
|
||||||
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
|
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
|
||||||
|
@ -71,12 +70,13 @@ public class pdfParser extends AbstractParser implements Parser {
|
||||||
|
|
||||||
// create a pdf parser
|
// create a pdf parser
|
||||||
PDDocument pdfDoc = null;
|
PDDocument pdfDoc = null;
|
||||||
final PDFParser pdfParser;
|
//final PDFParser pdfParser;
|
||||||
try {
|
try {
|
||||||
Thread.currentThread().setPriority(Thread.MIN_PRIORITY);
|
Thread.currentThread().setPriority(Thread.MIN_PRIORITY);
|
||||||
pdfParser = new PDFParser(source);
|
pdfDoc = PDDocument.load(source);
|
||||||
pdfParser.parse();
|
//pdfParser = new PDFParser(source);
|
||||||
pdfDoc = pdfParser.getPDDocument();
|
//pdfParser.parse();
|
||||||
|
//pdfDoc = pdfParser.getPDDocument();
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
if (pdfDoc != null) try {pdfDoc.close();} catch (IOException ee) {}
|
if (pdfDoc != null) try {pdfDoc.close();} catch (IOException ee) {}
|
||||||
throw new Parser.Failure(e.getMessage(), location);
|
throw new Parser.Failure(e.getMessage(), location);
|
||||||
|
@ -103,15 +103,20 @@ public class pdfParser extends AbstractParser implements Parser {
|
||||||
}
|
}
|
||||||
|
|
||||||
// extracting some metadata
|
// extracting some metadata
|
||||||
final PDDocumentInformation theDocInfo = pdfDoc.getDocumentInformation();
|
final PDDocumentInformation info = pdfDoc.getDocumentInformation();
|
||||||
String docTitle = null, docSubject = null, docAuthor = null, docPublisher = null, docKeywordStr = null;
|
String docTitle = null, docSubject = null, docAuthor = null, docPublisher = null, docKeywordStr = null;
|
||||||
if (theDocInfo != null) {
|
if (info != null) {
|
||||||
docTitle = theDocInfo.getTitle();
|
docTitle = info.getTitle();
|
||||||
docSubject = theDocInfo.getSubject();
|
docSubject = info.getSubject();
|
||||||
docAuthor = theDocInfo.getAuthor();
|
docAuthor = info.getAuthor();
|
||||||
docPublisher = theDocInfo.getProducer();
|
docPublisher = info.getProducer();
|
||||||
docKeywordStr = theDocInfo.getKeywords();
|
if (docPublisher == null || docPublisher.length() == 0) docPublisher = info.getCreator();
|
||||||
}
|
docKeywordStr = info.getKeywords();
|
||||||
|
// unused:
|
||||||
|
// info.getTrapped());
|
||||||
|
// info.getCreationDate());
|
||||||
|
// info.getModificationDate();
|
||||||
|
}
|
||||||
|
|
||||||
CharBuffer writer = null;
|
CharBuffer writer = null;
|
||||||
try {
|
try {
|
||||||
|
@ -175,7 +180,7 @@ public class pdfParser extends AbstractParser implements Parser {
|
||||||
null,
|
null,
|
||||||
false)};
|
false)};
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* test
|
* test
|
||||||
* @param args
|
* @param args
|
||||||
|
|
Loading…
Reference in New Issue
Block a user