mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-21 00:00:13 +02:00
7bcfa033c9
Cache access shall no longer be made directly to the cache; all loading attempts shall use the LoaderDispatcher. To control the usage of the cache, an enum instance from CrawlProfile.CacheStrategy shall be used. Some direct loading methods that did not use a cache strategy have been removed. This also affects the verify option of the yacysearch servlet. After this commit, 'verify=false' does not necessarily mean that no snippets are generated. Instead, all snippets that can be retrieved using only the cache are presented. This still means that the search hit was not verified, because the snippet was generated from the cache. If cache-based generation of a snippet is not possible, verify=false ensures that the link is not rejected. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6936 6c8d7289-2bf4-0310-a012-ef5d649a1542
265 lines
10 KiB
Java
265 lines
10 KiB
Java
//pdfParser.java
|
|
//------------------------
|
|
//part of YaCy
|
|
//(C) by Michael Peter Christen; mc@yacy.net
|
|
//first published on http://www.anomic.de
|
|
//Frankfurt, Germany, 2005
|
|
//
|
|
//this file is contributed by Martin Thelian
|
|
//
|
|
// $LastChangedDate$
|
|
// $LastChangedRevision$
|
|
// $LastChangedBy$
|
|
//
|
|
//This program is free software; you can redistribute it and/or modify
|
|
//it under the terms of the GNU General Public License as published by
|
|
//the Free Software Foundation; either version 2 of the License, or
|
|
//(at your option) any later version.
|
|
//
|
|
//This program is distributed in the hope that it will be useful,
|
|
//but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
//GNU General Public License for more details.
|
|
//
|
|
//You should have received a copy of the GNU General Public License
|
|
//along with this program; if not, write to the Free Software
|
|
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
|
|
package net.yacy.document.parser;
|
|
|
|
import java.io.File;
|
|
import java.io.IOException;
|
|
import java.io.InputStream;
|
|
import java.io.UnsupportedEncodingException;
|
|
import java.io.Writer;
|
|
import java.util.HashSet;
|
|
import java.util.Set;
|
|
|
|
import org.apache.pdfbox.exceptions.CryptographyException;
|
|
import org.apache.pdfbox.pdfparser.PDFParser;
|
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
|
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
|
|
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
|
|
import org.apache.pdfbox.pdmodel.encryption.BadSecurityHandlerException;
|
|
import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
|
|
import org.apache.pdfbox.util.PDFTextStripper;
|
|
|
|
import net.yacy.cora.document.MultiProtocolURI;
|
|
import net.yacy.document.AbstractParser;
|
|
import net.yacy.document.Document;
|
|
import net.yacy.document.Idiom;
|
|
import net.yacy.document.ParserException;
|
|
import net.yacy.kelondro.io.CharBuffer;
|
|
import net.yacy.kelondro.logging.Log;
|
|
import net.yacy.kelondro.util.FileUtils;
|
|
|
|
|
|
public class pdfParser extends AbstractParser implements Idiom {
|
|
|
|
/**
|
|
* a list of mime types that are supported by this parser class
|
|
* @see #getSupportedMimeTypes()
|
|
*/
|
|
public static final Set<String> SUPPORTED_MIME_TYPES = new HashSet<String>();
|
|
public static final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>();
|
|
static {
|
|
SUPPORTED_EXTENSIONS.add("pdf");
|
|
SUPPORTED_MIME_TYPES.add("application/pdf");
|
|
SUPPORTED_MIME_TYPES.add("application/x-pdf");
|
|
SUPPORTED_MIME_TYPES.add("application/acrobat");
|
|
SUPPORTED_MIME_TYPES.add("applications/vnd.pdf");
|
|
SUPPORTED_MIME_TYPES.add("text/pdf");
|
|
SUPPORTED_MIME_TYPES.add("text/x-pdf");
|
|
}
|
|
|
|
public pdfParser() {
|
|
super("Acrobat Portable Document Parser");
|
|
}
|
|
|
|
public Set<String> supportedMimeTypes() {
|
|
return SUPPORTED_MIME_TYPES;
|
|
}
|
|
|
|
public Set<String> supportedExtensions() {
|
|
return SUPPORTED_EXTENSIONS;
|
|
}
|
|
|
|
public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
|
|
|
|
// create a pdf parser
|
|
final PDDocument theDocument;
|
|
final PDFParser parser;
|
|
try {
|
|
Thread.currentThread().setPriority(Thread.MIN_PRIORITY);
|
|
parser = new PDFParser(source);
|
|
parser.parse();
|
|
theDocument = parser.getPDDocument();
|
|
} catch (IOException e) {
|
|
Log.logException(e);
|
|
throw new ParserException(e.getMessage(), location);
|
|
} finally {
|
|
Thread.currentThread().setPriority(Thread.NORM_PRIORITY);
|
|
}
|
|
|
|
checkInterruption();
|
|
|
|
if (theDocument.isEncrypted()) {
|
|
try {
|
|
theDocument.openProtection(new StandardDecryptionMaterial(""));
|
|
} catch (BadSecurityHandlerException e) {
|
|
Log.logException(e);
|
|
throw new ParserException("Document is encrypted (1): " + e.getMessage(), location);
|
|
} catch (IOException e) {
|
|
Log.logException(e);
|
|
throw new ParserException("Document is encrypted (2): " + e.getMessage(), location);
|
|
} catch (CryptographyException e) {
|
|
Log.logException(e);
|
|
throw new ParserException("Document is encrypted (3): " + e.getMessage(), location);
|
|
}
|
|
final AccessPermission perm = theDocument.getCurrentAccessPermission();
|
|
if (perm == null || !perm.canExtractContent())
|
|
throw new ParserException("Document is encrypted and cannot decrypted", location);
|
|
}
|
|
|
|
// extracting some metadata
|
|
final PDDocumentInformation theDocInfo = theDocument.getDocumentInformation();
|
|
String docTitle = null, docSubject = null, docAuthor = null, docPublisher = null, docKeywordStr = null;
|
|
if (theDocInfo != null) {
|
|
docTitle = theDocInfo.getTitle();
|
|
docSubject = theDocInfo.getSubject();
|
|
docAuthor = theDocInfo.getAuthor();
|
|
docPublisher = theDocInfo.getProducer();
|
|
docKeywordStr = theDocInfo.getKeywords();
|
|
}
|
|
|
|
Writer writer = null;
|
|
File writerFile = null;
|
|
PDFTextStripper stripper = null;
|
|
try {
|
|
// create a writer for output
|
|
writer = new CharBuffer();
|
|
stripper = new PDFTextStripper();
|
|
stripper.writeText(theDocument, writer); // may throw a NPE
|
|
theDocument.close();
|
|
writer.close();
|
|
} catch (IOException e) {
|
|
Log.logException(e);
|
|
// close the writer
|
|
if (writer != null) try { writer.close(); } catch (final Exception ex) {}
|
|
|
|
// delete the file
|
|
if (writerFile != null) FileUtils.deletedelete(writerFile);
|
|
throw new ParserException(e.getMessage(), location);
|
|
}
|
|
|
|
String[] docKeywords = null;
|
|
if (docKeywordStr != null) docKeywords = docKeywordStr.split(" |,");
|
|
|
|
Document theDoc = null;
|
|
if (docTitle == null) docTitle = docSubject;
|
|
|
|
if (writer instanceof CharBuffer) {
|
|
byte[] contentBytes;
|
|
try {
|
|
contentBytes = ((CharBuffer) writer).toString().getBytes("UTF-8");
|
|
} catch (UnsupportedEncodingException e) {
|
|
Log.logException(e);
|
|
throw new ParserException(e.getMessage(), location);
|
|
}
|
|
theDoc = new Document(
|
|
location,
|
|
mimeType,
|
|
"UTF-8",
|
|
null,
|
|
docKeywords,
|
|
docTitle,
|
|
docAuthor,
|
|
docPublisher,
|
|
null,
|
|
null,
|
|
contentBytes,
|
|
null,
|
|
null,
|
|
false);
|
|
} else {
|
|
theDoc = new Document(
|
|
location,
|
|
mimeType,
|
|
"UTF-8",
|
|
null,
|
|
docKeywords,
|
|
docTitle,
|
|
docAuthor,
|
|
docPublisher,
|
|
null,
|
|
null,
|
|
writerFile,
|
|
null,
|
|
null,
|
|
false);
|
|
}
|
|
|
|
return theDoc;
|
|
}
|
|
|
|
@Override
|
|
public void reset() {
|
|
// Nothing todo here at the moment
|
|
super.reset();
|
|
}
|
|
|
|
/**
|
|
* test
|
|
* @param args
|
|
*/
|
|
public static void main(final String[] args) {
|
|
if(args.length > 0 && args[0].length() > 0) {
|
|
// file
|
|
final File pdfFile = new File(args[0]);
|
|
if(pdfFile.canRead()) {
|
|
|
|
System.out.println(pdfFile.getAbsolutePath());
|
|
final long startTime = System.currentTimeMillis();
|
|
|
|
// parse
|
|
final AbstractParser parser = new pdfParser();
|
|
Document document = null;
|
|
try {
|
|
document = parser.parse(null, "application/pdf", null, pdfFile);
|
|
|
|
} catch (final ParserException e) {
|
|
System.err.println("Cannot parse file "+ pdfFile.getAbsolutePath());
|
|
Log.logException(e);
|
|
} catch (final InterruptedException e) {
|
|
System.err.println("Interrupted while parsing!");
|
|
Log.logException(e);
|
|
} catch (final NoClassDefFoundError e) {
|
|
System.err.println("class not found: " + e.getMessage());
|
|
}
|
|
|
|
// statistics
|
|
System.out.println("\ttime elapsed: " + (System.currentTimeMillis() - startTime) + " ms");
|
|
|
|
// output
|
|
if(document == null) {
|
|
System.out.println("\t!!!Parsing without result!!!");
|
|
} else {
|
|
System.out.println("\tParsed text with " + document.getTextLength() + " chars of text and " + document.getAnchors().size() + " anchors");
|
|
try {
|
|
// write file
|
|
FileUtils.copy(document.getText(), new File("parsedPdf.txt"));
|
|
} catch (final IOException e) {
|
|
System.err.println("error saving parsed document");
|
|
Log.logException(e);
|
|
}
|
|
}
|
|
} else {
|
|
System.err.println("Cannot read file "+ pdfFile.getAbsolutePath());
|
|
}
|
|
} else {
|
|
System.out.println("Please give a filename as first argument.");
|
|
}
|
|
}
|
|
|
|
}
|