mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
Updated the pdf cache clearing steps to be consistent with the current pdfbox version
- Removed calls to the clearResources functions (on the PDFont class and its children), which no longer exist since the upgrade to pdfbox 2.n.n. - Removed the hacky usage of a protected internal ClassLoader function. This removes the warnings displayed when running with JDK9 or JDK10: [java] WARNING: Illegal reflective access by net.yacy.document.parser.pdfParser$ResourceCleaner (file:<path>) to method java.lang.ClassLoader.findLoadedClass(java.lang.String) [java] WARNING: Please consider reporting this to the maintainers of net.yacy.document.parser.pdfParser$ResourceCleaner [java] WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations [java] WARNING: All illegal access operations will be denied in a future release. Crawling thousands of pdf documents from various sources after applying these modifications revealed no new memory leak related to pdfbox (measurements done with JVisualVM).
This commit is contained in:
parent
685122363d
commit
54fbe166ba
|
@ -32,7 +32,6 @@ import java.io.FileInputStream;
|
|||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.lang.reflect.Method;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
|
@ -40,6 +39,7 @@ import java.util.Date;
|
|||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.pdfbox.cos.COSName;
|
||||
import org.apache.pdfbox.io.MemoryUsageSetting;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
|
||||
|
@ -81,10 +81,6 @@ public class pdfParser extends AbstractParser implements Parser {
|
|||
this.SUPPORTED_MIME_TYPES.add("text/x-pdf");
|
||||
}
|
||||
|
||||
static {
|
||||
clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes(); // must be called here to get that into the class loader; it will block other threads otherwise;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Document[] parse(
|
||||
final DigestURL location,
|
||||
|
@ -249,18 +245,9 @@ public class pdfParser extends AbstractParser implements Parser {
|
|||
try {pdfDoc.close();} catch (final Throwable e) {}
|
||||
}
|
||||
|
||||
// clear resources in pdfbox. they say that is resolved but it's not. see:
|
||||
// https://issues.apache.org/jira/browse/PDFBOX-313
|
||||
// https://issues.apache.org/jira/browse/PDFBOX-351
|
||||
// https://issues.apache.org/jira/browse/PDFBOX-441
|
||||
// the pdfbox still generates enormeous number of object allocations and don't delete these
|
||||
// the following Object are statically stored and never flushed:
|
||||
// COSFloat, COSArray, COSInteger, COSObjectKey, COSObject, COSDictionary,
|
||||
// COSStream, COSString, COSName, COSDocument, COSInteger[], COSNull
|
||||
// the great number of these objects can easily be seen in Java Visual VM
|
||||
// we try to get this shit out of the memory here by forced clear calls, hope the best the rubbish gets out.
|
||||
// clear cached resources in pdfbox.
|
||||
pdfDoc = null;
|
||||
clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes();
|
||||
clearPdfBoxCaches();
|
||||
|
||||
return result;
|
||||
}
|
||||
|
@ -295,55 +282,35 @@ public class pdfParser extends AbstractParser implements Parser {
|
|||
return linkCollections;
|
||||
}
|
||||
|
||||
public static void clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes() {
|
||||
// thank you very much, PDFParser hackers, this font cache will occupy >80MB RAM for a single pdf and then stays forever
|
||||
// AND I DO NOT EVEN NEED A FONT HERE TO PARSE THE TEXT!
|
||||
// Don't be so ignorant, just google once "PDFParser OutOfMemoryError" to feel the pain.
|
||||
ResourceCleaner cl = new ResourceCleaner();
|
||||
cl.clearClassResources("org.apache.pdfbox.cos.COSName");
|
||||
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDFont");
|
||||
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDType1Font");
|
||||
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDTrueTypeFont");
|
||||
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDType0Font");
|
||||
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDType1AfmPfbFont");
|
||||
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDType3Font");
|
||||
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDType1CFont");
|
||||
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDCIDFont");
|
||||
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDCIDFontType0Font");
|
||||
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDCIDFontType2Font");
|
||||
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDMMType1Font");
|
||||
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDSimpleFont");
|
||||
/**
|
||||
* Clean up cache resources allocated by PDFBox that would otherwise not be released.
|
||||
*/
|
||||
public static void clearPdfBoxCaches() {
|
||||
/*
|
||||
* Prior to pdfbox 2.0.0 font cache occupied > 80MB RAM for a single pdf and
|
||||
* then stayed forever (detected in YaCy with pdfbox version 1.2.1). The
|
||||
* situation is now far better, but one (unnecessary?) cache structure in
|
||||
* the COSName class still needs to be explicitly cleared.
|
||||
*/
|
||||
|
||||
// History of related issues :
|
||||
// http://markmail.org/thread/quk5odee4hbsauhu
|
||||
// https://issues.apache.org/jira/browse/PDFBOX-313
|
||||
// https://issues.apache.org/jira/browse/PDFBOX-351
|
||||
// https://issues.apache.org/jira/browse/PDFBOX-441
|
||||
// https://issues.apache.org/jira/browse/PDFBOX-2200
|
||||
// https://issues.apache.org/jira/browse/PDFBOX-2149
|
||||
|
||||
COSName.clearResources();
|
||||
|
||||
/*
|
||||
* Prior to PDFBox 2.0.0, clearResources() function had to be called on the
|
||||
* org.apache.pdfbox.pdmodel.font.PDFont class and its children. After version
|
||||
* 2.0.0, there is no longer such a function in the PDFont class, as the font cache is
|
||||
* handled differently and hopefully more properly.
|
||||
*/
|
||||
}
|
||||
|
||||
@SuppressWarnings({ "unchecked", "rawtypes" })
|
||||
private static class ResourceCleaner {
|
||||
Method findLoadedClass;
|
||||
private ClassLoader sys;
|
||||
public ResourceCleaner() {
|
||||
try {
|
||||
this.findLoadedClass = ClassLoader.class.getDeclaredMethod("findLoadedClass", new Class[] { String.class });
|
||||
this.findLoadedClass.setAccessible(true);
|
||||
this.sys = ClassLoader.getSystemClassLoader();
|
||||
} catch (Throwable e) {
|
||||
e.printStackTrace();
|
||||
this.findLoadedClass = null;
|
||||
this.sys = null;
|
||||
}
|
||||
}
|
||||
public void clearClassResources(String name) {
|
||||
if (this.findLoadedClass == null) return;
|
||||
try {
|
||||
Object pdfparserpainclass = this.findLoadedClass.invoke(this.sys, name);
|
||||
if (pdfparserpainclass != null) {
|
||||
Method clearResources = ((Class) pdfparserpainclass).getDeclaredMethod("clearResources", new Class[] {});
|
||||
if (clearResources != null) clearResources.invoke(null);
|
||||
}
|
||||
} catch (Throwable e) {
|
||||
//e.printStackTrace();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* test
|
||||
* @param args
|
||||
|
|
|
@ -2454,7 +2454,7 @@ public final class Switchboard extends serverSwitch {
|
|||
|
||||
public static void clearCaches() {
|
||||
// flush caches in used libraries
|
||||
pdfParser.clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes(); // eats up megabytes, see http://markmail.org/thread/quk5odee4hbsauhu
|
||||
pdfParser.clearPdfBoxCaches();
|
||||
|
||||
// clear caches
|
||||
if (WordCache.sizeCommonWords() > 1000) WordCache.clearCommonWords();
|
||||
|
|
Loading…
Reference in New Issue
Block a user