extend snapshot Html2Image.pdf2image to use PDFBox image export capability

Used if no external tool is installed (and always on Windows). The resulting JPGs are not always perfect (if graphics are included) but are, IMHO, sufficient.
2024-09-19 00:01:41 +02:00 · 2016-05-16 02:13:33 +02:00 · 2016-05-16 02:13:33 +02:00 · 24b0fa2a38
commit 24b0fa2a38
parent eb2a00b1d8
3 changed files with 76 additions and 18 deletions
--- a/source/net/yacy/cora/util/Html2Image.java
+++ b/source/net/yacy/cora/util/Html2Image.java
@ -20,6 +20,18 @@

 package net.yacy.cora.util;

+import java.awt.Container;
+import java.awt.Dimension;
+import java.awt.Graphics;
+import java.awt.Image;
+import java.awt.MediaTracker;
+import java.awt.image.BufferedImage;
+import java.beans.PropertyChangeEvent;
+import java.beans.PropertyChangeListener;
+import java.io.File;
+import java.io.IOException;
+import java.util.List;
+
 import javax.imageio.ImageIO;
 import javax.swing.JEditorPane;
 import javax.swing.text.Document;
@ -34,18 +46,13 @@ import net.yacy.document.ImageParser;
 import net.yacy.kelondro.util.FileUtils;
 import net.yacy.kelondro.util.OS;

-import java.awt.Container;
-import java.awt.Dimension;
-import java.awt.Graphics;
-import java.awt.Image;
-import java.awt.MediaTracker;
-import java.awt.image.BufferedImage;
-import java.beans.PropertyChangeEvent;
-import java.beans.PropertyChangeListener;
-import java.io.File;
-import java.io.IOException;
-import java.util.List;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDPage;

+/**
+ * Converts html to an on-disk copy in another file format
+ * (currently pdf and/or jpg)
+ */
 public class Html2Image {
    
    // Mac
@ -132,18 +139,32 @@ public class Html2Image {
    }
    
    /**
-     * convert a pdf to an image. proper values are i.e. width = 1024, height = 1024, density = 300, quality = 75
-     * @param pdf
-     * @param image
+     * convert a pdf (first page) to an image. proper values are e.g. width = 1024, height = 1024, density = 300, quality = 75
+     * using internal pdf library or external command line tool on linux or mac
+     * @param pdf input pdf file
+     * @param image output jpg file
     * @param width
     * @param height
-     * @param density
+     * @param density (dpi)
     * @param quality
     * @return
     */
    public static boolean pdf2image(File pdf, File image, int width, int height, int density, int quality) {
        final File convert = convertMac1.exists() ? convertMac1 : convertMac2.exists() ? convertMac2 : convertDebian;
-        
+
+        // convert pdf to jpg using internal pdfbox capability
+        if (OS.isWindows || !convert.exists()) {
+            try {
+                PDDocument pdoc = PDDocument.load(pdf);
+                PDPage page = (PDPage) pdoc.getDocumentCatalog().getAllPages().get(0);
+                BufferedImage bi = page.convertToImage(BufferedImage.TYPE_INT_RGB, density);
+
+                return ImageIO.write(bi, "jpg", image);
+
+            } catch (IOException ex) { }
+        }
+
+        // convert on mac or linux using external command line utility
        try {
            // i.e. convert -density 300 -trim yacy.pdf[0] -trim -resize 1024x -crop x1024+0+0 -quality 75% yacy-convert-300.jpg
            // note: both -trim are necessary, otherwise it is trimmed only on one side. The [0] selects the first page of the pdf
--- a/source/net/yacy/document/parser/pdfParser.java
+++ b/source/net/yacy/document/parser/pdfParser.java
@ -41,6 +41,7 @@ import java.util.HashSet;
 import java.util.List;

 import org.apache.pdfbox.exceptions.CryptographyException;
+import org.apache.pdfbox.pdfparser.PDFParser;
 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.pdmodel.PDDocumentInformation;
 import org.apache.pdfbox.pdmodel.PDPage;
@ -65,7 +66,6 @@ import net.yacy.document.VocabularyScraper;
 import net.yacy.kelondro.io.CharBuffer;
 import net.yacy.kelondro.util.FileUtils;
 import net.yacy.kelondro.util.MemoryControl;
-import org.apache.pdfbox.pdfparser.PDFParser;


 public class pdfParser extends AbstractParser implements Parser {
@ -204,7 +204,7 @@ public class pdfParser extends AbstractParser implements Parser {
                            docPublisher,
                            null,
                            null,
-                            0.0f, 0.0f,
+                            0.0d, 0.0d,
                            pages == null || page > pages.length ? new byte[0] : UTF8.getBytes(pages[page]),
                            pdflinks == null || page >= pdflinks.length ? null : pdflinks[page],
                            null,
--- a/test/java/net/yacy/cora/util/Html2ImageTest.java
+++ b/test/java/net/yacy/cora/util/Html2ImageTest.java
@ -0,0 +1,37 @@
+package net.yacy.cora.util;
+
+import java.io.File;
+import java.io.FilenameFilter;
+import java.util.ArrayList;
+import java.util.List;
+import net.yacy.utils.translation.ExtensionsFileFilter;
+import org.junit.Test;
+import static org.junit.Assert.*;
+
+
+public class Html2ImageTest {
+
+    /**
+     * Calls Html2Image.pdf2image for each file listed from test/parsertest and asserts that the jpg output exists.
+     */
+    @Test
+    public void testPdf2image() {
+        // list the file names in the test directory that pass the "pdf" extension filter
+        File pd = new File("test/parsertest");
+        List<String> extensions = new ArrayList();
+        extensions.add("pdf");
+        FilenameFilter fileFilter = new ExtensionsFileFilter(extensions);
+        String[] pdffiles = pd.list(fileFilter);
+
+        for (String pdffilename : pdffiles) {
+            File pdffile = new File(pd, pdffilename);
+            File jpgfile = new File("test/DATA", pdffilename + ".jpg"); // output is named <pdf name>.jpg
+            if (jpgfile.exists()) {
+                jpgfile.delete(); // remove output of an earlier run so the exists() check below reflects this run
+            }
+            Html2Image.pdf2image(pdffile, jpgfile, 1024, 1024, 300, 75); // width, height, density, quality
+            assertTrue(jpgfile.exists());
+        }
+    }
+
+}