extend snapshot Html2Image.pdf2image to use PDFBox image export capability

if no external tool is installed (and on Windows).
The resulting JPGs are not always perfect (if graphics are included) but are, IMHO, sufficient.
This commit is contained in:
reger 2016-05-16 02:13:33 +02:00
parent eb2a00b1d8
commit 24b0fa2a38
3 changed files with 76 additions and 18 deletions

View File

@ -20,6 +20,18 @@
package net.yacy.cora.util;
import java.awt.Container;
import java.awt.Dimension;
import java.awt.Graphics;
import java.awt.Image;
import java.awt.MediaTracker;
import java.awt.image.BufferedImage;
import java.beans.PropertyChangeEvent;
import java.beans.PropertyChangeListener;
import java.io.File;
import java.io.IOException;
import java.util.List;
import javax.imageio.ImageIO;
import javax.swing.JEditorPane;
import javax.swing.text.Document;
@ -34,18 +46,13 @@ import net.yacy.document.ImageParser;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.OS;
import java.awt.Container;
import java.awt.Dimension;
import java.awt.Graphics;
import java.awt.Image;
import java.awt.MediaTracker;
import java.awt.image.BufferedImage;
import java.beans.PropertyChangeEvent;
import java.beans.PropertyChangeListener;
import java.io.File;
import java.io.IOException;
import java.util.List;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
/**
* Convert html to a copy on disk (an image) in another file format
* (currently pdf and/or jpg)
*/
public class Html2Image {
// Mac
@ -132,18 +139,32 @@ public class Html2Image {
}
/**
* convert a pdf to an image. proper values are i.e. width = 1024, height = 1024, density = 300, quality = 75
* @param pdf
* @param image
* convert a pdf (first page) to an image. proper values are e.g. width = 1024, height = 1024, density = 300, quality = 75
* using the internal pdf library, or an external command line tool on linux or mac
* @param pdf input pdf file
* @param image output jpg file
* @param width
* @param height
* @param density
* @param density (dpi)
* @param quality
* @return
*/
public static boolean pdf2image(File pdf, File image, int width, int height, int density, int quality) {
final File convert = convertMac1.exists() ? convertMac1 : convertMac2.exists() ? convertMac2 : convertDebian;
// convert pdf to jpg using internal pdfbox capability
if (OS.isWindows || !convert.exists()) {
try {
PDDocument pdoc = PDDocument.load(pdf);
PDPage page = (PDPage) pdoc.getDocumentCatalog().getAllPages().get(0);
BufferedImage bi = page.convertToImage(BufferedImage.TYPE_INT_RGB, density);
return ImageIO.write(bi, "jpg", image);
} catch (IOException ex) { }
}
// convert on mac or linux using external command line utility
try {
// i.e. convert -density 300 -trim yacy.pdf[0] -trim -resize 1024x -crop x1024+0+0 -quality 75% yacy-convert-300.jpg
// note: both -trim are necessary, otherwise it is trimmed only on one side. The [0] selects the first page of the pdf

View File

@ -41,6 +41,7 @@ import java.util.HashSet;
import java.util.List;
import org.apache.pdfbox.exceptions.CryptographyException;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.PDPage;
@ -65,7 +66,6 @@ import net.yacy.document.VocabularyScraper;
import net.yacy.kelondro.io.CharBuffer;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.MemoryControl;
import org.apache.pdfbox.pdfparser.PDFParser;
public class pdfParser extends AbstractParser implements Parser {
@ -204,7 +204,7 @@ public class pdfParser extends AbstractParser implements Parser {
docPublisher,
null,
null,
0.0f, 0.0f,
0.0d, 0.0d,
pages == null || page > pages.length ? new byte[0] : UTF8.getBytes(pages[page]),
pdflinks == null || page >= pdflinks.length ? null : pdflinks[page],
null,

View File

@ -0,0 +1,37 @@
package net.yacy.cora.util;
import java.io.File;
import java.io.FilenameFilter;
import java.util.ArrayList;
import java.util.List;
import net.yacy.utils.translation.ExtensionsFileFilter;
import org.junit.Test;
import static org.junit.Assert.*;
/**
 * Unit test for {@link Html2Image}.
 */
public class Html2ImageTest {

    /**
     * Test of pdf2image method, of class Html2Image.
     *
     * Every pdf file found in test/parsertest is converted to a jpg file in
     * test/DATA; the test passes when an output file exists for each input.
     */
    @Test
    public void testPdf2image() {
        // collect pdf filenames in test directory
        final File pd = new File("test/parsertest");
        final List<String> extensions = new ArrayList<String>();
        extensions.add("pdf");
        // presumably accepts file names ending in one of the given extensions - see ExtensionsFileFilter
        final FilenameFilter fileFilter = new ExtensionsFileFilter(extensions);
        final String[] pdffiles = pd.list(fileFilter);
        // File.list returns null if the directory is missing or unreadable;
        // fail with a clear message instead of a NullPointerException in the loop
        assertNotNull("cannot list test directory " + pd.getAbsolutePath(), pdffiles);

        // make sure the output directory exists, otherwise no image can be written
        final File outdir = new File("test/DATA");
        if (!outdir.isDirectory()) {
            assertTrue("cannot create output directory " + outdir.getAbsolutePath(),
                    outdir.mkdirs() || outdir.isDirectory());
        }

        for (final String pdffilename : pdffiles) {
            final File pdffile = new File(pd, pdffilename);
            final File jpgfile = new File(outdir, pdffilename + ".jpg");
            // remove the result of a previous run, so that the existence check
            // below proves that this run created the image
            if (jpgfile.exists()) {
                assertTrue("cannot delete stale output file " + jpgfile, jpgfile.delete());
            }

            final boolean converted = Html2Image.pdf2image(pdffile, jpgfile, 1024, 1024, 300, 75);

            assertTrue("no image created for " + pdffile + " (pdf2image returned " + converted + ")",
                    jpgfile.exists());
        }
    }
}