mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
Extend snapshot Html2Image.pdf2image to use the PDFBox image export capability
if no external tool is installed (and on Windows). The resulting JPGs are not always perfect (if graphics are included) but imho sufficient.
This commit is contained in:
parent
eb2a00b1d8
commit
24b0fa2a38
|
@ -20,6 +20,18 @@
|
|||
|
||||
package net.yacy.cora.util;
|
||||
|
||||
import java.awt.Container;
|
||||
import java.awt.Dimension;
|
||||
import java.awt.Graphics;
|
||||
import java.awt.Image;
|
||||
import java.awt.MediaTracker;
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.beans.PropertyChangeEvent;
|
||||
import java.beans.PropertyChangeListener;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
|
||||
import javax.imageio.ImageIO;
|
||||
import javax.swing.JEditorPane;
|
||||
import javax.swing.text.Document;
|
||||
|
@ -34,18 +46,13 @@ import net.yacy.document.ImageParser;
|
|||
import net.yacy.kelondro.util.FileUtils;
|
||||
import net.yacy.kelondro.util.OS;
|
||||
|
||||
import java.awt.Container;
|
||||
import java.awt.Dimension;
|
||||
import java.awt.Graphics;
|
||||
import java.awt.Image;
|
||||
import java.awt.MediaTracker;
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.beans.PropertyChangeEvent;
|
||||
import java.beans.PropertyChangeListener;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
|
||||
/**
|
||||
* Convert html to a copy on disk-image in another file format
|
||||
* currently (pdf and/or jpg)
|
||||
*/
|
||||
public class Html2Image {
|
||||
|
||||
// Mac
|
||||
|
@ -132,18 +139,32 @@ public class Html2Image {
|
|||
}
|
||||
|
||||
/**
|
||||
* convert a pdf to an image. proper values are i.e. width = 1024, height = 1024, density = 300, quality = 75
|
||||
* @param pdf
|
||||
* @param image
|
||||
* convert a pdf (first page) to an image. proper values are e.g. width = 1024, height = 1024, density = 300, quality = 75
|
||||
* using internal pdf library or external command line tool on linux or mac
|
||||
* @param pdf input pdf file
|
||||
* @param image output jpg file
|
||||
* @param width
|
||||
* @param height
|
||||
* @param density
|
||||
* @param density (dpi)
|
||||
* @param quality
|
||||
* @return
|
||||
*/
|
||||
public static boolean pdf2image(File pdf, File image, int width, int height, int density, int quality) {
|
||||
final File convert = convertMac1.exists() ? convertMac1 : convertMac2.exists() ? convertMac2 : convertDebian;
|
||||
|
||||
|
||||
// convert pdf to jpg using internal pdfbox capability
|
||||
if (OS.isWindows || !convert.exists()) {
|
||||
try {
|
||||
PDDocument pdoc = PDDocument.load(pdf);
|
||||
PDPage page = (PDPage) pdoc.getDocumentCatalog().getAllPages().get(0);
|
||||
BufferedImage bi = page.convertToImage(BufferedImage.TYPE_INT_RGB, density);
|
||||
|
||||
return ImageIO.write(bi, "jpg", image);
|
||||
|
||||
} catch (IOException ex) { }
|
||||
}
|
||||
|
||||
// convert on mac or linux using external command line utility
|
||||
try {
|
||||
// i.e. convert -density 300 -trim yacy.pdf[0] -trim -resize 1024x -crop x1024+0+0 -quality 75% yacy-convert-300.jpg
|
||||
// note: both -trim are necessary, otherwise it is trimmed only on one side. The [0] selects the first page of the pdf
|
||||
|
|
|
@ -41,6 +41,7 @@ import java.util.HashSet;
|
|||
import java.util.List;
|
||||
|
||||
import org.apache.pdfbox.exceptions.CryptographyException;
|
||||
import org.apache.pdfbox.pdfparser.PDFParser;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
|
@ -65,7 +66,6 @@ import net.yacy.document.VocabularyScraper;
|
|||
import net.yacy.kelondro.io.CharBuffer;
|
||||
import net.yacy.kelondro.util.FileUtils;
|
||||
import net.yacy.kelondro.util.MemoryControl;
|
||||
import org.apache.pdfbox.pdfparser.PDFParser;
|
||||
|
||||
|
||||
public class pdfParser extends AbstractParser implements Parser {
|
||||
|
@ -204,7 +204,7 @@ public class pdfParser extends AbstractParser implements Parser {
|
|||
docPublisher,
|
||||
null,
|
||||
null,
|
||||
0.0f, 0.0f,
|
||||
0.0d, 0.0d,
|
||||
pages == null || page > pages.length ? new byte[0] : UTF8.getBytes(pages[page]),
|
||||
pdflinks == null || page >= pdflinks.length ? null : pdflinks[page],
|
||||
null,
|
||||
|
|
37
test/java/net/yacy/cora/util/Html2ImageTest.java
Normal file
37
test/java/net/yacy/cora/util/Html2ImageTest.java
Normal file
|
@ -0,0 +1,37 @@
|
|||
package net.yacy.cora.util;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FilenameFilter;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import net.yacy.utils.translation.ExtensionsFileFilter;
|
||||
import org.junit.Test;
|
||||
import static org.junit.Assert.*;
|
||||
|
||||
|
||||
public class Html2ImageTest {
|
||||
|
||||
/**
|
||||
* Test of pdf2image method, of class Html2Image.
|
||||
*/
|
||||
@Test
|
||||
public void testPdf2image() {
|
||||
// collect pdf filenames in test directory
|
||||
File pd = new File("test/parsertest");
|
||||
List<String> extensions = new ArrayList();
|
||||
extensions.add("pdf");
|
||||
FilenameFilter fileFilter = new ExtensionsFileFilter(extensions);
|
||||
String[] pdffiles = pd.list(fileFilter);
|
||||
|
||||
for (String pdffilename : pdffiles) {
|
||||
File pdffile = new File(pd, pdffilename);
|
||||
File jpgfile = new File("test/DATA", pdffilename + ".jpg");
|
||||
if (jpgfile.exists()) {
|
||||
jpgfile.delete();
|
||||
}
|
||||
Html2Image.pdf2image(pdffile, jpgfile, 1024, 1024, 300, 75);
|
||||
assertTrue(jpgfile.exists());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue
Block a user