Added a configurable timeout to wkhtmltopdf calls for pdf snapshots

Necessary to prevent blocking the indexing workflow when some
wkhtmltopdf renderings fail without terminating
This commit is contained in:
luccioman 2018-12-11 22:31:31 +01:00
parent 7c7b38cb5a
commit 08ea0b0397
6 changed files with 121 additions and 36 deletions

View File

@ -858,6 +858,10 @@ crawler.latencyFactor = 0.5
# defined here
crawler.onDemandLimit = 1000
# The maximum time in seconds to wait for each wkhtmltopdf call when rendering PDF snapshots
# Beyond that limit the process is killed
snapshots.wkhtmltopdf.timeout = 30
# maximum size of indexing queue
indexer.slots = 100

View File

@ -193,21 +193,30 @@ public class Html2Image {
return available;
}
/**
* write a pdf of a web page
* @param url
* @param proxy must be of the form http://host:port; use YaCy here as proxy which is mostly http://localhost:8090
* @param destination
* @return
*/
public static boolean writeWkhtmltopdf(String url, String proxy, String userAgent, final String acceptLanguage, File destination) {
/**
* Run the wkhtmltopdf external tool to fetch and render to PDF a web resource.
* wkhtmltopdf may be called several times with different parameter combinations
* in case of failure.
*
* @param url the URL of a web resource to fetch, render and convert to
* a pdf file. Must not be null.
* @param proxy the optional proxy address to use. Can be null. Must be of
* the form http://host:port; use YaCy here as proxy which is
* mostly http://localhost:8090
* @param destination the destination PDF file that should be written. Must not
* be null.
* @param maxSeconds the maximum time in seconds to wait for each wkhtmltopdf
* call termination. Beyond this limit the process is killed.
* @return true when the destination file was successfully written
*/
public static boolean writeWkhtmltopdf(String url, String proxy, String userAgent, final String acceptLanguage, final File destination, final long maxSeconds) {
boolean success = false;
for (boolean ignoreErrors: new boolean[]{false, true}) {
success = writeWkhtmltopdfInternal(url, proxy, destination, userAgent, acceptLanguage, ignoreErrors);
success = writeWkhtmltopdfInternal(url, proxy, destination, userAgent, acceptLanguage, ignoreErrors, maxSeconds);
if (success) break;
if (!success && proxy != null) {
ConcurrentLog.warn("Html2Image", "trying to load without proxy: " + url);
success = writeWkhtmltopdfInternal(url, null, destination, userAgent, acceptLanguage, ignoreErrors);
success = writeWkhtmltopdfInternal(url, null, destination, userAgent, acceptLanguage, ignoreErrors, maxSeconds);
if (success) break;
}
}
@ -219,7 +228,23 @@ public class Html2Image {
return success;
}
private static boolean writeWkhtmltopdfInternal(final String url, final String proxy, final File destination, final String userAgent, final String acceptLanguage, final boolean ignoreErrors) {
/**
* Run wkhtmltopdf in a separate process to fetch and render to PDF a web
* resource.
*
* @param url the URL of a web resource to fetch, render and convert to
* a pdf file. Must not be null.
* @param proxy the optional proxy address to use. Can be null.
* @param destination the destination PDF file that should be written. Must not
* be null.
* @param ignoreErrors when true wkhtmltopdf is instructed to ignore load errors
* @param maxSeconds the maximum time in seconds to wait for the wkhtmltopdf
* dedicated process termination. Beyond this limit the
* process is killed.
* @return true when the destination file was successfully written
*/
private static boolean writeWkhtmltopdfInternal(final String url, final String proxy, final File destination,
final String userAgent, final String acceptLanguage, final boolean ignoreErrors, final long maxSeconds) {
final String wkhtmltopdfCmd;
final File wkhtmltopdf = wkhtmltopdfExecutable();
if(wkhtmltopdf != null) {
@ -241,26 +266,57 @@ public class Html2Image {
url + " " + destination.getAbsolutePath();
try {
ConcurrentLog.info("Html2Pdf", "creating pdf from url " + url + " with command: " + commandline);
List<String> message;
if (!usexvfb) {
message = OS.execSynchronous(commandline);
if (destination.exists()) return true;
ConcurrentLog.warn("Html2Image", "failed to create pdf " + (proxy == null ? "" : "using proxy " + proxy) + " with command: " + commandline);
for (String m: message) ConcurrentLog.warn("Html2Image", ">> " + m);
if (!usexvfb && execWkhtmlToPdf(proxy, destination, commandline, maxSeconds)) {
return true;
}
// if this fails, we should try to wrap the X server with a virtual screen using xvfb, this works on headless servers
commandline = "xvfb-run -a " + commandline;
message = OS.execSynchronous(commandline);
if (destination.exists()) {usexvfb = true; return true;}
ConcurrentLog.warn("Html2Pdf", "failed to create pdf " + (proxy == null ? "" : "using proxy " + proxy) + " and xvfb with command: " + commandline);
for (String m: message) ConcurrentLog.warn("Html2Image", ">> " + m);
return false;
} catch (IOException e) {
e.printStackTrace();
ConcurrentLog.warn("Html2Pdf", "exception while creation of pdf with command: " + commandline);
return execWkhtmlToPdf(proxy, destination, commandline, maxSeconds);
} catch (final IOException e) {
ConcurrentLog.warn("Html2Pdf", "exception while creation of pdf with command: " + commandline, e);
return false;
}
}
/**
 * Run a wkhtmltopdf commandline in a separate process and wait for its
 * termination, for at most the given number of seconds.
 * <p>
 * The standard and error outputs of the process are merged and written to a
 * temporary file instead of being left in pipes. The output is only looked at
 * once the process has terminated, so with pipes a process producing more
 * output than the pipe buffer can hold would block on its writes until the
 * timeout is reached, and would then be killed even though the rendering
 * itself was fine.
 * </p>
 *
 * @param proxy       the optional proxy address that was used in the command
 *                    line. Can be null. Only used for log messages.
 * @param destination the destination PDF file that should be written. Must not
 *                    be null.
 * @param commandline the wkhtmltopdf command line to execute. Must not be null.
 * @param maxSeconds  the maximum time in seconds to wait for the process
 *                    termination. Beyond this limit the process is killed.
 * @return true when the process terminated with a zero exit value and the
 *         destination file exists
 * @throws IOException when the process could not be started, or when the
 *                     temporary output file could not be created or read
 */
private static boolean execWkhtmlToPdf(final String proxy, final File destination, final String commandline, final long maxSeconds)
        throws IOException {
    /* Split the command line on white spaces, the same way Runtime.exec(String) does */
    final java.util.StringTokenizer tokenizer = new java.util.StringTokenizer(commandline);
    final String[] command = new String[tokenizer.countTokens()];
    for (int i = 0; i < command.length; i++) {
        command[i] = tokenizer.nextToken();
    }

    final File processOutput = File.createTempFile("wkhtmltopdf", ".log");
    try {
        final Process p = new ProcessBuilder(command).redirectErrorStream(true).redirectOutput(processOutput).start();

        try {
            p.waitFor(maxSeconds, TimeUnit.SECONDS);
        } catch (final InterruptedException e) {
            p.destroyForcibly();
            ConcurrentLog.warn("Html2Pdf", "Interrupted creation of pdf. Killing the process started with command : " + commandline);
            Thread.currentThread().interrupt(); // Keep the thread interrupted state
            return false;
        }
        if (p.isAlive()) {
            ConcurrentLog.warn("Html2Pdf", "Creation of pdf did not terminate within " + maxSeconds + " seconds. Killing the process started with command : " + commandline);
            p.destroyForcibly();
            return false;
        }
        if (p.exitValue() == 0 && destination.exists()) {
            return true;
        }

        /* Failure : report what the process wrote to help diagnosing the problem */
        ConcurrentLog.warn("Html2Image", "failed to create pdf " + (proxy == null ? "" : "using proxy " + proxy) + " with command : " + commandline);
        try (final java.io.BufferedReader reader = new java.io.BufferedReader(new java.io.FileReader(processOutput))) {
            String message;
            while ((message = reader.readLine()) != null) {
                ConcurrentLog.warn("Html2Image", ">> " + message);
            }
        }
        return false;
    } finally {
        /* The deletion may fail while a just killed process still holds the file (notably on Windows) */
        if (!processOutput.delete()) {
            processOutput.deleteOnExit();
        }
    }
}
/**
* Convert a pdf (first page) to an image. Proper values are i.e. width = 1024, height = 1024, density = 300, quality = 75
@ -459,7 +515,7 @@ public class Html2Image {
return;
}
if(Html2Image.writeWkhtmltopdf(args[0], null, ClientIdentification.yacyInternetCrawlerAgent.userAgent,
"en-us,en;q=0.5", targetPdfFile)) {
"en-us,en;q=0.5", targetPdfFile, 30)) {
if(targetPath.endsWith(".jpg") || targetPath.endsWith(".png")) {
if(Html2Image.pdf2image(targetPdfFile, new File(targetPath), 1024, 1024, 300, 75)) {
ConcurrentLog.info("Html2Image", "wrote " + targetPath + " converted from " + targetPdfFile);

View File

@ -65,6 +65,9 @@ public class Transactions {
private static ExecutorService executor = Executors.newCachedThreadPool();
private static AtomicInteger executorRunning = new AtomicInteger(0);
/** the maximum time in seconds to wait for each wkhtmltopdf call when rendering PDF snapshots */
private static long wkhtmltopdfTimeout = 30;
static {
for (int i = 0; i < WHITESPACE.length; i++) WHITESPACE[i] = 32;
}
@ -77,13 +80,18 @@ public class Transactions {
}
}
public static void init(File dir) {
/**
* Set up the snapshots storage directories and the wkhtmltopdf timeout.
*
* @param dir the parent directory of inventory and archive snapshots. It is created when missing.
* @param wkhtmltopdfSecondsTimeout the maximum time in seconds to wait for each wkhtmltopdf call when rendering PDF snapshots
*/
public static void init(final File dir, final long wkhtmltopdfSecondsTimeout) {
transactionDir = dir;
transactionDir.mkdirs();
inventoryDir = new File(transactionDir, State.INVENTORY.dirname);
inventory = new Snapshots(inventoryDir);
archiveDir = new File(transactionDir, State.ARCHIVE.dirname);
archive = new Snapshots(archiveDir);
wkhtmltopdfTimeout = wkhtmltopdfSecondsTimeout;
}
public static synchronized void migrateIPV6Snapshots() {
@ -228,7 +236,7 @@ public class Transactions {
public void run() {
executorRunning.incrementAndGet();
try {
Html2Image.writeWkhtmltopdf(urls, proxy, ClientIdentification.browserAgent.userAgent, acceptLanguage, pdfPath);
Html2Image.writeWkhtmltopdf(urls, proxy, ClientIdentification.browserAgent.userAgent, acceptLanguage, pdfPath, wkhtmltopdfTimeout);
} catch (Throwable e) {} finally {
executorRunning.decrementAndGet();
}
@ -236,7 +244,7 @@ public class Transactions {
};
executor.execute(t);
} else {
success = Html2Image.writeWkhtmltopdf(urls, proxy, ClientIdentification.browserAgent.userAgent, acceptLanguage, pdfPath);
success = Html2Image.writeWkhtmltopdf(urls, proxy, ClientIdentification.browserAgent.userAgent, acceptLanguage, pdfPath, wkhtmltopdfTimeout);
}
return success;

View File

@ -166,19 +166,29 @@ public final class OS {
// runs a unix/linux command and returns output as Vector of Strings
// this method blocks until the command is executed
final Process p = Runtime.getRuntime().exec(command);
return execSynchronousProcess(p);
return readStreams(p);
}
public static List<String> execSynchronous(final String[] command) throws IOException {
// runs a unix/linux command and returns output as Vector of Strings
// this method blocks until the command is executed
final Process p = Runtime.getRuntime().exec(command);
return execSynchronousProcess(p);
return readStreams(p);
}
private static List<String> execSynchronousProcess(Process p) throws IOException {
/**
* Read all lines from both standard and error output from the given process
* @param p a process
* @return all the lines from the process standard and error output
* @throws IOException when an unexpected error occurred
*/
public static List<String> readStreams(final Process p) throws IOException {
String line;
final List<String> output = new ArrayList<String>();
final List<String> output = new ArrayList<>();
if(p == null) {
return output;
}
try (final InputStreamReader streamReader = new InputStreamReader(p.getInputStream());
final BufferedReader in = new BufferedReader(streamReader);) {

View File

@ -770,7 +770,8 @@ public final class Switchboard extends serverSwitch {
getConfigInt(SwitchboardConstants.HTCACHE_COMPRESSION_LEVEL,
SwitchboardConstants.HTCACHE_COMPRESSION_LEVEL_DEFAULT));
final File transactiondir = new File(this.htCachePath, "snapshots");
Transactions.init(transactiondir);
Transactions.init(transactiondir, getConfigLong(SwitchboardConstants.SNAPSHOTS_WKHTMLTOPDF_TIMEOUT,
SwitchboardConstants.SNAPSHOTS_WKHTMLTOPDF_TIMEOUT_DEFAULT));
// create the surrogates directories
this.surrogatesInPath =

View File

@ -329,10 +329,10 @@ public final class SwitchboardConstants {
/** Default value controlling whether a self-signed certificate is acceptable from a remote Solr instance with authentication credentials. */
public static final boolean FEDERATED_SERVICE_SOLR_INDEXING_AUTHENTICATED_ALLOW_SELF_SIGNED_DEFAULT = false;
/** Key of the setting controlling wheter to use or not an embedded Solr instance */
/** Key of the setting controlling whether to use or not an embedded Solr instance */
public static final String CORE_SERVICE_FULLTEXT = "core.service.fulltext";
/** Default setting value controlling wheter to use or not an embedded Solr instance */
/** Default setting value controlling whether to use or not an embedded Solr instance */
public static final boolean CORE_SERVICE_FULLTEXT_DEFAULT = true;
public static final String CORE_SERVICE_RWI = "core.service.rwi.tmp";
@ -354,6 +354,12 @@ public final class SwitchboardConstants {
public static final String CRAWLER_USER_AGENT_MINIMUMDELTA = "crawler.userAgent.minimumdelta";
public static final String CRAWLER_USER_AGENT_CLIENTTIMEOUT = "crawler.userAgent.clienttimeout";
/** Key of the setting controlling the maximum time in seconds to wait for each wkhtmltopdf call when rendering PDF snapshots */
public static final String SNAPSHOTS_WKHTMLTOPDF_TIMEOUT = "snapshots.wkhtmltopdf.timeout";
/** Default maximum time in seconds to wait for each wkhtmltopdf call when rendering PDF snapshots*/
public static final long SNAPSHOTS_WKHTMLTOPDF_TIMEOUT_DEFAULT = 30;
/* --- debug flags --- */
/** when set to true : do not use the local dht/rwi index (which is not done if we do remote searches) */