mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
Added a configurable timeout to wkhtmltopdf calls for pdf snapshots
Necessary to prevent blocking the indexing workflow when some wkhtmltopdf renderings fail without terminating
This commit is contained in:
parent
7c7b38cb5a
commit
08ea0b0397
|
@ -858,6 +858,10 @@ crawler.latencyFactor = 0.5
|
|||
# defined here
|
||||
crawler.onDemandLimit = 1000
|
||||
|
||||
# The maximum time in seconds to wait for each wkhtmltopdf call when rendering PDF snapshots
|
||||
# Beyond that limit the process is killed
|
||||
snapshots.wkhtmltopdf.timeout = 30
|
||||
|
||||
# maximum size of indexing queue
|
||||
indexer.slots = 100
|
||||
|
||||
|
|
|
@ -193,21 +193,30 @@ public class Html2Image {
|
|||
return available;
|
||||
}
|
||||
|
||||
/**
|
||||
* write a pdf of a web page
|
||||
* @param url
|
||||
* @param proxy must be of the form http://host:port; use YaCy here as proxy which is mostly http://localhost:8090
|
||||
* @param destination
|
||||
* @return
|
||||
*/
|
||||
public static boolean writeWkhtmltopdf(String url, String proxy, String userAgent, final String acceptLanguage, File destination) {
|
||||
/**
|
||||
* Run the wkhtmltopdf external tool to fetch and render to PDF a web resource.
|
||||
* wkhtmltopdf may be called multiple times with various parameter flavors in
|
||||
* case of failure.
|
||||
*
|
||||
* @param url the URL of a web resource to fetch, render and convert to
|
||||
* a pdf file. Must not be null.
|
||||
* @param proxy the optional proxy address to use. Can be null. Must be of
|
||||
* the form http://host:port; use YaCy here as proxy which is
|
||||
* mostly http://localhost:8090
|
||||
* @param destination the destination PDF file that should be written. Must not
|
||||
* be null.
|
||||
* @param maxSeconds the maximum time in seconds to wait for each wkhtmltopdf
|
||||
* call termination. Beyond this limit the process is killed.
|
||||
* @return true when the destination file was successfully written
|
||||
*/
|
||||
public static boolean writeWkhtmltopdf(String url, String proxy, String userAgent, final String acceptLanguage, final File destination, final long maxSeconds) {
|
||||
boolean success = false;
|
||||
for (boolean ignoreErrors: new boolean[]{false, true}) {
|
||||
success = writeWkhtmltopdfInternal(url, proxy, destination, userAgent, acceptLanguage, ignoreErrors);
|
||||
success = writeWkhtmltopdfInternal(url, proxy, destination, userAgent, acceptLanguage, ignoreErrors, maxSeconds);
|
||||
if (success) break;
|
||||
if (!success && proxy != null) {
|
||||
ConcurrentLog.warn("Html2Image", "trying to load without proxy: " + url);
|
||||
success = writeWkhtmltopdfInternal(url, null, destination, userAgent, acceptLanguage, ignoreErrors);
|
||||
success = writeWkhtmltopdfInternal(url, null, destination, userAgent, acceptLanguage, ignoreErrors, maxSeconds);
|
||||
if (success) break;
|
||||
}
|
||||
}
|
||||
|
@ -219,7 +228,23 @@ public class Html2Image {
|
|||
return success;
|
||||
}
|
||||
|
||||
private static boolean writeWkhtmltopdfInternal(final String url, final String proxy, final File destination, final String userAgent, final String acceptLanguage, final boolean ignoreErrors) {
|
||||
/**
|
||||
* Run wkhtmltopdf in a separate process to fetch and render to PDF a web
|
||||
* resource.
|
||||
*
|
||||
* @param url the URL of a web resource to fetch, render and convert to
|
||||
* a pdf file. Must not be null.
|
||||
* @param proxy the optional proxy address to use. Can be null.
|
||||
* @param destination the destination PDF file that should be written. Must not
|
||||
* be null.
|
||||
* @param ignoreErrors when true wkhtmltopdf is instructed to ignore load errors
|
||||
* @param maxSeconds the maximum time in seconds to wait for the wkhtmltopdf
|
||||
* dedicated process termination. Beyond this limit the
|
||||
* process is killed.
|
||||
* @return true when the destination file was successfully written
|
||||
*/
|
||||
private static boolean writeWkhtmltopdfInternal(final String url, final String proxy, final File destination,
|
||||
final String userAgent, final String acceptLanguage, final boolean ignoreErrors, final long maxSeconds) {
|
||||
final String wkhtmltopdfCmd;
|
||||
final File wkhtmltopdf = wkhtmltopdfExecutable();
|
||||
if(wkhtmltopdf != null) {
|
||||
|
@ -241,26 +266,57 @@ public class Html2Image {
|
|||
url + " " + destination.getAbsolutePath();
|
||||
try {
|
||||
ConcurrentLog.info("Html2Pdf", "creating pdf from url " + url + " with command: " + commandline);
|
||||
List<String> message;
|
||||
if (!usexvfb) {
|
||||
message = OS.execSynchronous(commandline);
|
||||
if (destination.exists()) return true;
|
||||
ConcurrentLog.warn("Html2Image", "failed to create pdf " + (proxy == null ? "" : "using proxy " + proxy) + " with command: " + commandline);
|
||||
for (String m: message) ConcurrentLog.warn("Html2Image", ">> " + m);
|
||||
if (!usexvfb && execWkhtmlToPdf(proxy, destination, commandline, maxSeconds)) {
|
||||
return true;
|
||||
}
|
||||
// if this fails, we should try to wrap the X server with a virtual screen using xvfb, this works on headless servers
|
||||
commandline = "xvfb-run -a " + commandline;
|
||||
message = OS.execSynchronous(commandline);
|
||||
if (destination.exists()) {usexvfb = true; return true;}
|
||||
ConcurrentLog.warn("Html2Pdf", "failed to create pdf " + (proxy == null ? "" : "using proxy " + proxy) + " and xvfb with command: " + commandline);
|
||||
for (String m: message) ConcurrentLog.warn("Html2Image", ">> " + m);
|
||||
return false;
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
ConcurrentLog.warn("Html2Pdf", "exception while creation of pdf with command: " + commandline);
|
||||
return execWkhtmlToPdf(proxy, destination, commandline, maxSeconds);
|
||||
} catch (final IOException e) {
|
||||
ConcurrentLog.warn("Html2Pdf", "exception while creation of pdf with command: " + commandline, e);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Run a wkhtmltopdf commandline in a separate process.
|
||||
*
|
||||
* @param proxy the optional proxy address to use. Can be null.
|
||||
* @param destination the destination PDF file that should be written. Must not
|
||||
* be null.
|
||||
* @param commandline the wkhtmltopdf command line to execute. Must not be null.
|
||||
* @param maxSeconds the maximum time in seconds to wait for the process
|
||||
* termination. Beyond this limit the process is killed.
|
||||
* @return true when the destination file was successfully written
|
||||
* @throws IOException when an unexpected error occurred
|
||||
*/
|
||||
private static boolean execWkhtmlToPdf(final String proxy, final File destination, final String commandline, final long maxSeconds)
|
||||
throws IOException {
|
||||
final Process p = Runtime.getRuntime().exec(commandline);
|
||||
|
||||
try {
|
||||
p.waitFor(maxSeconds, TimeUnit.SECONDS);
|
||||
} catch (final InterruptedException e) {
|
||||
p.destroyForcibly();
|
||||
ConcurrentLog.warn("Html2Pdf", "Interrupted creation of pdf. Killing the process started with command : " + commandline);
|
||||
Thread.currentThread().interrupt(); // Keep the thread interrupted state
|
||||
return false;
|
||||
}
|
||||
if(p.isAlive()) {
|
||||
ConcurrentLog.warn("Html2Pdf", "Creation of pdf did not terminate within " + maxSeconds + " seconds. Killing the process started with command : " + commandline);
|
||||
p.destroyForcibly();
|
||||
return false;
|
||||
}
|
||||
if (p.exitValue() == 0 && destination.exists()) {
|
||||
return true;
|
||||
}
|
||||
final List<String> messages = OS.readStreams(p);
|
||||
ConcurrentLog.warn("Html2Image", "failed to create pdf " + (proxy == null ? "" : "using proxy " + proxy) + " with command : " + commandline);
|
||||
for (final String message : messages) {
|
||||
ConcurrentLog.warn("Html2Image", ">> " + message);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert a pdf (first page) to an image. Proper values are i.e. width = 1024, height = 1024, density = 300, quality = 75
|
||||
|
@ -459,7 +515,7 @@ public class Html2Image {
|
|||
return;
|
||||
}
|
||||
if(Html2Image.writeWkhtmltopdf(args[0], null, ClientIdentification.yacyInternetCrawlerAgent.userAgent,
|
||||
"en-us,en;q=0.5", targetPdfFile)) {
|
||||
"en-us,en;q=0.5", targetPdfFile, 30)) {
|
||||
if(targetPath.endsWith(".jpg") || targetPath.endsWith(".png")) {
|
||||
if(Html2Image.pdf2image(targetPdfFile, new File(targetPath), 1024, 1024, 300, 75)) {
|
||||
ConcurrentLog.info("Html2Image", "wrote " + targetPath + " converted from " + targetPdfFile);
|
||||
|
|
|
@ -65,6 +65,9 @@ public class Transactions {
|
|||
private static ExecutorService executor = Executors.newCachedThreadPool();
|
||||
private static AtomicInteger executorRunning = new AtomicInteger(0);
|
||||
|
||||
/** the maximum time in seconds to wait for each wkhtmltopdf call when rendering PDF snapshots */
|
||||
private static long wkhtmltopdfTimeout = 30;
|
||||
|
||||
static {
|
||||
for (int i = 0; i < WHITESPACE.length; i++) WHITESPACE[i] = 32;
|
||||
}
|
||||
|
@ -77,13 +80,18 @@ public class Transactions {
|
|||
}
|
||||
}
|
||||
|
||||
public static void init(File dir) {
|
||||
/**
|
||||
* @param dir the parent directory of inventory and archive snapshots.
|
||||
* @param wkhtmltopdfSecondsTimeout the maximum time in seconds to wait for each wkhtmltopdf call when rendering PDF snapshots
|
||||
*/
|
||||
public static void init(final File dir, final long wkhtmltopdfSecondsTimeout) {
|
||||
transactionDir = dir;
|
||||
transactionDir.mkdirs();
|
||||
inventoryDir = new File(transactionDir, State.INVENTORY.dirname);
|
||||
inventory = new Snapshots(inventoryDir);
|
||||
archiveDir = new File(transactionDir, State.ARCHIVE.dirname);
|
||||
archive = new Snapshots(archiveDir);
|
||||
wkhtmltopdfTimeout = wkhtmltopdfSecondsTimeout;
|
||||
}
|
||||
|
||||
public static synchronized void migrateIPV6Snapshots() {
|
||||
|
@ -228,7 +236,7 @@ public class Transactions {
|
|||
public void run() {
|
||||
executorRunning.incrementAndGet();
|
||||
try {
|
||||
Html2Image.writeWkhtmltopdf(urls, proxy, ClientIdentification.browserAgent.userAgent, acceptLanguage, pdfPath);
|
||||
Html2Image.writeWkhtmltopdf(urls, proxy, ClientIdentification.browserAgent.userAgent, acceptLanguage, pdfPath, wkhtmltopdfTimeout);
|
||||
} catch (Throwable e) {} finally {
|
||||
executorRunning.decrementAndGet();
|
||||
}
|
||||
|
@ -236,7 +244,7 @@ public class Transactions {
|
|||
};
|
||||
executor.execute(t);
|
||||
} else {
|
||||
success = Html2Image.writeWkhtmltopdf(urls, proxy, ClientIdentification.browserAgent.userAgent, acceptLanguage, pdfPath);
|
||||
success = Html2Image.writeWkhtmltopdf(urls, proxy, ClientIdentification.browserAgent.userAgent, acceptLanguage, pdfPath, wkhtmltopdfTimeout);
|
||||
}
|
||||
|
||||
return success;
|
||||
|
|
|
@ -166,19 +166,29 @@ public final class OS {
|
|||
// runs a unix/linux command and returns output as Vector of Strings
|
||||
// this method blocks until the command is executed
|
||||
final Process p = Runtime.getRuntime().exec(command);
|
||||
return execSynchronousProcess(p);
|
||||
return readStreams(p);
|
||||
}
|
||||
|
||||
public static List<String> execSynchronous(final String[] command) throws IOException {
|
||||
// runs a unix/linux command and returns output as Vector of Strings
|
||||
// this method blocks until the command is executed
|
||||
final Process p = Runtime.getRuntime().exec(command);
|
||||
return execSynchronousProcess(p);
|
||||
return readStreams(p);
|
||||
}
|
||||
|
||||
private static List<String> execSynchronousProcess(Process p) throws IOException {
|
||||
/**
|
||||
* Read all lines from both standard and error output from the given process
|
||||
* @param p a process
|
||||
* @return all the lines from the process standard and error output
|
||||
* @throws IOException when an unexpected error occurred
|
||||
*/
|
||||
public static List<String> readStreams(final Process p) throws IOException {
|
||||
String line;
|
||||
final List<String> output = new ArrayList<String>();
|
||||
final List<String> output = new ArrayList<>();
|
||||
|
||||
if(p == null) {
|
||||
return output;
|
||||
}
|
||||
|
||||
try (final InputStreamReader streamReader = new InputStreamReader(p.getInputStream());
|
||||
final BufferedReader in = new BufferedReader(streamReader);) {
|
||||
|
|
|
@ -770,7 +770,8 @@ public final class Switchboard extends serverSwitch {
|
|||
getConfigInt(SwitchboardConstants.HTCACHE_COMPRESSION_LEVEL,
|
||||
SwitchboardConstants.HTCACHE_COMPRESSION_LEVEL_DEFAULT));
|
||||
final File transactiondir = new File(this.htCachePath, "snapshots");
|
||||
Transactions.init(transactiondir);
|
||||
Transactions.init(transactiondir, getConfigLong(SwitchboardConstants.SNAPSHOTS_WKHTMLTOPDF_TIMEOUT,
|
||||
SwitchboardConstants.SNAPSHOTS_WKHTMLTOPDF_TIMEOUT_DEFAULT));
|
||||
|
||||
// create the surrogates directories
|
||||
this.surrogatesInPath =
|
||||
|
|
|
@ -329,10 +329,10 @@ public final class SwitchboardConstants {
|
|||
/** Default value controlling whether a self-signed certificate is acceptable from a remote Solr instance with authentication credentials. */
|
||||
public static final boolean FEDERATED_SERVICE_SOLR_INDEXING_AUTHENTICATED_ALLOW_SELF_SIGNED_DEFAULT = false;
|
||||
|
||||
/** Key of the setting controlling wheter to use or not an embedded Solr instance */
|
||||
/** Key of the setting controlling whether to use or not an embedded Solr instance */
|
||||
public static final String CORE_SERVICE_FULLTEXT = "core.service.fulltext";
|
||||
|
||||
/** Default setting value controlling wheter to use or not an embedded Solr instance */
|
||||
/** Default setting value controlling whether to use or not an embedded Solr instance */
|
||||
public static final boolean CORE_SERVICE_FULLTEXT_DEFAULT = true;
|
||||
|
||||
public static final String CORE_SERVICE_RWI = "core.service.rwi.tmp";
|
||||
|
@ -354,6 +354,12 @@ public final class SwitchboardConstants {
|
|||
public static final String CRAWLER_USER_AGENT_MINIMUMDELTA = "crawler.userAgent.minimumdelta";
|
||||
public static final String CRAWLER_USER_AGENT_CLIENTTIMEOUT = "crawler.userAgent.clienttimeout";
|
||||
|
||||
/** Key of the setting controlling the maximum time in seconds to wait for each wkhtmltopdf call when rendering PDF snapshots */
|
||||
public static final String SNAPSHOTS_WKHTMLTOPDF_TIMEOUT = "snapshots.wkhtmltopdf.timeout";
|
||||
|
||||
/** Default maximum time in seconds to wait for each wkhtmltopdf call when rendering PDF snapshots */
|
||||
public static final long SNAPSHOTS_WKHTMLTOPDF_TIMEOUT_DEFAULT = 30;
|
||||
|
||||
/* --- debug flags --- */
|
||||
|
||||
/** when set to true : do not use the local dht/rwi index (which is not done if we do remote searches) */
|
||||
|
|
Loading…
Reference in New Issue
Block a user