If loading from the cache fails, load with wkhtmltopdf without the

cache, using the user agent string given in the crawl profile.
This commit is contained in:
Michael Peter Christen 2014-12-02 13:35:19 +01:00
parent d5bac64421
commit e586e423aa
3 changed files with 13 additions and 8 deletions

View File

@ -71,17 +71,22 @@ public class Html2Image {
* @param destination * @param destination
* @return * @return
*/ */
public static boolean writeWkhtmltopdf(String url, String proxy, File destination) { public static boolean writeWkhtmltopdf(String url, String proxy, String userAgent, File destination) {
boolean success = writeWkhtmltopdfInternal(url, proxy, destination); boolean success = writeWkhtmltopdfInternal(url, proxy, destination, null, false);
if (success) return true; if (success) return true;
if (proxy == null) return false; if (proxy == null) return false;
ConcurrentLog.warn("Html2Image", "trying to load without proxy: " + url); ConcurrentLog.warn("Html2Image", "trying to load without proxy: " + url);
return writeWkhtmltopdfInternal(url, null, destination); return writeWkhtmltopdfInternal(url, null, destination, userAgent, true);
} }
private static boolean writeWkhtmltopdfInternal(String url, String proxy, File destination) { private static boolean writeWkhtmltopdfInternal(String url, String proxy, File destination, String userAgent, boolean ignoreErrors) {
final File wkhtmltopdf = wkhtmltopdfMac.exists() ? wkhtmltopdfMac : wkhtmltopdfDebian; final File wkhtmltopdf = wkhtmltopdfMac.exists() ? wkhtmltopdfMac : wkhtmltopdfDebian;
String commandline = wkhtmltopdf.getAbsolutePath() + " -q --title " + url + (proxy == null ? " " : " --proxy " + proxy + " ") + (OS.isMacArchitecture ? "--load-error-handling ignore " : "--ignore-load-errors ") + url + " " + destination.getAbsolutePath(); String commandline =
wkhtmltopdf.getAbsolutePath() + " -q --title " + url +
(userAgent == null ? "" : "--custom-header 'User-Agent' '" + userAgent + "' --custom-header-propagation") +
(proxy == null ? " " : " --proxy " + proxy + " ") +
(ignoreErrors ? (OS.isMacArchitecture ? "--load-error-handling ignore " : "--ignore-load-errors ") : "") +
url + " " + destination.getAbsolutePath();
try { try {
List<String> message; List<String> message;
if (!usexvfb) { if (!usexvfb) {

View File

@ -70,14 +70,14 @@ public class Snapshots {
* @param proxy - a string of the form 'http://<host>:<port> * @param proxy - a string of the form 'http://<host>:<port>
* @return * @return
*/ */
public File downloadPDFSnapshot(final DigestURL url, final int depth, final Date date, boolean replaceOld, String proxy) { public File downloadPDFSnapshot(final DigestURL url, final int depth, final Date date, boolean replaceOld, String proxy, String userAgent) {
Collection<File> oldPaths = findPaths(url, depth); Collection<File> oldPaths = findPaths(url, depth);
if (replaceOld) { if (replaceOld) {
for (File oldPath: oldPaths) oldPath.delete(); for (File oldPath: oldPaths) oldPath.delete();
} }
File path = definePath(url, "pdf", depth, date); File path = definePath(url, "pdf", depth, date);
path.getParentFile().mkdirs(); path.getParentFile().mkdirs();
boolean success = Html2Image.writeWkhtmltopdf(url.toNormalform(true), proxy, path); boolean success = Html2Image.writeWkhtmltopdf(url.toNormalform(true), proxy, userAgent, path);
return success ? path : null; return success ? path : null;
} }

View File

@ -217,7 +217,7 @@ public final class LoaderDispatcher {
String ext = MultiProtocolURL.getFileExtension(file).toLowerCase(); String ext = MultiProtocolURL.getFileExtension(file).toLowerCase();
boolean extok = ext.length() == 0 || file.length() <= 1 || htmlParser.htmlExtensionsSet.contains(ext); boolean extok = ext.length() == 0 || file.length() <= 1 || htmlParser.htmlExtensionsSet.contains(ext);
if (depthok && extok) { if (depthok && extok) {
File snapshotFile = sb.snapshots.downloadPDFSnapshot(request.url(), request.depth(), new Date(), crawlProfile.snapshotReplaceold(), sb.getConfigBool("isTransparentProxy", false) ? "http://127.0.0.1:" + sb.getConfigInt("port", 8090) : null); File snapshotFile = sb.snapshots.downloadPDFSnapshot(request.url(), request.depth(), new Date(), crawlProfile.snapshotReplaceold(), sb.getConfigBool("isTransparentProxy", false) ? "http://127.0.0.1:" + sb.getConfigInt("port", 8090) : null, agent.userAgent);
log.info("SNAPSHOT - " + (snapshotFile == null ? "could not generate snapshot for " + request.url().toNormalform(true) : "wrote " + snapshotFile + " for " + request.url().toNormalform(true))); log.info("SNAPSHOT - " + (snapshotFile == null ? "could not generate snapshot for " + request.url().toNormalform(true) : "wrote " + snapshotFile + " for " + request.url().toNormalform(true)));
} else { } else {
//if (!depthok) log.warn("SNAPSHOT: depth not ok, " + (crawlProfile == null ? "profile = null" : "entry.depth() = " + request.depth() + ", profile.snapshotMaxdepth() = " + crawlProfile.snapshotMaxdepth())); //if (!depthok) log.warn("SNAPSHOT: depth not ok, " + (crawlProfile == null ? "profile = null" : "entry.depth() = " + request.depth() + ", profile.snapshotMaxdepth() = " + crawlProfile.snapshotMaxdepth()));