mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
If loading from the cache fails, load the page with wkhtmltopdf without the
cache, using the user agent string given in the crawl profile.
This commit is contained in:
parent
d5bac64421
commit
e586e423aa
|
@ -71,17 +71,22 @@ public class Html2Image {
|
||||||
* @param destination
|
* @param destination
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
public static boolean writeWkhtmltopdf(String url, String proxy, File destination) {
|
public static boolean writeWkhtmltopdf(String url, String proxy, String userAgent, File destination) {
|
||||||
boolean success = writeWkhtmltopdfInternal(url, proxy, destination);
|
boolean success = writeWkhtmltopdfInternal(url, proxy, destination, null, false);
|
||||||
if (success) return true;
|
if (success) return true;
|
||||||
if (proxy == null) return false;
|
if (proxy == null) return false;
|
||||||
ConcurrentLog.warn("Html2Image", "trying to load without proxy: " + url);
|
ConcurrentLog.warn("Html2Image", "trying to load without proxy: " + url);
|
||||||
return writeWkhtmltopdfInternal(url, null, destination);
|
return writeWkhtmltopdfInternal(url, null, destination, userAgent, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static boolean writeWkhtmltopdfInternal(String url, String proxy, File destination) {
|
private static boolean writeWkhtmltopdfInternal(String url, String proxy, File destination, String userAgent, boolean ignoreErrors) {
|
||||||
final File wkhtmltopdf = wkhtmltopdfMac.exists() ? wkhtmltopdfMac : wkhtmltopdfDebian;
|
final File wkhtmltopdf = wkhtmltopdfMac.exists() ? wkhtmltopdfMac : wkhtmltopdfDebian;
|
||||||
String commandline = wkhtmltopdf.getAbsolutePath() + " -q --title " + url + (proxy == null ? " " : " --proxy " + proxy + " ") + (OS.isMacArchitecture ? "--load-error-handling ignore " : "--ignore-load-errors ") + url + " " + destination.getAbsolutePath();
|
String commandline =
|
||||||
|
wkhtmltopdf.getAbsolutePath() + " -q --title " + url +
|
||||||
|
(userAgent == null ? "" : "--custom-header 'User-Agent' '" + userAgent + "' --custom-header-propagation") +
|
||||||
|
(proxy == null ? " " : " --proxy " + proxy + " ") +
|
||||||
|
(ignoreErrors ? (OS.isMacArchitecture ? "--load-error-handling ignore " : "--ignore-load-errors ") : "") +
|
||||||
|
url + " " + destination.getAbsolutePath();
|
||||||
try {
|
try {
|
||||||
List<String> message;
|
List<String> message;
|
||||||
if (!usexvfb) {
|
if (!usexvfb) {
|
||||||
|
|
|
@ -70,14 +70,14 @@ public class Snapshots {
|
||||||
* @param proxy - a string of the form 'http://<host>:<port>
|
* @param proxy - a string of the form 'http://<host>:<port>
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
public File downloadPDFSnapshot(final DigestURL url, final int depth, final Date date, boolean replaceOld, String proxy) {
|
public File downloadPDFSnapshot(final DigestURL url, final int depth, final Date date, boolean replaceOld, String proxy, String userAgent) {
|
||||||
Collection<File> oldPaths = findPaths(url, depth);
|
Collection<File> oldPaths = findPaths(url, depth);
|
||||||
if (replaceOld) {
|
if (replaceOld) {
|
||||||
for (File oldPath: oldPaths) oldPath.delete();
|
for (File oldPath: oldPaths) oldPath.delete();
|
||||||
}
|
}
|
||||||
File path = definePath(url, "pdf", depth, date);
|
File path = definePath(url, "pdf", depth, date);
|
||||||
path.getParentFile().mkdirs();
|
path.getParentFile().mkdirs();
|
||||||
boolean success = Html2Image.writeWkhtmltopdf(url.toNormalform(true), proxy, path);
|
boolean success = Html2Image.writeWkhtmltopdf(url.toNormalform(true), proxy, userAgent, path);
|
||||||
return success ? path : null;
|
return success ? path : null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -217,7 +217,7 @@ public final class LoaderDispatcher {
|
||||||
String ext = MultiProtocolURL.getFileExtension(file).toLowerCase();
|
String ext = MultiProtocolURL.getFileExtension(file).toLowerCase();
|
||||||
boolean extok = ext.length() == 0 || file.length() <= 1 || htmlParser.htmlExtensionsSet.contains(ext);
|
boolean extok = ext.length() == 0 || file.length() <= 1 || htmlParser.htmlExtensionsSet.contains(ext);
|
||||||
if (depthok && extok) {
|
if (depthok && extok) {
|
||||||
File snapshotFile = sb.snapshots.downloadPDFSnapshot(request.url(), request.depth(), new Date(), crawlProfile.snapshotReplaceold(), sb.getConfigBool("isTransparentProxy", false) ? "http://127.0.0.1:" + sb.getConfigInt("port", 8090) : null);
|
File snapshotFile = sb.snapshots.downloadPDFSnapshot(request.url(), request.depth(), new Date(), crawlProfile.snapshotReplaceold(), sb.getConfigBool("isTransparentProxy", false) ? "http://127.0.0.1:" + sb.getConfigInt("port", 8090) : null, agent.userAgent);
|
||||||
log.info("SNAPSHOT - " + (snapshotFile == null ? "could not generate snapshot for " + request.url().toNormalform(true) : "wrote " + snapshotFile + " for " + request.url().toNormalform(true)));
|
log.info("SNAPSHOT - " + (snapshotFile == null ? "could not generate snapshot for " + request.url().toNormalform(true) : "wrote " + snapshotFile + " for " + request.url().toNormalform(true)));
|
||||||
} else {
|
} else {
|
||||||
//if (!depthok) log.warn("SNAPSHOT: depth not ok, " + (crawlProfile == null ? "profile = null" : "entry.depth() = " + request.depth() + ", profile.snapshotMaxdepth() = " + crawlProfile.snapshotMaxdepth()));
|
//if (!depthok) log.warn("SNAPSHOT: depth not ok, " + (crawlProfile == null ? "profile = null" : "entry.depth() = " + request.depth() + ", profile.snapshotMaxdepth() = " + crawlProfile.snapshotMaxdepth()));
|
||||||
|
|
Loading…
Reference in New Issue
Block a user