Reactivated on-demand snapshot loading

This commit is contained in:
Michael Peter Christen 2014-12-16 12:09:57 +01:00
parent 2362ad7c34
commit 932faafffe
3 changed files with 62 additions and 45 deletions

View File

@ -26,6 +26,7 @@ import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.Collection;
import java.util.Date;
import java.util.Map;
import java.util.TreeMap;
@ -248,16 +249,21 @@ public class snapshot {
}
if (pdf || pngjpg) {
Collection<File> pdfSnapshots = Transactions.findPaths(durl, "pdf", Transactions.State.ANY);
Collection<File> pdfSnapshots = Transactions.findPaths(durl, "pdf", Transactions.State.INVENTORY);
File pdfFile = null;
if (pdfSnapshots.size() == 0) {
// if the client is authenticated, we create the pdf on the fly!
if (!authenticated) return null;
SolrDocument sd = sb.index.fulltext().getMetadata(durl.hash());
boolean success = false;
if (sd == null) {
success = Transactions.store(durl, new Date(), 99, false, true, sb.getConfigBool("isTransparentProxy", false) ? "http://127.0.0.1:" + sb.getConfigInt("port", 8090) : null, ClientIdentification.yacyProxyAgent, sb.getConfig("crawler.http.acceptLanguage", null));
} else {
SolrInputDocument sid = sb.index.fulltext().getDefaultConfiguration().toSolrInputDocument(sd);
boolean success = Transactions.store(sid, true, true, sb.getConfigBool("isTransparentProxy", false) ? "http://127.0.0.1:" + sb.getConfigInt("port", 8090) : null, ClientIdentification.yacyProxyAgent, sb.getConfig("crawler.http.acceptLanguage", null));
success = Transactions.store(sid, false, true, true, sb.getConfigBool("isTransparentProxy", false) ? "http://127.0.0.1:" + sb.getConfigInt("port", 8090) : null, ClientIdentification.yacyProxyAgent, sb.getConfig("crawler.http.acceptLanguage", null));
}
if (success) {
pdfSnapshots = Transactions.findPaths(durl, "pdf", Transactions.State.INVENTORY);
pdfSnapshots = Transactions.findPaths(durl, "pdf", Transactions.State.ANY);
if (pdfSnapshots.size() != 0) pdfFile = pdfSnapshots.iterator().next();
}
} else {

View File

@ -146,7 +146,7 @@ public class Transactions {
}
}
public static boolean store(final SolrInputDocument doc, final boolean loadImage, final boolean replaceOld, final String proxy, final ClientIdentification.Agent agent, final String acceptLanguage) {
public static boolean store(final SolrInputDocument doc, final boolean concurrency, final boolean loadImage, final boolean replaceOld, final String proxy, final ClientIdentification.Agent agent, final String acceptLanguage) {
// GET METADATA FROM DOC
final String urls = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName());
@ -160,17 +160,11 @@ public class Transactions {
return false;
}
// CLEAN UP OLD DATA (if wanted)
Collection<File> oldPaths = Transactions.findPaths(url, depth, null, Transactions.State.INVENTORY);
if (replaceOld) {
for (File oldPath: oldPaths) oldPath.delete();
}
boolean success = loadImage ? store(url, date, depth, concurrency, replaceOld, proxy, agent, acceptLanguage) : true;
if (success) {
// STORE METADATA FOR THE IMAGE
File metadataPath = Transactions.definePath(url, depth, date, "xml", Transactions.State.INVENTORY);
metadataPath.getParentFile().mkdirs();
boolean success = true;
try {
if (doc != null) {
FileOutputStream fos = new FileOutputStream(metadataPath);
@ -189,11 +183,29 @@ public class Transactions {
ConcurrentLog.logException(e);
success = false;
}
}
return success;
}
public static boolean store(final DigestURL url, final Date date, final int depth, final boolean concurrency, final boolean replaceOld, final String proxy, final ClientIdentification.Agent agent, final String acceptLanguage) {
// CLEAN UP OLD DATA (if wanted)
Collection<File> oldPaths = Transactions.findPaths(url, depth, null, Transactions.State.INVENTORY);
if (replaceOld && oldPaths != null) {
for (File oldPath: oldPaths) oldPath.delete();
}
// STORE METADATA FOR THE IMAGE
File metadataPath = Transactions.definePath(url, depth, date, "xml", Transactions.State.INVENTORY);
metadataPath.getParentFile().mkdirs();
boolean success = true;
// STORE AN IMAGE
if (success && loadImage) {
final String urls = url.toNormalform(true);
final File pdfPath = Transactions.definePath(url, depth, date, "pdf", Transactions.State.INVENTORY);
if (executorRunning.intValue() < Runtime.getRuntime().availableProcessors()) {
if (concurrency && executorRunning.intValue() < Runtime.getRuntime().availableProcessors()) {
Thread t = new Thread(){
@Override
public void run() {
@ -209,7 +221,6 @@ public class Transactions {
} else {
success = Html2Image.writeWkhtmltopdf(urls, proxy, agent.userAgent, acceptLanguage, pdfPath);
}
}
return success;
}

View File

@ -580,7 +580,7 @@ public class Segment {
String ext = MultiProtocolURL.getFileExtension(url.getFile()).toLowerCase();
if (ext.length() == 0 || url.getFile().length() <= 1 || htmlParser.htmlExtensionsSet.contains(ext)) {
// STORE IMAGE AND METADATA
Transactions.store(vector, crawlProfile.snapshotLoadImage(), crawlProfile.snapshotReplaceold(), proxy, crawlProfile.getAgent(), acceptLanguage);
Transactions.store(vector, true, crawlProfile.snapshotLoadImage(), crawlProfile.snapshotReplaceold(), proxy, crawlProfile.getAgent(), acceptLanguage);
}
}