yacy_search_server/source/net/yacy/crawler/data/Snapshots.java


/**
* Snapshots
* Copyright 2014 by Michael Peter Christen
* First released 29.11.2014 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.crawler.data;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;

import org.apache.solr.common.SolrDocument;

import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.util.Html2Image;
import net.yacy.search.index.Fulltext;
import net.yacy.search.schema.CollectionSchema;

/**
 * This class hosts document snapshots.
 *
 * The storage is organized in the following hierarchy:
 * - in the root path there is a subpath for each host:port
 * - in each host:port path there is a subpath for each crawl depth, two digits long
 * - in each crawl depth path there is a subpath for the first two characters of the url-hash, called the shard
 * - in each shard path are the snapshot files, named <urlhash>'.'<date>.<ext>
 * .. where <date> has the form "yyyyMMdd" and <ext> is one of {pdf,jpg,png,xml,json}.
 * The pdf is created with wkhtmltopdf, the jpg/png are created with the convert tool
 * and the xml/json are extracts from Solr.
 *
 * Including the date in the file name makes it possible to store several snapshots of the
 * same document, taken at different times. Using the crawl depth as a path component makes
 * it easier to extract a specific part of the domain.
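 *
 * For illustration, a pdf snapshot of http://example.com/index.html (hypothetical
 * url-hash "AbCdEfGhIjKl"), taken at crawl depth 2 on 2014-11-29, would be stored at:
 *
 *   <storageLocation>/example.com.80/02/Ab/AbCdEfGhIjKl.20141129.pdf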
 */
public class Snapshots {

    private final File storageLocation;

    /**
     * Create a snapshot repository at the given storage location.
     * @param location the root path of the snapshot hierarchy
     */
    public Snapshots(final File location) {
        this.storageLocation = location;
    }

    /**
     * Load a pdf snapshot of a document.
     * A proxy should be given to ensure that resources which are loaded several times
     * (e.g. images) are cached. Use http://localhost:<thisport> as proxy.
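     *
     * A minimal usage sketch (the storage path, the url and the proxy port are
     * illustrative assumptions, not fixed values):
     * <pre>{@code
     * Snapshots snapshots = new Snapshots(new File("DATA/SNAPSHOTS"));
     * DigestURL url = new DigestURL("http://example.com/index.html");
     * File pdf = snapshots.downloadPDFSnapshot(url, 2, new Date(), false, "http://localhost:8090");
     * if (pdf == null) {
     *     // wkhtmltopdf failed or is not installed
     * }
     * }</pre>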
     * @param url the document url
     * @param depth the crawl depth of the document
     * @param date the snapshot time
     * @param replaceOld if true, delete all existing snapshots of the url at this depth first
     * @param proxy a string of the form 'http://<host>:<port>'
     * @return the pdf file, or null if the snapshot could not be written
     */
    public File downloadPDFSnapshot(final DigestURL url, final int depth, final Date date, final boolean replaceOld, final String proxy) {
        Collection<File> oldPaths = findPaths(url, depth);
        if (replaceOld) {
            // remove all snapshots of this url at this depth before writing a new one
            for (File oldPath: oldPaths) oldPath.delete();
        }
        File path = definePath(url, "pdf", depth, date);
        path.getParentFile().mkdirs(); // the shard directory may not exist yet
        boolean success = Html2Image.writeWkhtmltopdf(url.toNormalform(true), proxy, path);
        return success ? path : null;
    }

    /**
     * Compute the path of a snapshot. This does not create the snapshot itself, it only
     * computes the file name; the parent directories of the path are not created either.
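     *
     * Illustrative sketch (values assumed for the example; the resulting name follows
     * the pattern described in the class comment):
     * <pre>{@code
     * File path = snapshots.definePath(url, "pdf", 2, new Date());
     * // e.g. <storageLocation>/example.com.80/02/Ab/AbCdEfGhIjKl.20141129.pdf
     * }</pre>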
     * @param url the document url
     * @param ext the file name extension, one of {pdf,jpg,png,xml,json}
     * @param depth the crawl depth of the document
     * @param date the snapshot time
     * @return a file path for the snapshot
     */
    public File definePath(final DigestURL url, final String ext, final int depth, final Date date) {
        String id = ASCII.String(url.hash());
        String ds = GenericFormatter.SHORT_DAY_FORMATTER.format(date);
        File path = new File(pathToShard(url, depth), id + "." + ds + "." + ext);
        return path;
    }

    /**
     * Get the crawl depth of a document; helper method for definePath to determine
     * the depth value from the search index.
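     *
     * Illustrative sketch (in YaCy the Fulltext instance is typically obtained from
     * the index segment, e.g. sb.index.fulltext(); names assumed for the example):
     * <pre>{@code
     * int depth = snapshots.getDepth(url, fulltext);
     * File path = snapshots.definePath(url, "pdf", depth, new Date());
     * }</pre>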
     * @param url the document url
     * @param fulltext the fulltext index of the search segment
     * @return the crawl depth of the document, or 0 if unknown
     */
    public int getDepth(final DigestURL url, final Fulltext fulltext) {
        Integer depth = null;
        if (fulltext.getDefaultConfiguration().contains(CollectionSchema.crawldepth_i)) {
            try {
                SolrDocument doc = fulltext.getDefaultConnector().getDocumentById(ASCII.String(url.hash()), CollectionSchema.crawldepth_i.getSolrFieldName());
                if (doc != null) {
                    depth = (Integer) doc.getFieldValue(CollectionSchema.crawldepth_i.getSolrFieldName());
                }
            } catch (IOException e) {
                // the search index is not available; fall back to depth 0
            }
        }
        return depth == null ? 0 : depth;
    }

    /**
     * For a given url, get all paths of stored snapshots.
     * The locations all belong to the single url but may represent different storage times.
     * This method is inefficient because it probes all possible depths; where the depth
     * is known, findPaths(url, depth) should be used instead.
     * @param url the document url
     * @return a collection of files with snapshots of the url
     */
    public Collection<File> findPaths(final DigestURL url) {
        // the actual crawl depth is unknown; probe depths 0..99 and return the first hit
        for (int i = 0; i < 100; i++) {
            Collection<File> paths = findPaths(url, i);
            if (paths.size() > 0) return paths;
        }
        return new ArrayList<>(0);
    }

    /**
     * For a given url and depth, get all paths of stored snapshots.
     * The locations all belong to the single url but may represent different storage times.
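     *
     * Usage sketch (depth 2 is an assumption for the example):
     * <pre>{@code
     * for (File snapshot : snapshots.findPaths(url, 2)) {
     *     System.out.println(snapshot.getName()); // <urlhash>.<yyyyMMdd>.<ext>
     * }
     * }</pre>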
     * @param url the document url
     * @param depth the crawl depth of the document
     * @return a collection of files with snapshots of the url
     */
    public Collection<File> findPaths(final DigestURL url, final int depth) {
        String id = ASCII.String(url.hash());
        File pathToShard = pathToShard(url, depth);
        String[] list = pathToShard.exists() && pathToShard.isDirectory() ? pathToShard.list() : null; // may be null if the path does not exist
        ArrayList<File> paths = new ArrayList<>();
        if (list != null) {
            // filter by the extension of the url's file name; note that for urls without
            // a file extension this is the empty string, which matches all snapshot types
            final String ext = MultiProtocolURL.getFileExtension(url.getFileName());
            for (String f: list) {
                if (f.startsWith(id) && f.endsWith(ext)) paths.add(new File(pathToShard, f));
            }
        }
        return paths;
    }
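
    /**
     * Compute the shard directory for a url at a given crawl depth:
     * <storageLocation>/<host>.<port>/<depth, two digits>/<first two characters of the url-hash>
     * The returned directory is not created by this method.
     * @param url the document url
     * @param depth the crawl depth of the document
     * @return the shard directory
     */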
    private File pathToShard(final DigestURL url, final int depth) {
        String id = ASCII.String(url.hash());
        File pathToHostDir = new File(this.storageLocation, url.getHost() + "." + url.getPort());
        File pathToDepthDir = new File(pathToHostDir, depth < 10 ? "0" + depth : Integer.toString(depth));
        File pathToShard = new File(pathToDepthDir, id.substring(0, 2));
        return pathToShard;
    }
}