yacy_search_server/source/net/yacy/crawler/data/Snapshots.java
Michael Peter Christen 97f6089a41 YaCy can now create web page snapshots as pdf documents which can later
be transcoded into jpg for image previews. To create such pdfs, do the
following:

Install wkhtmltopdf and imagemagick on your OS:
On a Mac, download wkhtmltox-0.12.1_osx-cocoa-x86-64.pkg from
http://wkhtmltopdf.org/downloads.html and download
http://cactuslab.com/imagemagick/assets/ImageMagick-6.8.9-9.pkg.zip
On Debian, run "apt-get install wkhtmltopdf imagemagick"

Then, in /Settings_p.html?page=ProxyAccess, check "Transparent Proxy" and
"Always Fresh". The proxy is used by wkhtmltopdf to fetch web pages through
YaCy; with "Always Fresh" all pages can be served from the proxy cache.
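
For illustration only, here is a minimal sketch of what fetching a page through
the local YaCy proxy with wkhtmltopdf could look like. The actual invocation is
done by net.yacy.cora.util.Html2Image.writeWkhtmltopdf; the binary name, the
--proxy option and the port 8090 are assumptions about a default setup, not
taken from this commit.

// Hypothetical demo, not part of YaCy: fetch one page through the proxy and write a pdf.
public class WkhtmltopdfProxyDemo {
    public static void main(String[] args) throws Exception {
        ProcessBuilder pb = new ProcessBuilder(
                "wkhtmltopdf", "--proxy", "http://localhost:8090", // assumed YaCy default port
                "http://example.org/", "/tmp/example.org.pdf");
        pb.inheritIO(); // print wkhtmltopdf output to the console
        int exit = pb.start().waitFor();
        System.out.println("wkhtmltopdf exit code: " + exit); // 0 means the pdf was written
    }
}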

Finally, the expert web crawl start page shows a new option: a maximum crawl
depth up to which a pdf snapshot is generated for each page. The resulting
pdfs are then available in
DATA/HTCACHE/SNAPSHOTS/<host>.<port>/<depth>/<shard>/<urlhash>.<date>.pdf
2014-12-01 15:03:09 +01:00
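
For orientation, a minimal usage sketch of the class below (a code fragment that
assumes the imports of the file; the url, depth, date and proxy values are
illustrative, and 8090 is the assumed default YaCy port):

// Illustrative sketch: create one pdf snapshot through the local YaCy proxy.
Snapshots snapshots = new Snapshots(new File("DATA/HTCACHE/SNAPSHOTS"));
DigestURL url = new DigestURL("http://example.org/"); // may throw MalformedURLException
File pdf = snapshots.downloadPDFSnapshot(url, 0, new Date(), true, "http://localhost:8090");
if (pdf == null) {
    // wkhtmltopdf failed or is not installed
}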

/**
* Snapshots
* Copyright 2014 by Michael Peter Christen
* First released 29.11.2014 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.crawler.data;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import org.apache.solr.common.SolrDocument;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.util.Html2Image;
import net.yacy.search.index.Fulltext;
import net.yacy.search.schema.CollectionSchema;

/**
* This class hosts document snapshots.
*
* The storage is organized in the following hierarchy:
* - in the root path are subpaths for each host:port
* - in the host:port path are subpaths for the crawl depth, two digits length
* - in the crawl depth path are subpaths for the first two characters of the url-hash, called shard
* - in the shard path are files, named with <urlhash>'.'<date>.<ext>
* .. where the <date> has the form "yyyyMMdd" and ext may be one of {pdf,jpg,png,xml,json}.
* The pdf is created with wkhtmltopdf, jpg/png is created with convert
* and the xml/json is an extract from solr.
*
* Encoding the date in the file name makes it possible to keep several snapshots of the same
* document, taken at different times. Encoding the crawl depth in the path makes it easier to
* extract a specific part of the domain.
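*
* A hypothetical example (the url-hash is invented for illustration): a pdf snapshot of
* http://example.org/ taken at crawl depth 2 on 2014-12-01 would be stored at
*   <root>/example.org.80/02/Qc/QcJJTW0HSnkf.20141201.pdf
* i.e. host.port, two-digit depth, shard (first two characters of the url-hash),
* <urlhash>.<date>.<ext>.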
*/
public class Snapshots {

private File storageLocation;

public Snapshots(File location) {
this.storageLocation = location;
}

/**
* Load a pdf snapshot of a document.
* A proxy must be given to ensure that resources which are loaded repeatedly (e.g. images)
* are served from the proxy cache.
* Use http://localhost:<thisport> as proxy.
* @param url
* @param depth
* @param date
* @param replaceOld if true, delete existing snapshots of the url at the given depth first
* @param proxy - a string of the form 'http://<host>:<port>'
* @return the file of the written pdf, or null if wkhtmltopdf did not succeed
*/
public File downloadPDFSnapshot(final DigestURL url, final int depth, final Date date, boolean replaceOld, String proxy) {
Collection<File> oldPaths = findPaths(url, depth);
if (replaceOld) {
for (File oldPath: oldPaths) oldPath.delete();
}
File path = definePath(url, "pdf", depth, date);
path.getParentFile().mkdirs();
boolean success = Html2Image.writeWkhtmltopdf(url.toNormalform(true), proxy, path);
return success ? path : null;
}

/**
* Compute the path of a snapshot. This does not create the snapshot, only gives a path.
* Also, the path to the storage location is not created.
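* Example call (the snapshots instance, the url and all values are illustrative):
*   File f = snapshots.definePath(url, "pdf", 2, new Date());
*   // e.g. <root>/example.org.80/02/Qc/QcJJTW0HSnkf.20141201.pdf (hash invented)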
* @param url
* @param ext
* @param depth
* @param date
* @return the file where the snapshot would be stored (the file itself is not created)
*/
public File definePath(final DigestURL url, final String ext, final int depth, final Date date) {
String id = ASCII.String(url.hash());
String ds = GenericFormatter.SHORT_DAY_FORMATTER.format(date);
File path = new File(pathToShard(url, depth), id + "." + ds + "." + ext);
return path;
}

/**
* Get the crawl depth of a document from the search index; helper method for definePath
* to determine the depth value.
* @param url
* @param fulltext
* @return the crawl depth of the document, or 0 if it cannot be determined
*/
public int getDepth(final DigestURL url, final Fulltext fulltext) {
Integer depth = null;
if (fulltext.getDefaultConfiguration().contains(CollectionSchema.crawldepth_i)) {
try {
SolrDocument doc = fulltext.getDefaultConnector().getDocumentById(ASCII.String(url.hash()), CollectionSchema.crawldepth_i.getSolrFieldName());
if (doc != null) {
depth = (Integer) doc.getFieldValue(CollectionSchema.crawldepth_i.getSolrFieldName());
}
} catch (IOException e) {
// the index lookup failed; the depth stays unknown and 0 is returned below
}
}
return depth == null ? 0 : depth;
}

/**
* For a given url, get all paths of stored snapshots.
* The locations all belong to the single url but may represent different snapshot times.
* This method is inefficient because it probes all crawl depths; if the depth is known,
* use findPaths(url, depth) instead.
* @param url
* @return a collection of files with snapshots of the url
*/
public Collection<File> findPaths(final DigestURL url) {
for (int i = 0; i < 100; i++) {
Collection<File> paths = findPaths(url, i);
if (paths.size() > 0) return paths;
}
return new ArrayList<>(0);
}

/**
* For a given url and crawl depth, get all paths of stored snapshots.
* The locations all belong to the single url but may represent different snapshot times.
* @param url
* @param depth
* @return a collection of files with snapshots of the url
*/
public Collection<File> findPaths(final DigestURL url, final int depth) {
String id = ASCII.String(url.hash());
File pathToShard = pathToShard(url, depth);
String[] list = pathToShard.exists() && pathToShard.isDirectory() ? pathToShard.list() : null; // may be null if path does not exist
ArrayList<File> paths = new ArrayList<>();
if (list != null) {
final String ext = MultiProtocolURL.getFileExtension(url.getFileName());
for (String f: list) {
if (f.startsWith(id) && f.endsWith(ext)) paths.add(new File(pathToShard, f));
}
}
return paths;
}

private File pathToShard(final DigestURL url, final int depth) {
String id = ASCII.String(url.hash());
File pathToHostDir = new File(storageLocation, url.getHost() + "." + url.getPort());
File pathToDepthDir = new File(pathToHostDir, depth < 10 ? "0" + depth : Integer.toString(depth));
File pathToShard = new File(pathToDepthDir, id.substring(0, 2));
return pathToShard;
}
}