yacy_search_server/source/net/yacy/crawler/data/Snapshots.java
Michael Peter Christen fed26f33a8 enhanced timezone management for indexed data:
To support the new time parser and search functions in YaCy, a high
precision detection of date and time of day is necessary. This
requires that the time zone of the document content and the time zone of
the user doing a search are detected. The time zone of the search
request is determined automatically using the browser's time zone offset,
which is delivered with the search request automatically and invisibly to the
user. The time zone for the content of web pages cannot be detected
automatically and must be an attribute of crawl starts. The advanced
crawl start now provides an input field to set the time zone as an
offset in minutes. All parsers must be passed a time zone offset, so
this required a change of the parser Java API. Many other changes
were made to correct the wrong handling of dates in YaCy, which
was to add a correction based on the time zone of the server. Now no
correction is added and all dates in YaCy are in the UTC/GMT time zone, a
normalized time zone for all peers.
2015-04-15 13:17:23 +02:00

/**
* Snapshots
* Copyright 2014 by Michael Peter Christen
* First released 29.11.2014 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.crawler.data;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import org.apache.solr.common.SolrDocument;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.search.index.Fulltext;
import net.yacy.search.schema.CollectionSchema;

/**
* This class hosts document snapshots.
*
* The storage is organized in the following hierarchy:
* - in the root path are subpaths for each host:port
* - in the host:port path are subpaths for the crawl depth, two digits in length
* - in the crawl depth path are subpaths for the first two characters of the url-hash, called shard
* - in the shard path are files, named with <urlhash>'.'<date>.<ext>
* .. where the <date> is a timestamp formatted with GenericFormatter.SHORT_MINUTE_FORMATTER and <ext> may be one of {pdf,jpg,png,xml,json}.
* The pdf is created with wkhtmltopdf, the jpg/png is created with convert
* and the xml/json is an extract from solr.
*
* The construction of the file name with the date makes it possible to keep several copies of the same document
* for different snapshot times. The use of the crawl depth makes it easier to extract a specific part
* of the domain.
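*
* Example path (hypothetical host, hash and date, for illustration only):
* <storageLocation>/example.org.80/02/AB/ABCDEF789012.201504151200.xml
* would be the solr xml snapshot of a document from host example.org (port 80), crawled at depth 2,
* with url hash ABCDEF789012; the shard "AB" is the first two characters of the hash.
*
* A minimal usage sketch (the storage directory and the url/depth variables are hypothetical):
* <pre>
* Snapshots snapshots = new Snapshots(new File("DATA/SNAPSHOTS"));
* Date now = new Date();
* File xmlTarget = snapshots.definePath(url, depth, now, "xml"); // only computes the path, writes nothing
* // ... write the solr dump to xmlTarget elsewhere ...
* snapshots.announceStorage(url, depth, now);                    // register the snapshot in the in-memory index
* Revisions revisions = snapshots.getRevisions(ASCII.String(url.hash()));
* </pre>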
*/
public class Snapshots {
private File storageLocation;
// for each domain (host.port) a TreeMap is kept where the key is the crawl depth and the value is a set
// of <date>.<urlhash> keys; this ordering gives access to all files in a specific order and provides a recent view on the documents
private Map<String, TreeMap<Integer, TreeSet<String>>> directory;
public Snapshots(final File location) {
this.storageLocation = location;
this.storageLocation.mkdirs();
// scan the location to fill the directory
this.directory = new HashMap<>();
for (String hostport: location.list()) {
TreeMap<Integer, TreeSet<String>> domaindepth = new TreeMap<>();
this.directory.put(hostport, domaindepth);
File domaindir = new File(location, hostport);
if (domaindir.isDirectory()) domainscan: for (String depth: domaindir.list()) {
TreeSet<String> dateid = new TreeSet<>();
Integer depthi = -1;
try {
depthi = Integer.parseInt(depth);
} catch (NumberFormatException e) {
continue domainscan;
}
domaindepth.put(depthi, dateid);
File sharddir = new File(domaindir, depth);
if (sharddir.isDirectory()) for (String shard: sharddir.list()) {
File snapshotdir = new File(sharddir, shard);
if (snapshotdir.isDirectory()) {
for (String snapshotfile: snapshotdir.list()) {
if (snapshotfile.endsWith(".xml")) {
String s = snapshotfile.substring(0, snapshotfile.length() - 4);
int p = s.indexOf('.');
assert p == 12;
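// the snapshot file name has the form <urlhash>.<date>.xml; re-order the parts to <date>.<urlhash>
// so that the TreeSet sorts the entries chronologically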
if (p > 0) {
String key = s.substring(p + 1) + '.' + s.substring(0, p);
dateid.add(key);
}
}
}
}
}
if (dateid.size() == 0) domaindepth.remove(depthi);
}
if (domaindepth.size() == 0) this.directory.remove(hostport);
}
}
/**
* get the number of entries in the snapshot directory
* @return the total number of different documents
*/
public int size() {
int c = 0;
for (Map<Integer, TreeSet<String>> m: directory.values()) {
for (TreeSet<String> n: m.values()) {
c += n.size();
}
}
return c;
}
/**
* get the set of <host>.<port> names in the snapshot directory
* @return the set of <host>.<port> names for which snapshots exist
*/
public Set<String> listHosts() {
return directory.keySet();
}
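/**
* A Revisions object bundles the stored snapshot dates and solr xml dump files for one url
* at a given crawl depth, together with the url hash and (if recoverable) the original url.
*/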
public final class Revisions {
public final int depth;
public final Date[] dates;
public final String urlhash;
public final String url;
public final File[] pathtoxml;
public Revisions(final String hostport, final int depth, final String datehash) {
this.depth = depth;
int p = datehash.indexOf('.');
this.dates = new Date[1];
String datestring = datehash.substring(0, p);
this.dates[0] = parseDate(datestring);
this.urlhash = datehash.substring(p + 1);
this.pathtoxml = new File[1];
this.pathtoxml[0] = new File(pathToShard(hostport, urlhash, depth), this.urlhash + "." + datestring + ".xml");
String u = null;
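// recover the original url by scanning the stored solr xml dump for the <str name="sku"> field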
if (this.pathtoxml[0].exists()) {
// use try-with-resources so the reader is closed even if reading fails
try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(this.pathtoxml[0])))) {
String line;
while ((line = reader.readLine()) != null) {
if (line.startsWith("<str name=\"sku\">")) {
u = line.substring(16, line.length() - 6);
break;
}
}
} catch (IOException e) {}
}
this.url = u;
}
}
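/**
* Get the revisions of a document by its url hash. Because the in-memory index is keyed by
* host and depth, this performs a linear scan over all known snapshot entries.
* @param urlhash the url hash of the document
* @return a Revisions object for the first matching entry, or null if the hash is unknown
*/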
public Revisions getRevisions(String urlhash) {
if (urlhash == null || urlhash.length() == 0) return null;
// search for the hash, we must iterate through all entries
for (Map.Entry<String, TreeMap<Integer, TreeSet<String>>> hostportDomaindepth: this.directory.entrySet()) {
String hostport = hostportDomaindepth.getKey();
for (Map.Entry<Integer, TreeSet<String>> depthDateHash: hostportDomaindepth.getValue().entrySet()) {
int depth = depthDateHash.getKey();
for (String dateHash: depthDateHash.getValue()) {
if (dateHash.endsWith(urlhash)) {
return new Revisions(hostport, depth, dateHash);
}
}
}
}
return null;
}
/**
* list the snapshots for a given host name
* @param hostport the <host>.<port> identifier for the domain
* @param depth restrict the result to the given depth, or pass -1 to include all depths
* @return a map assigning to each depth the collection of Revisions for the snapshots at that depth
*/
public TreeMap<Integer, Collection<Revisions>> listIDs(final String hostport, final int depth) {
TreeMap<Integer, Collection<Revisions>> result = new TreeMap<>();
TreeMap<Integer, TreeSet<String>> list = directory.get(hostport);
if (list != null) {
for (Map.Entry<Integer, TreeSet<String>> entry: list.entrySet()) {
if (depth != -1 && entry.getKey() != depth) continue;
Collection<Revisions> r = new ArrayList<>(entry.getValue().size());
for (String datehash: entry.getValue()) {
r.add(new Revisions(hostport, entry.getKey(), datehash));
}
result.put(entry.getKey(), r);
}
}
return result;
}
/**
* get the number of snapshots for the given host name
* @param hostport the <host>.<port> identifier for the domain
* @param depth restrict the count to the given depth, or pass -1 to count across all depths
* @return the total number of snapshot documents for the domain at the given depth
*/
public int listIDsSize(final String hostport, final int depth) {
int count = 0;
TreeMap<Integer, TreeSet<String>> list = directory.get(hostport);
if (list != null) {
for (Map.Entry<Integer, TreeSet<String>> entry: list.entrySet()) {
if (depth != -1 && entry.getKey() != depth) continue;
count += entry.getValue().size();
}
}
return count;
}
/**
* Compute the path of a snapshot. This neither creates the snapshot file nor the directories
* along the path; it only computes the target location.
* @param url
* @param ext
* @param depth
* @param date
* @return a file to the snapshot
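* Example (hypothetical values): definePath(url, 2, date, "pdf") resolves to
* <storageLocation>/<host>.<port>/02/<shard>/<urlhash>.<date>.pdf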
*/
public File definePath(final DigestURL url, final int depth, final Date date, final String ext) {
String id = ASCII.String(url.hash());
String ds = GenericFormatter.SHORT_MINUTE_FORMATTER.format(date);
File path = new File(pathToShard(url, depth), id + "." + ds + "." + ext);
return path;
}
/**
* Write information about the storage of a snapshot to the Snapshot-internal index.
* The actual writing of files to the target directory must be done elsewhere; this method does not store the snapshot files.
* @param url
* @param depth
* @param date
*/
public void announceStorage(final DigestURL url, final int depth, final Date date) {
String id = ASCII.String(url.hash());
String ds = GenericFormatter.SHORT_MINUTE_FORMATTER.format(date);
String pathToHostPortDir = pathToHostPortDir(url.getHost(), url.getPort());
TreeMap<Integer, TreeSet<String>> domaindepth = this.directory.get(pathToHostPortDir);
if (domaindepth == null) {domaindepth = new TreeMap<Integer, TreeSet<String>>(); this.directory.put(pathToHostPortDir(url.getHost(), url.getPort()), domaindepth);}
TreeSet<String> dateid = domaindepth.get(depth);
if (dateid == null) {dateid = new TreeSet<String>(); domaindepth.put(depth, dateid);}
dateid.add(ds + '.' + id);
}
/**
* Delete information about the storage of a snapshot from the Snapshot-internal index.
* The actual deletion of files in the target directory must be done elsewhere; this method does not delete the snapshot files.
* @param url
* @param depth
* @return the dates of all snapshot entries that were removed from the index for the given url and depth
*/
public Set<Date> announceDeletion(final DigestURL url, final int depth) {
HashSet<Date> dates = new HashSet<>();
String id = ASCII.String(url.hash());
String pathToHostPortDir = pathToHostPortDir(url.getHost(), url.getPort());
TreeMap<Integer, TreeSet<String>> domaindepth = this.directory.get(pathToHostPortDir);
if (domaindepth == null) return dates;
TreeSet<String> dateid = domaindepth.get(depth);
if (dateid == null) return dates;
Iterator<String> i = dateid.iterator();
while (i.hasNext()) {
String dis = i.next();
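// entries have the form <date>.<urlhash>; select all entries (i.e. all snapshot dates) for the given url hash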
if (dis.endsWith("." + id)) {
String d = dis.substring(0, dis.length() - id.length() - 1);
Date date = parseDate(d);
if (date != null) dates.add(date);
i.remove();
}
}
if (dateid.size() == 0) domaindepth.remove(depth);
if (domaindepth.size() == 0) this.directory.remove(pathToHostPortDir);
return dates;
}
/**
* Order enum class for the select method
*/
public static enum Order {
ANY, OLDESTFIRST, LATESTFIRST;
}
/**
* select a set of snapshots from the snapshot directory. The selection is either ordered
* by generation date (ascending == OLDESTFIRST, descending == LATESTFIRST) or
* unordered (ANY). The result can be restricted to a given host and/or depth.
* @param host selected host or null for all hosts
* @param depth selected depth or null for all depths
* @param order Order.ANY, Order.OLDESTFIRST or Order.LATESTFIRST
* @param maxcount the maximum number of entries; submit Integer.MAX_VALUE for an unlimited selection
* @return a map of url hashes to their Revisions, in the requested order
*/
public LinkedHashMap<String, Revisions> select(final String host, final Integer depth, final Order order, int maxcount) {
TreeMap<String, String[]> dateIdResult = new TreeMap<>();
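// collect candidates into a TreeMap keyed by <date>.<urlhash>; the natural order of these keys is the
// snapshot date, and the value holds the hostport and depth needed to construct a Revisions object later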
if (host == null && depth == null) {
loop: for (Map.Entry<String, TreeMap<Integer, TreeSet<String>>> hostportDepths: this.directory.entrySet()) {
for (Map.Entry<Integer, TreeSet<String>> depthIds: hostportDepths.getValue().entrySet()) {
for (String id: depthIds.getValue()) {
dateIdResult.put(id, new String[]{hostportDepths.getKey(), Integer.toString(depthIds.getKey())});
if (order == Order.ANY && dateIdResult.size() >= maxcount) break loop;
}
}
}
}
if (host == null && depth != null) {
loop: for (Map.Entry<String, TreeMap<Integer, TreeSet<String>>> hostportDepths: this.directory.entrySet()) {
TreeSet<String> ids = hostportDepths.getValue().get(depth);
if (ids != null) for (String id: ids) {
dateIdResult.put(id, new String[]{hostportDepths.getKey(), Integer.toString(depth)});
if (order == Order.ANY && dateIdResult.size() >= maxcount) break loop;
}
}
}
if (host != null && depth == null) {
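// when selecting by host name, port 80 is assumed for the hostport directory name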
String hostport = pathToHostPortDir(host,80);
TreeMap<Integer, TreeSet<String>> depthIdsMap = this.directory.get(hostport);
if (depthIdsMap != null) loop: for (Map.Entry<Integer, TreeSet<String>> depthIds: depthIdsMap.entrySet()) {
for (String id: depthIds.getValue()) {
dateIdResult.put(id, new String[]{hostport, Integer.toString(depthIds.getKey())});
if (order == Order.ANY && dateIdResult.size() >= maxcount) break loop;
}
}
}
if (host != null && depth != null) {
String hostport = pathToHostPortDir(host,80);
TreeMap<Integer, TreeSet<String>> domaindepth = this.directory.get(hostport);
if (domaindepth != null) {
TreeSet<String> ids = domaindepth.get(depth);
if (ids != null) loop: for (String id: ids) {
dateIdResult.put(id, new String[]{hostport, Integer.toString(depth)});
if (order == Order.ANY && dateIdResult.size() >= maxcount) break loop;
}
}
}
LinkedHashMap<String, Revisions> result = new LinkedHashMap<>();
Iterator<Map.Entry<String, String[]>> i = order == Order.LATESTFIRST ? dateIdResult.descendingMap().entrySet().iterator() : dateIdResult.entrySet().iterator();
while (i.hasNext() && result.size() < maxcount) {
Map.Entry<String, String[]> entry = i.next();
String datehash = entry.getKey();
int p = datehash.indexOf('.');
assert p >= 0;
Revisions r = new Revisions(entry.getValue()[0], Integer.parseInt(entry.getValue()[1]), datehash);
result.put(datehash.substring(p + 1), r);
}
return result;
}
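/**
* Parse the date part of a snapshot file name: first with minute precision
* (GenericFormatter.SHORT_MINUTE_FORMATTER), then with day precision as a fallback.
* @param d the date part of a snapshot file name
* @return the parsed date, or null if the string cannot be parsed
*/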
private static Date parseDate(String d) {
try {
return GenericFormatter.SHORT_MINUTE_FORMATTER.parse(d, 0).getTime();
} catch (ParseException e) {
try {
return GenericFormatter.SHORT_DAY_FORMATTER.parse(d, 0).getTime();
} catch (ParseException ee) {
return null;
}
}
}
/**
* get the crawl depth of a document; a helper method for definePath to determine the depth value
* @param url
* @param fulltext
* @return the crawl depth of the document, or 0 if it cannot be determined
*/
public int getDepth(final DigestURL url, final Fulltext fulltext) {
Integer depth = null;
if (fulltext.getDefaultConfiguration().contains(CollectionSchema.crawldepth_i)) {
try {
SolrDocument doc = fulltext.getDefaultConnector().getDocumentById(ASCII.String(url.hash()), CollectionSchema.crawldepth_i.getSolrFieldName());
if (doc != null) {
depth = (Integer) doc.getFieldValue(CollectionSchema.crawldepth_i.getSolrFieldName());
}
} catch (IOException e) {
}
}
return depth == null ? 0 : depth;
}
/**
* for a given url, get all paths for storage locations.
* The locations are all for the single url but may represent different storage times.
* This method is inefficient because it tests all possible depths; it is better to use
* findPaths/3 with a known depth.
* @param url
* @param ext
* @return a set of files for snapshots of the url
*/
public Collection<File> findPaths(final DigestURL url, final String ext) {
for (int i = 0; i < 100; i++) {
Collection<File> paths = findPaths(url, i, ext);
if (paths.size() > 0) return paths;
}
return new ArrayList<>(0);
}
// pathtoxml = <storageLocation>/<host>.<port>/<depth>/<shard>/<urlhash>.<date>.xml
/**
* for a given url, get all paths for storage locations.
* The locations are all for the single url but may represent different storage times.
* @param url
* @param ext required extension or null if the extension must not be checked
* @param depth
* @return a set of files for snapshots of the url
*/
public Collection<File> findPaths(final DigestURL url, final int depth, final String ext) {
String id = ASCII.String(url.hash());
File pathToShard = pathToShard(url, depth);
String[] list = pathToShard.exists() && pathToShard.isDirectory() ? pathToShard.list() : null; // may be null if path does not exist
ArrayList<File> paths = new ArrayList<>();
if (list != null) {
for (String f: list) {
if (f.startsWith(id) && (ext == null || f.endsWith(ext))) paths.add(new File(pathToShard, f));
}
}
return paths;
}
private File pathToShard(final DigestURL url, final int depth) {
return pathToShard(pathToHostPortDir(url.getHost(), url.getPort()), ASCII.String(url.hash()), depth);
}
private File pathToShard(final String hostport, final String urlhash, final int depth) {
File pathToHostDir = new File(storageLocation, hostport);
File pathToDepthDir = new File(pathToHostDir, pathToDepthDir(depth));
File pathToShard = new File(pathToDepthDir, pathToShard(urlhash));
return pathToShard;
}
private String pathToHostPortDir(final String host, final int port) {
return host + "." + port;
}
private String pathToDepthDir(final int depth) {
return depth < 10 ? "0" + depth : Integer.toString(depth);
}
private String pathToShard(final String urlhash) {
return urlhash.substring(0, 2);
}
}