yacy_search_server/source/net/yacy/kelondro/data/meta/URIMetadataRow.java

571 lines
25 KiB
Java
Raw Normal View History

// URLMetadata.java
// (C) 2006 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 2006 on http://www.anomic.de
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2009-03-20 16:44:59 +0100 (Fr, 20 Mrz 2009) $
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.kelondro.data.meta;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.text.ParseException;
import java.util.Date;
import java.util.List;
import java.util.Properties;
import java.util.regex.Pattern;
import net.yacy.kelondro.data.word.WordReferenceRow;
import net.yacy.kelondro.data.word.WordReferenceVars;
import net.yacy.kelondro.index.Row;
import net.yacy.kelondro.io.CharBuffer;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.order.Bitfield;
import net.yacy.kelondro.order.Digest;
import net.yacy.kelondro.order.NaturalOrder;
import net.yacy.kelondro.util.ByteBuffer;
import net.yacy.kelondro.util.DateFormatter;
import net.yacy.kelondro.util.kelondroException;
import net.yacy.kelondro.util.MapTools;
import de.anomic.crawler.retrieval.Request;
import de.anomic.search.QueryParams;
import de.anomic.tools.crypt;
public class URIMetadataRow implements URIMetadata {
// this object stores attributes for URL entries
public static final Row rowdef = new Row(
"String hash-12, " + // the url's hash
"String comp-360, " + // components: the url, description, author, tags and publisher
"Cardinal mod-4 {b256}, " + // last-modified from the httpd
"Cardinal load-4 {b256}, " + // time when the url was loaded
"Cardinal fresh-4 {b256}, " + // time until this url is fresh
"String referrer-12, " + // (one of) the url's referrer hash(es)
"byte[] md5-8, " + // the md5 of the url content (to identify changes)
"Cardinal size-6 {b256}, " + // size of file in bytes
"Cardinal wc-3 {b256}, " + // size of file by number of words; for video and audio: seconds
"byte[] dt-1, " + // doctype, taken from extension or any other heuristic
"Bitfield flags-4, " + // flags; any stuff (see Word-Entity definition)
"String lang-2, " + // language
"Cardinal llocal-2 {b256}, " + // # of outlinks to same domain; for video and image: width
"Cardinal lother-2 {b256}, " + // # of outlinks to outside domain; for video and image: height
"Cardinal limage-2 {b256}, " + // # of embedded image links
"Cardinal laudio-2 {b256}, " + // # of embedded audio links; for audio: track number; for video: number of audio tracks
"Cardinal lvideo-2 {b256}, " + // # of embedded video links
"Cardinal lapp-2 {b256}", // # of embedded links to applications
Base64Order.enhancedCoder
);
/* ===========================================================================
* Constants to access the various columns of an URL entry
* =========================================================================== */
private static final int col_hash = 0; // the url's hash
private static final int col_comp = 1; // components: the url, description, author and tags. As 5th element, an ETag is possible
private static final int col_mod = 2; // the modifed-date time from the server (servertime in row)
private static final int col_load = 3; // time when the url was loaded
private static final int col_fresh = 4; // time until this url is fresh
private static final int col_referrer = 5; // a referrer of the url (there may be several, but this is the one that was acually referring to this one)
private static final int col_md5 = 6; // the md5 of the url content (to identify changes)
private static final int col_size = 7; // size of file in bytes
private static final int col_wc = 8; // size of file by number of words; for video and audio: seconds
private static final int col_dt = 9; // doctype, taken from extension or any other heuristic
private static final int col_flags = 10; // flags; any stuff (see Word-Entity definition)
private static final int col_lang = 11; // language
private static final int col_llocal = 12; // # of outlinks to same domain; for video and image: width
private static final int col_lother = 13; // # of outlinks to outside domain; for video and image: height
private static final int col_limage = 14; // # of embedded image links
private static final int col_laudio = 15; // # of embedded audio links; for audio: track number; for video: number of audio tracks
private static final int col_lvideo = 16; // # of embedded video links
private static final int col_lapp = 17; // # of embedded links to applications
private final Row.Entry entry;
private final String snippet;
private WordReferenceVars word; // this is only used if the url is transported via remote search requests
private final long ranking; // during generation of a search result this value is set
private Components comp;
public URIMetadataRow() {
// create a dummy entry, good to produce poison objects
this.entry = rowdef.newEntry();
this.snippet = null;
this.word = null;
this.ranking = 0;
this.comp = null;
}
public URIMetadataRow(
final DigestURI url,
final String dc_title,
final String dc_creator,
final String dc_subject,
final String dc_publisher,
final Date mod,
final Date load,
final Date fresh,
final String referrer,
final byte[] md5,
final long size,
final int wc,
final char dt,
final Bitfield flags,
final String lang,
final int llocal,
final int lother,
final int laudio,
final int limage,
final int lvideo,
final int lapp) {
// create new entry
this.entry = rowdef.newEntry();
this.entry.setCol(col_hash, url.hash());
this.entry.setCol(col_comp, encodeComp(url, dc_title, dc_creator, dc_subject, dc_publisher));
encodeDate(col_mod, mod);
encodeDate(col_load, load);
encodeDate(col_fresh, fresh);
try {
this.entry.setCol(col_referrer, (referrer == null) ? null : referrer.getBytes("UTF-8"));
} catch (UnsupportedEncodingException e) {
this.entry.setCol(col_referrer, (referrer == null) ? null : referrer.getBytes());
}
this.entry.setCol(col_md5, md5);
this.entry.setCol(col_size, size);
this.entry.setCol(col_wc, wc);
this.entry.setCol(col_dt, new byte[]{(byte) dt});
this.entry.setCol(col_flags, flags.bytes());
try {
this.entry.setCol(col_lang, lang.getBytes("UTF-8"));
} catch (UnsupportedEncodingException e) {
this.entry.setCol(col_lang, lang.getBytes());
}
this.entry.setCol(col_llocal, llocal);
this.entry.setCol(col_lother, lother);
this.entry.setCol(col_limage, limage);
this.entry.setCol(col_laudio, laudio);
this.entry.setCol(col_lvideo, lvideo);
this.entry.setCol(col_lapp, lapp);
//System.out.println("===DEBUG=== " + load.toString() + ", " + decodeDate(col_load).toString());
this.snippet = null;
this.word = null;
this.ranking = 0;
this.comp = null;
}
private void encodeDate(final int col, final Date d) {
// calculates the number of days since 1.1.1970 and returns this as 4-byte array
// 86400000 is the number of milliseconds in one day
this.entry.setCol(col, NaturalOrder.encodeLong(d.getTime() / 86400000L, 4));
}
private Date decodeDate(final int col) {
long t = this.entry.getColLong(col);
/*if (t < 14600) */return new Date(86400000L * t); // time was stored as number of days since epoch
/*
if (t < 350400) return new Date(3600000L * t); // hours since epoch
if (t < 21024000) return new Date(60000L * t); // minutes since epoch
*/
}
public static byte[] encodeComp(final DigestURI url, final String dc_title, final String dc_creator, final String dc_subject, final String dc_publisher) {
final CharBuffer s = new CharBuffer(360);
s.append(url.toNormalform(false, true)).append(10);
s.append(dc_title).append(10);
s.append(dc_creator).append(10);
s.append(dc_subject).append(10);
s.append(dc_publisher).append(10);
try {
return s.toString().getBytes("UTF-8");
} catch (UnsupportedEncodingException e) {
return s.toString().getBytes();
}
}
public URIMetadataRow(final Row.Entry entry, final WordReferenceVars searchedWord, final long ranking) {
this.entry = entry;
this.snippet = null;
this.word = searchedWord;
this.ranking = ranking;
this.comp = null;
}
public URIMetadataRow(final Properties prop) {
// generates an plasmaLURLEntry using the properties from the argument
// the property names must correspond to the one from toString
//System.out.println("DEBUG-ENTRY: prop=" + prop.toString());
DigestURI url;
try {
url = new DigestURI(crypt.simpleDecode(prop.getProperty("url", ""), null), prop.getProperty("hash").getBytes());
} catch (final MalformedURLException e) {
url = null;
}
String descr = crypt.simpleDecode(prop.getProperty("descr", ""), null); if (descr == null) descr = "";
String dc_creator = crypt.simpleDecode(prop.getProperty("author", ""), null); if (dc_creator == null) dc_creator = "";
String tags = crypt.simpleDecode(prop.getProperty("tags", ""), null); if (tags == null) tags = "";
String dc_publisher = crypt.simpleDecode(prop.getProperty("publisher", ""), null); if (dc_publisher == null) dc_publisher = "";
this.entry = rowdef.newEntry();
this.entry.setCol(col_hash, url.hash()); // FIXME potential null pointer access
this.entry.setCol(col_comp, encodeComp(url, descr, dc_creator, tags, dc_publisher));
try {
encodeDate(col_mod, DateFormatter.parseShortDay(prop.getProperty("mod", "20000101")));
} catch (final ParseException e) {
encodeDate(col_mod, new Date());
}
try {
encodeDate(col_load, DateFormatter.parseShortDay(prop.getProperty("load", "20000101")));
} catch (final ParseException e) {
encodeDate(col_load, new Date());
}
try {
encodeDate(col_fresh, DateFormatter.parseShortDay(prop.getProperty("fresh", "20000101")));
} catch (final ParseException e) {
encodeDate(col_fresh, new Date());
}
try {
this.entry.setCol(col_referrer, prop.getProperty("referrer", "").getBytes("UTF-8"));
} catch (UnsupportedEncodingException e1) {
this.entry.setCol(col_referrer, prop.getProperty("referrer", "").getBytes());
}
this.entry.setCol(col_md5, Digest.decodeHex(prop.getProperty("md5", "")));
this.entry.setCol(col_size, Integer.parseInt(prop.getProperty("size", "0")));
this.entry.setCol(col_wc, Integer.parseInt(prop.getProperty("wc", "0")));
this.entry.setCol(col_dt, new byte[]{(byte) prop.getProperty("dt", "t").charAt(0)});
final String flags = prop.getProperty("flags", "AAAAAA");
this.entry.setCol(col_flags, (flags.length() > 6) ? QueryParams.empty_constraint.bytes() : (new Bitfield(4, flags)).bytes());
try {
this.entry.setCol(col_lang, prop.getProperty("lang", "uk").getBytes("UTF-8"));
} catch (UnsupportedEncodingException e) {
this.entry.setCol(col_lang, prop.getProperty("lang", "uk").getBytes());
}
this.entry.setCol(col_llocal, Integer.parseInt(prop.getProperty("llocal", "0")));
this.entry.setCol(col_lother, Integer.parseInt(prop.getProperty("lother", "0")));
this.entry.setCol(col_limage, Integer.parseInt(prop.getProperty("limage", "0")));
this.entry.setCol(col_laudio, Integer.parseInt(prop.getProperty("laudio", "0")));
this.entry.setCol(col_lvideo, Integer.parseInt(prop.getProperty("lvideo", "0")));
this.entry.setCol(col_lapp, Integer.parseInt(prop.getProperty("lapp", "0")));
this.snippet = crypt.simpleDecode(prop.getProperty("snippet", ""), null);
this.word = null;
if (prop.containsKey("word")) throw new kelondroException("old database structure is not supported");
if (prop.containsKey("wi")) {
this.word = new WordReferenceVars(new WordReferenceRow(Base64Order.enhancedCoder.decodeString(prop.getProperty("wi", ""))));
}
this.ranking = 0;
this.comp = null;
}
public static URIMetadataRow importEntry(final String propStr) {
if (propStr == null || (propStr.length() > 0 && propStr.charAt(0) != '{') || !propStr.endsWith("}")) {
return null;
}
try {
return new URIMetadataRow(MapTools.s2p(propStr.substring(1, propStr.length() - 1)));
} catch (final kelondroException e) {
// wrong format
return null;
}
}
private StringBuilder corePropList() {
// generate a parseable string; this is a simple property-list
final Components metadata = this.metadata();
final StringBuilder s = new StringBuilder(300);
if (metadata == null) return null;
//System.out.println("author=" + comp.author());
try {
s.append("hash=").append(new String(hash()));
assert (s.toString().indexOf(0) < 0);
s.append(",url=").append(crypt.simpleEncode(metadata.url().toNormalform(false, true)));
assert (s.toString().indexOf(0) < 0);
s.append(",descr=").append(crypt.simpleEncode(metadata.dc_title()));
assert (s.toString().indexOf(0) < 0);
s.append(",author=").append(crypt.simpleEncode(metadata.dc_creator()));
assert (s.toString().indexOf(0) < 0);
s.append(",tags=").append(crypt.simpleEncode(metadata.dc_subject()));
assert (s.toString().indexOf(0) < 0);
s.append(",publisher=").append(crypt.simpleEncode(metadata.dc_publisher()));
assert (s.toString().indexOf(0) < 0);
s.append(",mod=").append(DateFormatter.formatShortDay(moddate()));
assert (s.toString().indexOf(0) < 0);
s.append(",load=").append(DateFormatter.formatShortDay(loaddate()));
assert (s.toString().indexOf(0) < 0);
s.append(",fresh=").append(DateFormatter.formatShortDay(freshdate()));
assert (s.toString().indexOf(0) < 0);
s.append(",referrer=").append(referrerHash() == null ? "" : new String(referrerHash()));
assert (s.toString().indexOf(0) < 0);
s.append(",md5=").append(md5());
assert (s.toString().indexOf(0) < 0);
s.append(",size=").append(size());
assert (s.toString().indexOf(0) < 0);
s.append(",wc=").append(wordCount());
assert (s.toString().indexOf(0) < 0);
s.append(",dt=").append(doctype());
assert (s.toString().indexOf(0) < 0);
s.append(",flags=").append(flags().exportB64());
assert (s.toString().indexOf(0) < 0);
s.append(",lang=").append(language());
assert (s.toString().indexOf(0) < 0);
s.append(",llocal=").append(llocal());
assert (s.toString().indexOf(0) < 0);
s.append(",lother=").append(lother());
assert (s.toString().indexOf(0) < 0);
s.append(",limage=").append(limage());
assert (s.toString().indexOf(0) < 0);
s.append(",laudio=").append(laudio());
assert (s.toString().indexOf(0) < 0);
s.append(",lvideo=").append(lvideo());
assert (s.toString().indexOf(0) < 0);
s.append(",lapp=").append(lapp());
assert (s.toString().indexOf(0) < 0);
if (this.word != null) {
// append also word properties
s.append(",wi=").append(Base64Order.enhancedCoder.encodeString(word.toPropertyForm()));
}
assert (s.toString().indexOf(0) < 0);
return s;
} catch (final Exception e) {
// serverLog.logFailure("plasmaLURL.corePropList", e.getMessage());
// if (moddate == null) serverLog.logFailure("plasmaLURL.corePropList", "moddate=null");
// if (loaddate == null) serverLog.logFailure("plasmaLURL.corePropList", "loaddate=null");
Log.logException(e);
return null;
}
}
public Row.Entry toRowEntry() {
return this.entry;
}
public byte[] hash() {
// return a url-hash, based on the md5 algorithm
// the result is a String of 12 bytes within a 72-bit space
// (each byte has an 6-bit range)
// that should be enough for all web pages on the world
return this.entry.getPrimaryKeyBytes();
}
public long ranking() {
return this.ranking;
}
public Components metadata() {
// avoid double computation of metadata elements
if (this.comp != null) return this.comp;
// parse elements from comp field;
byte[] c = this.entry.getColBytes(col_comp, true);
List<byte[]> cl = ByteBuffer.split(c, (byte) 10);
this.comp = new Components(
(cl.size() > 0) ? new String(cl.get(0)) : "",
hash(),
(cl.size() > 1) ? new String(cl.get(1)) : "",
(cl.size() > 2) ? new String(cl.get(2)) : "",
(cl.size() > 3) ? new String(cl.get(3)) : "",
(cl.size() > 4) ? new String(cl.get(4)) : "");
return this.comp;
}
public Date moddate() {
return decodeDate(col_mod);
}
public Date loaddate() {
return decodeDate(col_load);
}
public Date freshdate() {
return decodeDate(col_fresh);
}
public byte[] referrerHash() {
// return the creator's hash or null if there is none
// FIXME: There seem to be some malformed entries in the databasees like "null\0\0\0\0\0\0\0\0"
final byte[] r = entry.getColBytes(col_referrer, true);
if (r != null) {
int i = r.length;
while (i > 0) {
if (r[--i] == 0) return null;
}
}
return r;
}
public String md5() {
// returns the md5 in hex representation
return Digest.encodeHex(entry.getColBytes(col_md5, true));
}
public char doctype() {
return (char) entry.getColByte(col_dt);
}
public String language() {
return this.entry.getColString(col_lang, null);
}
public int size() {
return (int) this.entry.getColLong(col_size);
}
public Bitfield flags() {
return new Bitfield(this.entry.getColBytes(col_flags, true));
}
public int wordCount() {
return (int) this.entry.getColLong(col_wc);
}
public int llocal() {
return (int) this.entry.getColLong(col_llocal);
}
public int lother() {
return (int) this.entry.getColLong(col_lother);
}
public int limage() {
return (int) this.entry.getColLong(col_limage);
}
public int laudio() {
return (int) this.entry.getColLong(col_laudio);
}
public int lvideo() {
return (int) this.entry.getColLong(col_lvideo);
}
public int lapp() {
return (int) this.entry.getColLong(col_lapp);
}
public String snippet() {
// the snippet may appear here if the url was transported in a remote search
// it will not be saved anywhere, but can only be requested here
return snippet;
}
public WordReferenceVars word() {
return word;
}
public boolean isOlder(final URIMetadata other) {
if (other == null) return false;
final Date tmoddate = moddate();
final Date omoddate = other.moddate();
if (tmoddate.before(omoddate)) return true;
if (tmoddate.equals(omoddate)) {
final Date tloaddate = loaddate();
final Date oloaddate = other.loaddate();
if (tloaddate.before(oloaddate)) return true;
if (tloaddate.equals(oloaddate)) return true;
}
return false;
}
public String toString(final String snippet) {
// add information needed for remote transport
final StringBuilder core = corePropList();
if (core == null)
return null;
core.ensureCapacity(core.length() + snippet.length() * 2);
core.insert(0, "{");
core.append(",snippet=").append(crypt.simpleEncode(snippet));
core.append("}");
return core.toString();
//return "{" + core + ",snippet=" + crypt.simpleEncode(snippet) + "}";
}
public Request toBalancerEntry(final String initiatorHash) {
return new Request(
initiatorHash.getBytes(),
metadata().url(),
referrerHash(),
metadata().dc_title(),
moddate(),
null,
0,
0,
0);
}
/**
* @return the object as String.<br>
* This e.g. looks like this:
* <pre>{hash=jmqfMk7Y3NKw,referrer=------------,mod=20050610,load=20051003,size=51666,wc=1392,cc=0,local=true,q=AEn,dt=h,lang=uk,url=b|aHR0cDovL3d3dy50cmFuc3BhcmVuY3kub3JnL3N1cnZleXMv,descr=b|S25vd2xlZGdlIENlbnRyZTogQ29ycnVwdGlvbiBTdXJ2ZXlzIGFuZCBJbmRpY2Vz}</pre>
*/
@Override
public String toString() {
final StringBuilder core = corePropList();
if (core == null) return null;
core.insert(0, "{");
core.append("}");
return core.toString();
//return "{" + core + "}";
}
public class Components {
private DigestURI url;
private String urlRaw;
private byte[] urlHash;
private final String dc_title, dc_creator, dc_subject, dc_publisher;
public Components(final String urlRaw, final byte[] urlhash, final String title, final String author, final String tags, final String publisher) {
this.url = null;
this.urlRaw = urlRaw;
this.urlHash = urlhash;
this.dc_title = title;
this.dc_creator = author;
this.dc_subject = tags;
this.dc_publisher = publisher;
}
public boolean matches(Pattern matcher) {
if (this.urlRaw != null) return matcher.matcher(this.urlRaw).matches();
if (this.url != null) return matcher.matcher(this.url.toNormalform(true, true)).matches();
return false;
}
public DigestURI url() {
if (this.url == null) {
try {
this.url = new DigestURI(this.urlRaw, this.urlHash);
} catch (MalformedURLException e) {
this.url = null;
}
this.urlRaw = null;
this.urlHash = null;
}
return this.url;
}
public String dc_title() { return this.dc_title; }
public String dc_creator() { return this.dc_creator; }
public String dc_publisher() { return this.dc_publisher; }
public String dc_subject() { return this.dc_subject; }
}
}