mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-21 00:00:13 +02:00
5f0ab25382
MultiProtocolURI during normalform computation because that should always be done and also be done during initialization of the MultiProtocolURI Object. The new normalform method takes only one argument which should be 'true' unless you know exactly what you are doing.
433 lines
14 KiB
Java
433 lines
14 KiB
Java
/**
|
|
* URIMetadataNode
|
|
* Copyright 2012 by Michael Peter Christen
|
|
* First released 10.8.2012 at http://yacy.net
|
|
*
|
|
* This file is part of YaCy Content Integration
|
|
*
|
|
* This library is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU Lesser General Public
|
|
* License as published by the Free Software Foundation; either
|
|
* version 2.1 of the License, or (at your option) any later version.
|
|
*
|
|
* This library is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
* Lesser General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU Lesser General Public License
|
|
* along with this program in the file lgpl21.txt
|
|
* If not, see <http://www.gnu.org/licenses/>.
|
|
*/
|
|
|
|
package net.yacy.kelondro.data.meta;
|
|
|
|
import java.net.MalformedURLException;
|
|
import java.util.ArrayList;
|
|
import java.util.Date;
|
|
import java.util.regex.Pattern;
|
|
|
|
import net.yacy.cora.date.GenericFormatter;
|
|
import net.yacy.cora.document.ASCII;
|
|
import net.yacy.cora.document.UTF8;
|
|
import net.yacy.cora.federate.solr.SolrType;
|
|
import net.yacy.cora.federate.solr.YaCySchema;
|
|
import net.yacy.cora.lod.vocabulary.Tagging;
|
|
import net.yacy.cora.order.Base64Order;
|
|
import net.yacy.crawler.retrieval.Request;
|
|
import net.yacy.crawler.retrieval.Response;
|
|
import net.yacy.document.Condenser;
|
|
import net.yacy.kelondro.data.word.WordReference;
|
|
import net.yacy.kelondro.logging.Log;
|
|
import net.yacy.kelondro.util.Bitfield;
|
|
import net.yacy.utils.crypt;
|
|
|
|
import org.apache.solr.common.SolrDocument;
|
|
|
|
|
|
/**
|
|
* This is the URIMetadata object implementation for Solr documents.
|
|
* The purpose of this object is the migration from the old metadata structure to solr document.
|
|
* Future implementations should try to replace URIMetadata objects completely by SolrDocument objects
|
|
*/
|
|
public class URIMetadataNode implements URIMetadata {
|
|
|
|
private final byte[] hash;
|
|
private final String urlRaw, keywords;
|
|
private DigestURI url;
|
|
private Bitfield flags;
|
|
private final int imagec, audioc, videoc, appc;
|
|
private double lat, lon;
|
|
private long ranking; // during generation of a search result this value is set
|
|
private final SolrDocument doc;
|
|
private final String snippet;
|
|
private WordReference word; // this is only used if the url is transported via remote search requests
|
|
|
|
public URIMetadataNode(final SolrDocument doc) {
|
|
this.doc = doc;
|
|
this.snippet = "";
|
|
this.word = null;
|
|
this.ranking = Long.MIN_VALUE;
|
|
this.hash = ASCII.getBytes(getString(YaCySchema.id));
|
|
this.urlRaw = getString(YaCySchema.sku);
|
|
try {
|
|
this.url = new DigestURI(this.urlRaw, this.hash);
|
|
} catch (MalformedURLException e) {
|
|
Log.logException(e);
|
|
this.url = null;
|
|
}
|
|
|
|
// to set the flags bitfield we need to pre-load some values from the Solr document
|
|
this.keywords = getString(YaCySchema.keywords);
|
|
this.imagec = getInt(YaCySchema.imagescount_i);
|
|
this.audioc = getInt(YaCySchema.audiolinkscount_i);
|
|
this.videoc = getInt(YaCySchema.videolinkscount_i);
|
|
this.appc = getInt(YaCySchema.videolinkscount_i);
|
|
this.lon = 0.0d;
|
|
this.lat = 0.0d;
|
|
String latlon = (String) this.doc.getFieldValue(YaCySchema.coordinate_p.name());
|
|
if (latlon != null) {
|
|
int p = latlon.indexOf(',');
|
|
if (p > 0) {
|
|
this.lat = Double.parseDouble(latlon.substring(0, p));
|
|
this.lon = Double.parseDouble(latlon.substring(p + 1));
|
|
}
|
|
}
|
|
this.flags = new Bitfield();
|
|
if (this.keywords != null && this.keywords.indexOf("indexof") >= 0) this.flags.set(Condenser.flag_cat_indexof, true);
|
|
if (this.lon != 0.0d || this.lat != 0.0d) this.flags.set(Condenser.flag_cat_haslocation, true);
|
|
if (this.imagec > 0) this.flags.set(Condenser.flag_cat_hasimage, true);
|
|
if (this.audioc > 0) this.flags.set(Condenser.flag_cat_hasaudio, true);
|
|
if (this.videoc > 0) this.flags.set(Condenser.flag_cat_hasvideo, true);
|
|
if (this.appc > 0) this.flags.set(Condenser.flag_cat_hasapp, true);
|
|
}
|
|
|
|
public URIMetadataNode(final SolrDocument doc, final WordReference searchedWord, final long ranking) {
|
|
this(doc);
|
|
this.word = searchedWord;
|
|
this.ranking = ranking;
|
|
}
|
|
|
|
public URIMetadataRow toRow() {
|
|
return URIMetadataRow.importEntry(this.toString());
|
|
}
|
|
|
|
public SolrDocument getDocument() {
|
|
return this.doc;
|
|
}
|
|
|
|
private int getInt(YaCySchema field) {
|
|
assert !field.isMultiValued();
|
|
assert field.getType() == SolrType.integer;
|
|
Integer x = (Integer) this.doc.getFieldValue(field.name());
|
|
if (x == null) return 0;
|
|
return x.intValue();
|
|
}
|
|
|
|
private Date getDate(YaCySchema field) {
|
|
assert !field.isMultiValued();
|
|
assert field.getType() == SolrType.date;
|
|
Date x = (Date) this.doc.getFieldValue(field.name());
|
|
if (x == null) return new Date(0);
|
|
Date now = new Date();
|
|
return x.after(now) ? now : x;
|
|
}
|
|
|
|
private String getString(YaCySchema field) {
|
|
assert !field.isMultiValued();
|
|
assert field.getType() == SolrType.string || field.getType() == SolrType.text_general || field.getType() == SolrType.text_en_splitting_tight;
|
|
Object x = this.doc.getFieldValue(field.name());
|
|
if (x == null) return "";
|
|
if (x instanceof ArrayList) {
|
|
@SuppressWarnings("unchecked")
|
|
ArrayList<String> xa = (ArrayList<String>) x;
|
|
return xa.size() == 0 ? "" : xa.get(0);
|
|
}
|
|
return (String) x;
|
|
}
|
|
|
|
@SuppressWarnings("unchecked")
|
|
private ArrayList<String> getArrayList(YaCySchema field) {
|
|
assert field.isMultiValued();
|
|
assert field.getType() == SolrType.string || field.getType() == SolrType.text_general;
|
|
Object r = this.doc.getFieldValue(field.name());
|
|
if (r == null) return new ArrayList<String>(0);
|
|
if (r instanceof ArrayList) {
|
|
return (ArrayList<String>) r;
|
|
}
|
|
ArrayList<String> a = new ArrayList<String>(1);
|
|
a.add((String) r);
|
|
return a;
|
|
}
|
|
|
|
@Override
|
|
public byte[] hash() {
|
|
return this.hash;
|
|
}
|
|
|
|
@Override
|
|
public String hosthash() {
|
|
String hosthash = (String) this.doc.getFieldValue(YaCySchema.host_id_s.name());
|
|
if (hosthash == null) hosthash = ASCII.String(this.hash, 6, 6);
|
|
return hosthash;
|
|
}
|
|
|
|
@Override
|
|
public Date moddate() {
|
|
return getDate(YaCySchema.last_modified);
|
|
}
|
|
|
|
@Override
|
|
public DigestURI url() {
|
|
return this.url;
|
|
}
|
|
|
|
@Override
|
|
public boolean matches(Pattern matcher) {
|
|
return matcher.matcher(this.urlRaw.toLowerCase()).matches();
|
|
}
|
|
|
|
@Override
|
|
public String dc_title() {
|
|
ArrayList<String> a = getArrayList(YaCySchema.title);
|
|
if (a == null || a.size() == 0) return "";
|
|
return a.get(0);
|
|
}
|
|
|
|
@Override
|
|
public String dc_creator() {
|
|
return getString(YaCySchema.author);
|
|
}
|
|
|
|
@Override
|
|
public String dc_publisher() {
|
|
return getString(YaCySchema.publisher_t);
|
|
}
|
|
|
|
@Override
|
|
public String dc_subject() {
|
|
return this.keywords;
|
|
}
|
|
|
|
@Override
|
|
public double lat() {
|
|
return this.lat;
|
|
}
|
|
|
|
@Override
|
|
public double lon() {
|
|
return this.lon;
|
|
}
|
|
|
|
@Override
|
|
public long ranking() {
|
|
return this.ranking;
|
|
}
|
|
|
|
@Override
|
|
public Date loaddate() {
|
|
return getDate(YaCySchema.load_date_dt);
|
|
}
|
|
|
|
@Override
|
|
public Date freshdate() {
|
|
return getDate(YaCySchema.fresh_date_dt);
|
|
}
|
|
|
|
@Override
|
|
public String md5() {
|
|
return getString(YaCySchema.md5_s);
|
|
}
|
|
|
|
@Override
|
|
public char doctype() {
|
|
ArrayList<String> a = getArrayList(YaCySchema.content_type);
|
|
if (a == null || a.size() == 0) return Response.docType(this.url);
|
|
return Response.docType(a.get(0));
|
|
}
|
|
|
|
@Override
|
|
public byte[] language() {
|
|
String language = getString(YaCySchema.language_s);
|
|
if (language == null || language.length() == 0) return ASCII.getBytes("en");
|
|
return UTF8.getBytes(language);
|
|
}
|
|
|
|
|
|
@Override
|
|
public byte[] referrerHash() {
|
|
ArrayList<String> referrer = getArrayList(YaCySchema.referrer_id_txt);
|
|
if (referrer == null || referrer.size() == 0) return null;
|
|
return ASCII.getBytes(referrer.get(0));
|
|
}
|
|
|
|
@Override
|
|
public int size() {
|
|
return getInt(YaCySchema.size_i);
|
|
}
|
|
|
|
@Override
|
|
public Bitfield flags() {
|
|
return this.flags;
|
|
}
|
|
|
|
@Override
|
|
public int wordCount() {
|
|
return getInt(YaCySchema.wordcount_i);
|
|
}
|
|
|
|
@Override
|
|
public int llocal() {
|
|
return getInt(YaCySchema.inboundlinkscount_i);
|
|
}
|
|
|
|
@Override
|
|
public int lother() {
|
|
return getInt(YaCySchema.outboundlinkscount_i);
|
|
}
|
|
|
|
@Override
|
|
public int limage() {
|
|
return this.imagec;
|
|
}
|
|
|
|
@Override
|
|
public int laudio() {
|
|
return this.audioc;
|
|
}
|
|
|
|
@Override
|
|
public int lvideo() {
|
|
return this.videoc;
|
|
}
|
|
|
|
@Override
|
|
public int lapp() {
|
|
return this.appc;
|
|
}
|
|
|
|
@Override
|
|
public String snippet() {
|
|
return this.snippet;
|
|
}
|
|
|
|
@Override
|
|
public String[] collections() {
|
|
ArrayList<String> a = getArrayList(YaCySchema.collection_sxt);
|
|
return a.toArray(new String[a.size()]);
|
|
}
|
|
|
|
@Override
|
|
public WordReference word() {
|
|
return this.word;
|
|
}
|
|
|
|
@Override
|
|
public boolean isOlder(URIMetadata other) {
|
|
if (other == null) return false;
|
|
final Date tmoddate = moddate();
|
|
final Date omoddate = other.moddate();
|
|
if (tmoddate.before(omoddate)) return true;
|
|
if (tmoddate.equals(omoddate)) {
|
|
final Date tloaddate = loaddate();
|
|
final Date oloaddate = other.loaddate();
|
|
if (tloaddate.before(oloaddate)) return true;
|
|
if (tloaddate.equals(oloaddate)) return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
public static StringBuilder corePropList(URIMetadata md) {
|
|
// generate a parseable string; this is a simple property-list
|
|
final StringBuilder s = new StringBuilder(300);
|
|
|
|
// create new formatters to make concurrency possible
|
|
final GenericFormatter formatter = new GenericFormatter(GenericFormatter.FORMAT_SHORT_DAY, GenericFormatter.time_minute);
|
|
|
|
try {
|
|
s.append("hash=").append(ASCII.String(md.hash()));
|
|
s.append(",url=").append(crypt.simpleEncode(md.url().toNormalform(true)));
|
|
s.append(",descr=").append(crypt.simpleEncode(md.dc_title()));
|
|
s.append(",author=").append(crypt.simpleEncode(md.dc_creator()));
|
|
s.append(",tags=").append(crypt.simpleEncode(Tagging.cleanTagFromAutotagging(md.dc_subject())));
|
|
s.append(",publisher=").append(crypt.simpleEncode(md.dc_publisher()));
|
|
s.append(",lat=").append(md.lat());
|
|
s.append(",lon=").append(md.lon());
|
|
s.append(",mod=").append(formatter.format(md.moddate()));
|
|
s.append(",load=").append(formatter.format(md.loaddate()));
|
|
s.append(",fresh=").append(formatter.format(md.freshdate()));
|
|
s.append(",referrer=").append(md.referrerHash() == null ? "" : ASCII.String(md.referrerHash()));
|
|
s.append(",md5=").append(md.md5());
|
|
s.append(",size=").append(md.size());
|
|
s.append(",wc=").append(md.wordCount());
|
|
s.append(",dt=").append(md.doctype());
|
|
s.append(",flags=").append(md.flags().exportB64());
|
|
s.append(",lang=").append(md.language() == null ? "EN" : UTF8.String(md.language()));
|
|
s.append(",llocal=").append(md.llocal());
|
|
s.append(",lother=").append(md.lother());
|
|
s.append(",limage=").append(md.limage());
|
|
s.append(",laudio=").append(md.laudio());
|
|
s.append(",lvideo=").append(md.lvideo());
|
|
s.append(",lapp=").append(md.lapp());
|
|
if (md.word() != null) {
|
|
// append also word properties
|
|
final String wprop = md.word().toPropertyForm();
|
|
s.append(",wi=").append(Base64Order.enhancedCoder.encodeString(wprop));
|
|
}
|
|
return s;
|
|
} catch (final Throwable e) {
|
|
Log.logException(e);
|
|
return null;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* the toString format must be completely identical to URIMetadataRow because that is used
|
|
* to transport the data over p2p connections.
|
|
*/
|
|
@Override
|
|
public String toString(String snippet) {
|
|
// add information needed for remote transport
|
|
final StringBuilder core = corePropList(this);
|
|
if (core == null)
|
|
return null;
|
|
|
|
core.ensureCapacity(core.length() + snippet.length() * 2);
|
|
core.insert(0, '{');
|
|
core.append(",snippet=").append(crypt.simpleEncode(snippet));
|
|
core.append('}');
|
|
|
|
return core.toString();
|
|
//return "{" + core + ",snippet=" + crypt.simpleEncode(snippet) + "}";
|
|
}
|
|
|
|
|
|
/**
|
|
* @return the object as String.<br>
|
|
* This e.g. looks like this:
|
|
* <pre>{hash=jmqfMk7Y3NKw,referrer=------------,mod=20050610,load=20051003,size=51666,wc=1392,cc=0,local=true,q=AEn,dt=h,lang=uk,url=b|aHR0cDovL3d3dy50cmFuc3BhcmVuY3kub3JnL3N1cnZleXMv,descr=b|S25vd2xlZGdlIENlbnRyZTogQ29ycnVwdGlvbiBTdXJ2ZXlzIGFuZCBJbmRpY2Vz}</pre>
|
|
*/
|
|
@Override
|
|
public String toString() {
|
|
final StringBuilder core = corePropList(this);
|
|
if (core == null) return null;
|
|
core.insert(0, '{');
|
|
core.append('}');
|
|
return core.toString();
|
|
}
|
|
|
|
@Override
|
|
public Request toBalancerEntry(final String initiatorHash) {
|
|
return new Request(
|
|
ASCII.getBytes(initiatorHash),
|
|
url(),
|
|
referrerHash(),
|
|
dc_title(),
|
|
moddate(),
|
|
null,
|
|
0,
|
|
0,
|
|
0,
|
|
0);
|
|
}
|
|
} |