mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-21 00:00:13 +02:00
5f0ab25382
MultiProtocolURI during normalform computation because that should always be done and also be done during initialization of the MultiProtocolURI Object. The new normalform method takes only one argument which should be 'true' unless you know exactly what you are doing.
299 lines
9.2 KiB
Java
299 lines
9.2 KiB
Java
// DCEntry.java
|
|
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
|
|
// first published 15.04.2009 on http://yacy.net
|
|
//
|
|
// $LastChangedDate$
|
|
// $LastChangedRevision$
|
|
// $LastChangedBy$
|
|
//
|
|
// LICENSE
|
|
//
|
|
// This program is free software; you can redistribute it and/or modify
|
|
// it under the terms of the GNU General Public License as published by
|
|
// the Free Software Foundation; either version 2 of the License, or
|
|
// (at your option) any later version.
|
|
//
|
|
// This program is distributed in the hope that it will be useful,
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
// GNU General Public License for more details.
|
|
//
|
|
// You should have received a copy of the GNU General Public License
|
|
// along with this program; if not, write to the Free Software
|
|
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
|
|
|
|
package net.yacy.document.content;
|
|
|
|
import java.io.IOException;
|
|
import java.io.OutputStreamWriter;
|
|
import java.net.MalformedURLException;
|
|
import java.text.Collator;
|
|
import java.text.ParseException;
|
|
import java.util.ArrayList;
|
|
import java.util.Date;
|
|
import java.util.HashSet;
|
|
import java.util.List;
|
|
import java.util.Locale;
|
|
import java.util.TreeMap;
|
|
|
|
import net.yacy.cora.date.ISO8601Formatter;
|
|
import net.yacy.document.Document;
|
|
import net.yacy.kelondro.data.meta.DigestURI;
|
|
import net.yacy.kelondro.logging.Log;
|
|
|
|
public class DCEntry extends TreeMap<String, String> {
|
|
|
|
private static final long serialVersionUID = -2050291583515701559L;
|
|
|
|
// use a collator to relax when distinguishing between lowercase und uppercase letters
|
|
private static final Collator insensitiveCollator = Collator.getInstance(Locale.US);
|
|
static {
|
|
insensitiveCollator.setStrength(Collator.SECONDARY);
|
|
insensitiveCollator.setDecomposition(Collator.NO_DECOMPOSITION);
|
|
}
|
|
public static final DCEntry poison = new DCEntry();
|
|
|
|
public DCEntry() {
|
|
super((Collator) insensitiveCollator.clone());
|
|
}
|
|
|
|
public DCEntry(
|
|
DigestURI url,
|
|
Date date,
|
|
String title,
|
|
String author,
|
|
String body,
|
|
double lat,
|
|
double lon
|
|
) {
|
|
super((Collator) insensitiveCollator.clone());
|
|
this.put("dc:identifier", url.toNormalform(true));
|
|
this.put("dc:date", ISO8601Formatter.FORMATTER.format(date));
|
|
this.put("dc:title", title);
|
|
this.put("dc:creator", author);
|
|
this.put("dc:description", body);
|
|
this.put("geo:lat", Double.toString(lat));
|
|
this.put("geo:long", Double.toString(lon));
|
|
}
|
|
|
|
/*
|
|
DC according to rfc 5013
|
|
|
|
* dc_title
|
|
* dc_creator
|
|
* dc_subject
|
|
* dc_description
|
|
* dc_publisher
|
|
dc_contributor
|
|
dc_date
|
|
dc_type
|
|
* dc_format
|
|
* dc_identifier
|
|
* dc_source
|
|
dc_language
|
|
dc_relation
|
|
dc_coverage
|
|
dc_rights
|
|
*/
|
|
public Date getDate() {
|
|
String d = this.get("docdatetime");
|
|
if (d == null) d = this.get("dc:date");
|
|
if (d == null) return null;
|
|
if (d.isEmpty()) return null;
|
|
try {
|
|
Date x = ISO8601Formatter.FORMATTER.parse(d);
|
|
Date now = new Date();
|
|
return x.after(now) ? now : x;
|
|
} catch (ParseException e) {
|
|
Log.logException(e);
|
|
return new Date();
|
|
}
|
|
}
|
|
|
|
public DigestURI getIdentifier(boolean useRelationAsAlternative) {
|
|
String u = this.get("url");
|
|
if (u == null) u = this.get("dc:identifier");
|
|
if (u == null) return useRelationAsAlternative ? getRelation() : null;
|
|
String[] urls = u.split(";");
|
|
if (urls.length > 1) {
|
|
// select one that fits
|
|
u = bestU(urls);
|
|
}
|
|
try {
|
|
return new DigestURI(u);
|
|
} catch (MalformedURLException e) {
|
|
if (useRelationAsAlternative) {
|
|
DigestURI relation = this.getRelation();
|
|
if (relation != null) return relation;
|
|
Log.logWarning("DCEntry", "getIdentifier: url is bad, relation also: " + e.getMessage());
|
|
}
|
|
Log.logWarning("DCEntry", "getIdentifier: url is bad: " + e.getMessage());
|
|
return null;
|
|
}
|
|
}
|
|
|
|
public DigestURI getRelation() {
|
|
String u = this.get("dc:relation");
|
|
if (u == null) return null;
|
|
String[] urls = u.split(";");
|
|
if (urls.length > 1) {
|
|
// select one that fits
|
|
u = bestU(urls);
|
|
}
|
|
try {
|
|
return new DigestURI(u);
|
|
} catch (MalformedURLException e) {
|
|
Log.logWarning("DCEntry", "getRelation: url is bad: " + e.getMessage());
|
|
return null;
|
|
}
|
|
}
|
|
|
|
private static String bestU(String[] urls) {
|
|
for (String uu: urls) {
|
|
if (uu.startsWith("http://") && (uu.endsWith(".html") || uu.endsWith(".htm") || uu.endsWith(".pdf") || uu.endsWith(".doc") || uu.endsWith(".rss") || uu.endsWith(".xml"))) return uu;
|
|
}
|
|
for (String uu: urls) {
|
|
if (uu.startsWith("http://")) return uu;
|
|
}
|
|
for (String uu: urls) {
|
|
if (uu.startsWith("ftp://")) return uu;
|
|
}
|
|
for (String uu: urls) {
|
|
//urn identifier koennen ueber den resolver der d-nb aufgeloest werden:
|
|
//http://nbn-resolving.de/urn:nbn:de:bsz:960-opus-1860
|
|
if (uu.startsWith("urn:")) return "http://nbn-resolving.de/" + uu;
|
|
}
|
|
return urls[0];
|
|
}
|
|
|
|
//modified by copperdust; Ukraine, 2012
|
|
public String getLanguage() {//final language computation
|
|
String l = this.get("dc:language");//from document metainfo
|
|
if (l == null) l = getIdentifier(true).language();//from symbolic frequency table
|
|
if (l == null) return this.get("language");//from TLD
|
|
return l;
|
|
}
|
|
|
|
public String getType() {
|
|
String t = this.get("dc:type");
|
|
if (t == null) return "";
|
|
return t;
|
|
}
|
|
|
|
public String getFormat() {
|
|
String t = this.get("dc:format");
|
|
if (t == null) return "";
|
|
return t;
|
|
}
|
|
|
|
public String getSource() {
|
|
String t = this.get("dc:source");
|
|
if (t == null) return "";
|
|
return t;
|
|
}
|
|
|
|
public String getRights() {
|
|
String t = this.get("dc:rights");
|
|
if (t == null) return "";
|
|
return t;
|
|
}
|
|
|
|
public String getTitle() {
|
|
String t = this.get("title");
|
|
if (t == null) t = this.get("dc:title");
|
|
t = stripCDATA(t);
|
|
if (t == null) return "";
|
|
return t;
|
|
}
|
|
|
|
public String getPublisher() {
|
|
String t = this.get("dc:publisher");
|
|
t = stripCDATA(t);
|
|
if (t == null) return "";
|
|
return t;
|
|
}
|
|
|
|
public String getCreator() {
|
|
String t = this.get("author");
|
|
if (t == null) t = this.get("dc:creator");
|
|
t = stripCDATA(t);
|
|
if (t == null) return "";
|
|
return t;
|
|
}
|
|
|
|
public String getDescription() {
|
|
String t = this.get("body");
|
|
if (t == null) t = this.get("dc:description");
|
|
if (t == null) t = this.get("dc:subject");
|
|
if (t == null) t = this.get("categories");
|
|
t = stripCDATA(t);
|
|
if (t == null) return "";
|
|
return t;
|
|
}
|
|
|
|
public String[] getSubject() {
|
|
String t = this.get("categories");
|
|
if (t == null) t = this.get("dc:subject");
|
|
t = stripCDATA(t);
|
|
if (t == null) return new String[]{};
|
|
return t.split(";");
|
|
}
|
|
|
|
public double getLon() {
|
|
String t = this.get("geo:long");
|
|
if (t == null) t = this.get("geo:lon");
|
|
t = stripCDATA(t);
|
|
if (t == null) return 0.0d;
|
|
return Double.parseDouble(t);
|
|
}
|
|
|
|
public double getLat() {
|
|
String t = this.get("geo:lat");
|
|
if (t == null) t = this.get("geo:lat");
|
|
t = stripCDATA(t);
|
|
if (t == null) return 0.0d;
|
|
return Double.parseDouble(t);
|
|
}
|
|
|
|
private static String stripCDATA(String s) {
|
|
if (s == null) return null;
|
|
s = s.trim();
|
|
if (s.startsWith("<![CDATA[")) s = s.substring(9);
|
|
if (s.endsWith("]]")) s = s.substring(0, s.length() - 2);
|
|
return s;
|
|
}
|
|
|
|
public Document document() {
|
|
HashSet<String> languages = new HashSet<String>();
|
|
languages.add(getLanguage());
|
|
List<String> t = new ArrayList<String>(1);
|
|
t.add(getTitle());
|
|
return new Document(
|
|
getIdentifier(true),
|
|
"text/html",
|
|
"UTF-8",
|
|
this,
|
|
languages,
|
|
getSubject(),
|
|
t,
|
|
getCreator(),
|
|
getPublisher(),
|
|
null,
|
|
"",
|
|
getLon(), getLat(),
|
|
getDescription(),
|
|
null,
|
|
null,
|
|
null,
|
|
false);
|
|
}
|
|
|
|
public void writeXML(OutputStreamWriter os) throws IOException {
|
|
Document doc = document();
|
|
if (doc != null) {
|
|
doc.writeXML(os, this.getDate());
|
|
}
|
|
}
|
|
}
|