From 9050a3c4c547b7bcbe3b0e9505fedc961603a296 Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 16 Apr 2009 20:47:55 +0000 Subject: [PATCH] alpha version of surrogate reading and indexing. see the example file for an explanation. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5815 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- examples/surrogate_dublin_core.xml | 28 +++++++++++++++++ source/de/anomic/crawler/Surrogate.java | 38 +++++++++++++++++++++-- source/de/anomic/xml/SurrogateReader.java | 33 ++++++++++++++++++-- 3 files changed, 94 insertions(+), 5 deletions(-) create mode 100644 examples/surrogate_dublin_core.xml diff --git a/examples/surrogate_dublin_core.xml b/examples/surrogate_dublin_core.xml new file mode 100644 index 000000000..d71058f72 --- /dev/null +++ b/examples/surrogate_dublin_core.xml @@ -0,0 +1,28 @@ + + + + + + + + + http://de.wikipedia.org/wiki/Alan_Smithee + + de + 2009-03-02T11:12:36Z + + + diff --git a/source/de/anomic/crawler/Surrogate.java b/source/de/anomic/crawler/Surrogate.java index 45819928f..35ed67112 100644 --- a/source/de/anomic/crawler/Surrogate.java +++ b/source/de/anomic/crawler/Surrogate.java @@ -41,9 +41,31 @@ public class Surrogate extends HashMap { public Surrogate() { super(); } + + /* + DC according to rfc 5013 + + * dc_title + * dc_creator + * dc_subject + * dc_description + * dc_publisher + dc_contributor + dc_date + dc_type + * dc_format + * dc_identifier + * dc_source + dc_language + dc_relation + dc_coverage + dc_rights + */ + public Date date() { - String d = this.get("date"); + String d = this.get("dateISO8601"); if (d == null) d = this.get("docdatetime"); + if (d == null) d = this.get("dc:date"); if (d == null) return null; try { return DateFormatter.parseISO8601(d); @@ -54,6 +76,7 @@ public class Surrogate extends HashMap { } public yacyURL url() { String u = this.get("url"); + if (u == null) u = this.get("dc:identifier"); if (u == null) return null; try { return new yacyURL(u, null); @@ -64,19 +87,28 @@ 
public class Surrogate extends HashMap { } public String language() { String l = this.get("language"); + if (l == null) l = this.get("dc:language"); if (l == null) return "en"; else return l; } public String title() { String t = this.get("title"); - return stripCDATA(t); + if (t == null) t = this.get("dc:title"); + t = stripCDATA(t); + if (t == null) return ""; + return t; } public String body() { String t = this.get("body"); - return stripCDATA(t); + if (t == null) t = this.get("dc:description"); /* fall back to the Dublin Core description; the looked-up value must be assigned to take effect */ + t = stripCDATA(t); + if (t == null) return ""; + return t; } public String[] categories() { String t = this.get("categories"); + if (t == null) t = this.get("dc:subject"); /* fall back to the Dublin Core subject; the looked-up value must be assigned to take effect */ t = stripCDATA(t); + if (t == null) return new String[]{}; return t.split(";"); } private String stripCDATA(String s) { diff --git a/source/de/anomic/xml/SurrogateReader.java b/source/de/anomic/xml/SurrogateReader.java index 09d86c48c..833feae05 100644 --- a/source/de/anomic/xml/SurrogateReader.java +++ b/source/de/anomic/xml/SurrogateReader.java @@ -89,19 +89,23 @@ public class SurrogateReader extends DefaultHandler implements Runnable, Iterato } public void startElement(final String uri, final String name, final String tag, final Attributes atts) throws SAXException { - if ("document".equals(tag)) { + if ("record".equals(tag) || "document".equals(tag)) { this.surrogate = new Surrogate(); } else if ("element".equals(tag)) { this.elementName = atts.getValue("name"); } else if ("value".equals(tag)) { this.buffer.setLength(0); this.parsingValue = true; + } else if (tag.startsWith("dc:")) { + // parse dublin core attribute + this.elementName = tag; + this.parsingValue = true; } } public void endElement(final String uri, final String name, final String tag) { if (tag == null) return; - if ("document".equals(tag)) { + if ("record".equals(tag) || "document".equals(tag)) { //System.out.println("A Title: " + this.surrogate.title()); try { this.surrogates.put(this.surrogate); @@ -124,6 +128,13 @@ public class
SurrogateReader extends DefaultHandler implements Runnable, Iterato } this.buffer.setLength(0); this.parsingValue = false; + } else if (tag.startsWith("dc:")) { + final String value = buffer.toString().trim(); + if (this.elementName != null) { + this.surrogate.put(this.elementName, value); + } + this.buffer.setLength(0); + this.parsingValue = false; } } @@ -177,6 +188,24 @@ public class SurrogateReader extends DefaultHandler implements Runnable, Iterato } /* Example surrogate + + + + + + + + http://de.wikipedia.org/wiki/Alan_Smithee + + de + 2009-03-02T11:12:36Z + + + + + +or