alpha version of surrogate reading and indexing.

see the example file for an explanation.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5815 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
orbiter 2009-04-16 20:47:55 +00:00
parent 870066ab35
commit 9050a3c4c5
3 changed files with 94 additions and 5 deletions

View File

@ -0,0 +1,28 @@
<?xml version="1.0" encoding="utf-8"?>
<!-- YaCy surrogate file using Dublin Core notation -->
<!--
This is a surrogate file, an intermediate document description
file for index generation. Once you have YaCy started, you can copy a file
like this (or this actual file) into DATA/SURROGATE/in; the indexing
process will then read the file, store the content in the search index and move
the file into DATA/SURROGATE/out.
Using surrogate files and the surrogate file format you can easily create your
own data harvesting sources for the YaCy indexer. Just write a file generator
that generates files like this. The xml schema is very similar to that
described in
http://dublincore.org/documents/dc-xml-guidelines/
using the Dublin Core metadata element set.
-->
<surrogates
xmlns:dc="http://purl.org/dc/elements/1.1/">
<record>
<dc:title><![CDATA[Alan Smithee]]></dc:title>
<dc:identifier>http://de.wikipedia.org/wiki/Alan_Smithee</dc:identifier>
<dc:description><![CDATA[Der als Filmregisseur oft genannte '''Alan Smithee''' ist ein Anagramm von „The Alias Men“.]]></dc:description>
<dc:language>de</dc:language>
<dc:date>2009-03-02T11:12:36Z</dc:date> <!-- date is in ISO 8601 -->
</record>
</surrogates>

View File

@ -41,9 +41,31 @@ public class Surrogate extends HashMap<String, String> {
public Surrogate() {
super();
}
/*
DC according to rfc 5013
* dc_title
* dc_creator
* dc_subject
* dc_description
* dc_publisher
dc_contributor
dc_date
dc_type
* dc_format
* dc_identifier
* dc_source
dc_language
dc_relation
dc_coverage
dc_rights
*/
public Date date() {
String d = this.get("date");
String d = this.get("dateISO8601");
if (d == null) d = this.get("docdatetime");
if (d == null) d = this.get("dc:date");
if (d == null) return null;
try {
return DateFormatter.parseISO8601(d);
@ -54,6 +76,7 @@ public class Surrogate extends HashMap<String, String> {
}
public yacyURL url() {
String u = this.get("url");
if (u == null) u = this.get("dc:identifier");
if (u == null) return null;
try {
return new yacyURL(u, null);
@ -64,19 +87,28 @@ public class Surrogate extends HashMap<String, String> {
}
/**
 * Returns the language entry of this surrogate.
 * The "language" key is consulted first, then "dc:language".
 *
 * @return the stored language value, or "en" if neither key is present
 */
public String language() {
    final String plain = this.get("language");
    final String code = (plain != null) ? plain : this.get("dc:language");
    return (code != null) ? code : "en";
}
/**
 * Returns the title of this surrogate.
 * The "title" key is consulted first, then the Dublin Core key "dc:title".
 * The value found is passed through stripCDATA before it is returned.
 *
 * @return the title, or an empty string if stripCDATA yields null
 *         (presumably when no title is stored; confirm against stripCDATA)
 */
public String title() {
    String t = this.get("title");
    // no early return here: the Dublin Core fallback below must stay reachable
    if (t == null) t = this.get("dc:title");
    t = stripCDATA(t);
    if (t == null) return "";
    return t;
}
/**
 * Returns the body text of this surrogate.
 * The "body" key is consulted first, then the Dublin Core key
 * "dc:description". The value found is passed through stripCDATA
 * before it is returned.
 *
 * @return the body text, or an empty string if stripCDATA yields null
 *         (presumably when no body is stored; confirm against stripCDATA)
 */
public String body() {
    String t = this.get("body");
    // the fallback value must be assigned; a bare this.get(...) call discards it
    if (t == null) t = this.get("dc:description");
    t = stripCDATA(t);
    if (t == null) return "";
    return t;
}
/**
 * Returns the categories of this surrogate.
 * The "categories" key is consulted first, then the Dublin Core key
 * "dc:subject". The value found is passed through stripCDATA and then
 * split at every ';'.
 *
 * @return the category names, or an empty array if stripCDATA yields null
 *         (presumably when no categories are stored; confirm against stripCDATA)
 */
public String[] categories() {
    String t = this.get("categories");
    // the fallback value must be assigned; a bare this.get(...) call discards it
    if (t == null) t = this.get("dc:subject");
    t = stripCDATA(t);
    if (t == null) return new String[]{};
    return t.split(";");
}
private String stripCDATA(String s) {

View File

@ -89,19 +89,23 @@ public class SurrogateReader extends DefaultHandler implements Runnable, Iterato
}
public void startElement(final String uri, final String name, final String tag, final Attributes atts) throws SAXException {
if ("document".equals(tag)) {
if ("record".equals(tag) || "document".equals(tag)) {
this.surrogate = new Surrogate();
} else if ("element".equals(tag)) {
this.elementName = atts.getValue("name");
} else if ("value".equals(tag)) {
this.buffer.setLength(0);
this.parsingValue = true;
} else if (tag.startsWith("dc:")) {
// parse dublin core attribute
this.elementName = tag;
this.parsingValue = true;
}
}
public void endElement(final String uri, final String name, final String tag) {
if (tag == null) return;
if ("document".equals(tag)) {
if ("record".equals(tag) || "document".equals(tag)) {
//System.out.println("A Title: " + this.surrogate.title());
try {
this.surrogates.put(this.surrogate);
@ -124,6 +128,13 @@ public class SurrogateReader extends DefaultHandler implements Runnable, Iterato
}
this.buffer.setLength(0);
this.parsingValue = false;
} else if (tag.startsWith("dc:")) {
final String value = buffer.toString().trim();
if (this.elementName != null) {
this.surrogate.put(this.elementName, value);
}
this.buffer.setLength(0);
this.parsingValue = false;
}
}
@ -177,6 +188,24 @@ public class SurrogateReader extends DefaultHandler implements Runnable, Iterato
}
/*
Example surrogate
<?xml version="1.0" encoding="utf-8"?>
<!-- YaCy surrogate file using Dublin Core notation -->
<!-- see http://dublincore.org/documents/dc-xml-guidelines/ -->
<surrogates
xmlns:dc="http://purl.org/dc/elements/1.1/">
<record>
<dc:title><![CDATA[Alan Smithee]]></dc:title>
<dc:identifier>http://de.wikipedia.org/wiki/Alan_Smithee</dc:identifier>
<dc:description><![CDATA[Der als Filmregisseur oft genannte '''Alan Smithee''' ist ein Anagramm von „The Alias Men“.]]></dc:description>
<dc:language>de</dc:language>
<dc:date>2009-03-02T11:12:36Z</dc:date> <!-- date is in ISO 8601 -->
</record>
</surrogates>
or
<?xml version="1.0" encoding="utf-8"?>
<documents>