mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
alpha version of surrogate reading and indexing.
see the example file for an explanation. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5815 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
parent
870066ab35
commit
9050a3c4c5
28
examples/surrogate_dublin_core.xml
Normal file
28
examples/surrogate_dublin_core.xml
Normal file
|
@ -0,0 +1,28 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<!-- YaCy surrogate file using Dublin Core notation -->
|
||||
<!--
|
||||
This is a surrogate file which is an intermediate document description
|
||||
file for index generation. Once you have YaCy started, you can copy a file
|
||||
like this (or actually this very file) into DATA/SURROGATE/in and then the indexing
|
||||
process will read the file, store the content into the search index and move
|
||||
the file into DATA/SURROGATE/out.
|
||||
Using surrogate files and the surrogate file format you can easily create your
|
||||
own data harvesting sources for the YaCy indexer. Just write a file generator
|
||||
that generates files like this. The xml schema is very similar to that
|
||||
described in
|
||||
http://dublincore.org/documents/dc-xml-guidelines/
|
||||
using the Dublin Core metadata element set.
|
||||
-->
|
||||
|
||||
<surrogates
|
||||
xmlns:dc="http://purl.org/dc/elements/1.1/">
|
||||
|
||||
<record>
|
||||
<dc:title><![CDATA[Alan Smithee]]></dc:title>
|
||||
<dc:identifier>http://de.wikipedia.org/wiki/Alan_Smithee</dc:identifier>
|
||||
<dc:description><![CDATA[Der als Filmregisseur oft genannte '''Alan Smithee''' ist ein Anagramm von „The Alias Men“.]]></dc:description>
|
||||
<dc:language>de</dc:language>
|
||||
<dc:date>2009-03-02T11:12:36Z</dc:date> <!-- date is in ISO 8601 -->
|
||||
</record>
|
||||
|
||||
</surrogates>
|
|
@ -41,9 +41,31 @@ public class Surrogate extends HashMap<String, String> {
|
|||
public Surrogate() {
|
||||
super();
|
||||
}
|
||||
|
||||
/*
|
||||
DC according to RFC 5013
|
||||
|
||||
* dc_title
|
||||
* dc_creator
|
||||
* dc_subject
|
||||
* dc_description
|
||||
* dc_publisher
|
||||
dc_contributor
|
||||
dc_date
|
||||
dc_type
|
||||
* dc_format
|
||||
* dc_identifier
|
||||
* dc_source
|
||||
dc_language
|
||||
dc_relation
|
||||
dc_coverage
|
||||
dc_rights
|
||||
*/
|
||||
|
||||
public Date date() {
|
||||
String d = this.get("date");
|
||||
String d = this.get("dateISO8601");
|
||||
if (d == null) d = this.get("docdatetime");
|
||||
if (d == null) d = this.get("dc:date");
|
||||
if (d == null) return null;
|
||||
try {
|
||||
return DateFormatter.parseISO8601(d);
|
||||
|
@ -54,6 +76,7 @@ public class Surrogate extends HashMap<String, String> {
|
|||
}
|
||||
public yacyURL url() {
|
||||
String u = this.get("url");
|
||||
if (u == null) u = this.get("dc:identifier");
|
||||
if (u == null) return null;
|
||||
try {
|
||||
return new yacyURL(u, null);
|
||||
|
@ -64,19 +87,28 @@ public class Surrogate extends HashMap<String, String> {
|
|||
}
|
||||
public String language() {
|
||||
String l = this.get("language");
|
||||
if (l == null) l = this.get("dc:language");
|
||||
if (l == null) return "en"; else return l;
|
||||
}
|
||||
public String title() {
|
||||
String t = this.get("title");
|
||||
return stripCDATA(t);
|
||||
if (t == null) t = this.get("dc:title");
|
||||
t = stripCDATA(t);
|
||||
if (t == null) return "";
|
||||
return t;
|
||||
}
|
||||
public String body() {
|
||||
String t = this.get("body");
|
||||
return stripCDATA(t);
|
||||
if (t == null) this.get("dc:description");
|
||||
t = stripCDATA(t);
|
||||
if (t == null) return "";
|
||||
return t;
|
||||
}
|
||||
public String[] categories() {
|
||||
String t = this.get("categories");
|
||||
if (t == null) this.get("dc:subject");
|
||||
t = stripCDATA(t);
|
||||
if (t == null) return new String[]{};
|
||||
return t.split(";");
|
||||
}
|
||||
private String stripCDATA(String s) {
|
||||
|
|
|
@ -89,19 +89,23 @@ public class SurrogateReader extends DefaultHandler implements Runnable, Iterato
|
|||
}
|
||||
|
||||
public void startElement(final String uri, final String name, final String tag, final Attributes atts) throws SAXException {
|
||||
if ("document".equals(tag)) {
|
||||
if ("record".equals(tag) || "document".equals(tag)) {
|
||||
this.surrogate = new Surrogate();
|
||||
} else if ("element".equals(tag)) {
|
||||
this.elementName = atts.getValue("name");
|
||||
} else if ("value".equals(tag)) {
|
||||
this.buffer.setLength(0);
|
||||
this.parsingValue = true;
|
||||
} else if (tag.startsWith("dc:")) {
|
||||
// parse dublin core attribute
|
||||
this.elementName = tag;
|
||||
this.parsingValue = true;
|
||||
}
|
||||
}
|
||||
|
||||
public void endElement(final String uri, final String name, final String tag) {
|
||||
if (tag == null) return;
|
||||
if ("document".equals(tag)) {
|
||||
if ("record".equals(tag) || "document".equals(tag)) {
|
||||
//System.out.println("A Title: " + this.surrogate.title());
|
||||
try {
|
||||
this.surrogates.put(this.surrogate);
|
||||
|
@ -124,6 +128,13 @@ public class SurrogateReader extends DefaultHandler implements Runnable, Iterato
|
|||
}
|
||||
this.buffer.setLength(0);
|
||||
this.parsingValue = false;
|
||||
} else if (tag.startsWith("dc:")) {
|
||||
final String value = buffer.toString().trim();
|
||||
if (this.elementName != null) {
|
||||
this.surrogate.put(this.elementName, value);
|
||||
}
|
||||
this.buffer.setLength(0);
|
||||
this.parsingValue = false;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -177,6 +188,24 @@ public class SurrogateReader extends DefaultHandler implements Runnable, Iterato
|
|||
}
|
||||
/*
|
||||
Example surrogate
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<!-- YaCy surrogate file using Dublin Core notation -->
|
||||
<!-- see http://dublincore.org/documents/dc-xml-guidelines/ -->
|
||||
<surrogates
|
||||
xmlns:dc="http://purl.org/dc/elements/1.1/">
|
||||
|
||||
<record>
|
||||
<dc:title><![CDATA[Alan Smithee]]></dc:title>
|
||||
<dc:identifier>http://de.wikipedia.org/wiki/Alan_Smithee</dc:identifier>
|
||||
<dc:description><![CDATA[Der als Filmregisseur oft genannte '''Alan Smithee''' ist ein Anagramm von „The Alias Men“.]]></dc:description>
|
||||
<dc:language>de</dc:language>
|
||||
<dc:date>2009-03-02T11:12:36Z</dc:date> <!-- date is in ISO 8601 -->
|
||||
</record>
|
||||
|
||||
</surrogates>
|
||||
|
||||
|
||||
or
|
||||
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<documents>
|
||||
|
|
Loading…
Reference in New Issue
Block a user