enhanced metadata enrichment for media file type search:

- Web servers may now deliver YaCy-specific http header field with a
title and keywords. The new http header fields are:
X-YaCy-Media-Title - to be used for media (image, audio, video) titles
X-YaCy-Media-Keywords - to be used for media (image, audio, video)
keywords
- both fields are written to document fields title and keywords and are
searched also during image search.
- to make the usage of arbitrary http header fields (including this new
fields) possible in the /api/push_p.json servlet, a new POST argument is
also introduced to push http header fields. The new POST attribute is
named "responseHeader-X" (where X is the counter). It is allowed to use
this attribute as multi-attribute several times, each can be filled with
a http header line.
- see /api/push_p.html for examples
This commit is contained in:
Michael Peter Christen 2014-06-26 13:02:35 +02:00
parent 49886fab08
commit 36e623d8bf
4 changed files with 44 additions and 12 deletions

View File

@ -26,19 +26,28 @@
<dd>#[count]#</dd>
<dt>Data</dt>
<dd><input name="data-#[count]#" type="file"></dd>
<dd>data-#[count]#=<input name="data-#[count]#" type="file"></dd>
<dt>URL</dt>
<dd><input name="url-#[count]#" type="text" value="http://nowhere.cc/example.txt" size="80" maxlength="512"></dd>
<dt>Last-Modified</dt><!-- see: http://tools.ietf.org/html/rfc2616#section-14.29 -->
<dd><input name="lastModified-#[count]#" type="text" value="Tue, 15 Nov 1994 12:45:26 GMT" size="30" maxlength="40"></dd>
<dt>Content-Type</dt><!-- see: http://www.iana.org/assignments/media-types/media-types.xhtml -->
<dd><input name="contentType-#[count]#" type="text" value="text/plain" size="30" maxlength="80"></dd>
<dd>url-#[count]#=<input name="url-#[count]#" type="text" value="http://nowhere.cc/example.txt" size="80" maxlength="512"></dd>
<dt>Collection</dt>
<dd><input name="collection-#[count]#" type="text" value="push" size="30" maxlength="512"></dd>
<dd>collection-#[count]#=<input name="collection-#[count]#" type="text" value="push" size="80" maxlength="512"></dd>
<dt>Last-Modified</dt><!-- see: http://tools.ietf.org/html/rfc2616#section-14.29 -->
<!--<dd><input name="lastModified-#[count]#" type="text" value="Tue, 15 Nov 1994 12:45:26 GMT" size="30" maxlength="40"></dd>-->
<dd>responseHeader-#[count]#=<input name="responseHeader-#[count]#" type="text" value="Last-Modified:Tue, 15 Nov 1994 12:45:26 GMT" size="80" maxlength="80"></dd>
<dt>Content-Type</dt><!-- see: http://www.iana.org/assignments/media-types/media-types.xhtml -->
<!--<dd><input name="contentType-#[count]#" type="text" value="text/plain" size="30" maxlength="80"></dd>-->
<dd>responseHeader-#[count]#=<input name="responseHeader-#[count]#" type="text" value="Content-Type:text/plain" size="80" maxlength="80"></dd>
<dt></dt><dd>The following attributes are only used for media type content</dd>
<dt>Media-Title</dt>
<dd>responseHeader-#[count]#=<input name="responseHeader-#[count]#" type="text" value="X-YaCy-Media-Title:Hello Pictureworld" size="80" maxlength="200"></dd>
<dt>Media-Keywords ()</dt>
<dd>responseHeader-#[count]#=<input name="responseHeader-#[count]#" type="text" value="X-YaCy-Media-Keywords:uno dos tres cuatro cinco" size="80" maxlength="200"></dd>
</dl>
</dd>
#{/input}#

View File

@ -83,6 +83,15 @@ public class push_p {
responseHeader.put(HeaderFramework.LAST_MODIFIED, lastModified);
responseHeader.put(HeaderFramework.CONTENT_TYPE, contentType);
responseHeader.put(HeaderFramework.CONTENT_LENGTH, Long.toString(data.length));
// add generic fields
String[] responseHeaderMap = post.getParams("responseHeader-" + i); // strings with key-value pairs; separated by ':'
for (String kv: responseHeaderMap) {
int p = kv.indexOf(':');
if (p < 0) continue;
String key = kv.substring(0, p).trim();
String value = kv.substring(p + 1).trim();
responseHeader.put(key, value);
}
CrawlProfile profile = sb.crawler.getPushCrawlProfile(collection);
// create requests and artificial response

View File

@ -107,10 +107,12 @@ public class HeaderFramework extends TreeMap<String, String> implements Map<Stri
public static final String X_ROBOTS_TAG = "X-Robots-Tag"; // see http://googleblog.blogspot.com/2007/07/robots-exclusion-protocol-now-with-even.html
public static final String X_ROBOTS = "X-Robots";
public static final String X_YACY_INDEX_CONTROL = "X-YACY-Index-Control";
public static final String X_YACY_INDEX_CONTROL = "X-YaCy-Index-Control";
//public static final String X_YACY_PREVIOUS_REQUEST_LINE = "X-Previous-Request-Line";
public static final String X_YACY_KEEP_ALIVE_REQUEST_COUNT = "X-Keep-Alive-Request-Count";
public static final String X_YACY_ORIGINAL_REQUEST_LINE = "X-Original-Request-Line";
public static final String X_YACY_MEDIA_TITLE = "X-YaCy-Media-Title"; // can be attached to media files which do not have metadata; this will be used as title
public static final String X_YACY_MEDIA_KEYWORDS = "X-YaCy-Media-Keywords"; // can be attached to media files which do not have metadata; this will be used as keywords (space-separared list of words)
public static final String SET_COOKIE = "Set-Cookie";
public static final String SET_COOKIE2 = "Set-Cookie2";

View File

@ -421,9 +421,15 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
}
add(doc, CollectionSchema.collection_sxt, cs);
}
char doctype = Response.docType(responseHeader.getContentType());
List<String> titles = document.titles();
if (allAttr || contains(CollectionSchema.title)) {
if (doctype == Response.DT_IMAGE || doctype == Response.DT_AUDIO || doctype == Response.DT_MOVIE) {
String mediatitle = responseHeader.get(HeaderFramework.X_YACY_MEDIA_TITLE, "");
if (mediatitle.length() > 0) {
if (titles.size() == 0) titles.add(mediatitle); else titles.set(0, mediatitle);
}
}
add(doc, CollectionSchema.title, titles);
if ((allAttr || contains(CollectionSchema.title_exact_signature_l)) && titles.size() > 0) {
add(doc, CollectionSchema.title_exact_signature_l, EnhancedTextProfileSignature.getSignatureLong(titles.get(0)));
@ -473,7 +479,13 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
if (document.getDate().before(lastModified)) lastModified = document.getDate();
add(doc, CollectionSchema.last_modified, lastModified);
}
if (allAttr || contains(CollectionSchema.keywords)) add(doc, CollectionSchema.keywords, document.dc_subject(' '));
if (allAttr || contains(CollectionSchema.keywords)) {
String keywords = document.dc_subject(' ');
if (doctype == Response.DT_IMAGE || doctype == Response.DT_AUDIO || doctype == Response.DT_MOVIE) {
keywords = responseHeader.get(HeaderFramework.X_YACY_MEDIA_KEYWORDS, keywords);
}
add(doc, CollectionSchema.keywords, keywords);
}
if (allAttr || contains(CollectionSchema.synonyms_sxt)) {
List<String> synonyms = condenser.synonyms();
add(doc, CollectionSchema.synonyms_sxt, synonyms);