enhanced metadata enrichment for media file type search:

- Web servers may now deliver YaCy-specific HTTP header fields carrying a title and keywords. The new HTTP header fields are: X-YaCy-Media-Title - to be used for media (image, audio, video) titles; X-YaCy-Media-Keywords - to be used for media (image, audio, video) keywords. - Both fields are written to the document fields title and keywords and are also searched during image search. - To make the use of arbitrary HTTP header fields (including these new fields) possible in the /api/push_p.json servlet, a new POST argument is introduced to push HTTP header fields. The new POST attribute is named "responseHeader-X" (where X is the counter). This attribute may be given several times as a multi-valued attribute; each occurrence holds one HTTP header line. - See /api/push_p.html for examples.
2024-09-19 00:01:41 +02:00 · 2014-06-26 13:02:35 +02:00 · 2014-06-26 13:02:35 +02:00 · 36e623d8bf
commit 36e623d8bf
parent 49886fab08
4 changed files with 44 additions and 12 deletions
--- a/htroot/api/push_p.html
+++ b/htroot/api/push_p.html
@ -26,19 +26,28 @@
 						<dd>#[count]#</dd>
 						
 						<dt>Data</dt>
-						<dd><input name="data-#[count]#" type="file"></dd>
+						<dd>data-#[count]#=<input name="data-#[count]#" type="file"></dd>
 						
 						<dt>URL</dt>
-						<dd><input name="url-#[count]#" type="text" value="http://nowhere.cc/example.txt" size="80" maxlength="512"></dd>
-						
-						<dt>Last-Modified</dt><!-- see: http://tools.ietf.org/html/rfc2616#section-14.29 -->
-						<dd><input name="lastModified-#[count]#" type="text" value="Tue, 15 Nov 1994 12:45:26 GMT" size="30" maxlength="40"></dd>
-						
-						<dt>Content-Type</dt><!-- see: http://www.iana.org/assignments/media-types/media-types.xhtml -->
-						<dd><input name="contentType-#[count]#" type="text" value="text/plain" size="30" maxlength="80"></dd>
+						<dd>url-#[count]#=<input name="url-#[count]#" type="text" value="http://nowhere.cc/example.txt" size="80" maxlength="512"></dd>
 						
 						<dt>Collection</dt>
-						<dd><input name="collection-#[count]#" type="text" value="push" size="30" maxlength="512"></dd>
+						<dd>collection-#[count]#=<input name="collection-#[count]#" type="text" value="push" size="80" maxlength="512"></dd>
+						
+						<dt>Last-Modified</dt><!-- see: http://tools.ietf.org/html/rfc2616#section-14.29 -->
+						<!--<dd><input name="lastModified-#[count]#" type="text" value="Tue, 15 Nov 1994 12:45:26 GMT" size="30" maxlength="40"></dd>-->
+						<dd>responseHeader-#[count]#=<input name="responseHeader-#[count]#" type="text" value="Last-Modified:Tue, 15 Nov 1994 12:45:26 GMT" size="80" maxlength="80"></dd>
+						
+						<dt>Content-Type</dt><!-- see: http://www.iana.org/assignments/media-types/media-types.xhtml -->
+						<!--<dd><input name="contentType-#[count]#" type="text" value="text/plain" size="30" maxlength="80"></dd>-->
+						<dd>responseHeader-#[count]#=<input name="responseHeader-#[count]#" type="text" value="Content-Type:text/plain" size="80" maxlength="80"></dd>
+
+						<dt></dt><dd>The following attributes are only used for media type content</dd>
+						<dt>Media-Title</dt>
+						<dd>responseHeader-#[count]#=<input name="responseHeader-#[count]#" type="text" value="X-YaCy-Media-Title:Hello Pictureworld" size="80" maxlength="200"></dd>
+						
+						<dt>Media-Keywords ()</dt>
+						<dd>responseHeader-#[count]#=<input name="responseHeader-#[count]#" type="text" value="X-YaCy-Media-Keywords:uno dos tres cuatro cinco" size="80" maxlength="200"></dd>
 					</dl>
 				</dd>
 				#{/input}#
--- a/htroot/api/push_p.java
+++ b/htroot/api/push_p.java
@ -83,6 +83,15 @@ public class push_p {
                responseHeader.put(HeaderFramework.LAST_MODIFIED, lastModified);
                responseHeader.put(HeaderFramework.CONTENT_TYPE, contentType);
                responseHeader.put(HeaderFramework.CONTENT_LENGTH, Long.toString(data.length));
+                // add generic fields
+                String[] responseHeaderMap = post.getParams("responseHeader-" + i); // strings with key-value pairs; separated by ':'
+                for (String kv: responseHeaderMap) {
+                    int p = kv.indexOf(':');
+                    if (p < 0) continue;
+                    String key = kv.substring(0, p).trim();
+                    String value = kv.substring(p + 1).trim();
+                    responseHeader.put(key, value);
+                }
                CrawlProfile profile = sb.crawler.getPushCrawlProfile(collection);
                
                // create requests and artificial response
--- a/source/net/yacy/cora/protocol/HeaderFramework.java
+++ b/source/net/yacy/cora/protocol/HeaderFramework.java
@ -107,10 +107,12 @@ public class HeaderFramework extends TreeMap<String, String> implements Map<Stri
    public static final String X_ROBOTS_TAG = "X-Robots-Tag"; // see http://googleblog.blogspot.com/2007/07/robots-exclusion-protocol-now-with-even.html
    public static final String X_ROBOTS = "X-Robots";

-    public static final String X_YACY_INDEX_CONTROL = "X-YACY-Index-Control";
+    public static final String X_YACY_INDEX_CONTROL = "X-YaCy-Index-Control";
    //public static final String X_YACY_PREVIOUS_REQUEST_LINE = "X-Previous-Request-Line";
    public static final String X_YACY_KEEP_ALIVE_REQUEST_COUNT = "X-Keep-Alive-Request-Count";
    public static final String X_YACY_ORIGINAL_REQUEST_LINE = "X-Original-Request-Line";
+    public static final String X_YACY_MEDIA_TITLE = "X-YaCy-Media-Title"; // can be attached to media files which do not have metadata; this will be used as title
+    public static final String X_YACY_MEDIA_KEYWORDS = "X-YaCy-Media-Keywords"; // can be attached to media files which do not have metadata; this will be used as keywords (space-separated list of words)

    public static final String SET_COOKIE = "Set-Cookie";
    public static final String SET_COOKIE2 = "Set-Cookie2";
--- a/source/net/yacy/search/schema/CollectionConfiguration.java
+++ b/source/net/yacy/search/schema/CollectionConfiguration.java
@ -421,9 +421,15 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
            }
            add(doc, CollectionSchema.collection_sxt, cs);
        }
-        
+        char doctype = Response.docType(responseHeader.getContentType());
        List<String> titles = document.titles();
        if (allAttr || contains(CollectionSchema.title)) {
+            if (doctype == Response.DT_IMAGE || doctype == Response.DT_AUDIO || doctype == Response.DT_MOVIE) {
+                String mediatitle = responseHeader.get(HeaderFramework.X_YACY_MEDIA_TITLE, "");
+                if (mediatitle.length() > 0) {
+                    if (titles.size() == 0) titles.add(mediatitle); else titles.set(0, mediatitle);
+                }
+            }
            add(doc, CollectionSchema.title, titles);
            if ((allAttr || contains(CollectionSchema.title_exact_signature_l)) && titles.size() > 0) {
                add(doc, CollectionSchema.title_exact_signature_l, EnhancedTextProfileSignature.getSignatureLong(titles.get(0)));
@ -473,7 +479,13 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
            if (document.getDate().before(lastModified)) lastModified = document.getDate();
            add(doc, CollectionSchema.last_modified, lastModified);
        }
-        if (allAttr || contains(CollectionSchema.keywords)) add(doc, CollectionSchema.keywords, document.dc_subject(' '));
+        if (allAttr || contains(CollectionSchema.keywords)) {
+            String keywords = document.dc_subject(' ');
+            if (doctype == Response.DT_IMAGE || doctype == Response.DT_AUDIO || doctype == Response.DT_MOVIE) {
+                keywords = responseHeader.get(HeaderFramework.X_YACY_MEDIA_KEYWORDS, keywords);
+            }
+            add(doc, CollectionSchema.keywords, keywords);
+        }
        if (allAttr || contains(CollectionSchema.synonyms_sxt)) {
            List<String> synonyms = condenser.synonyms();
            add(doc, CollectionSchema.synonyms_sxt, synonyms);