mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
enhanced metadata enrichment for media file type search:
- Web servers may now deliver YaCy-specific HTTP header fields carrying a title and keywords. The new HTTP header fields are: X-YaCy-Media-Title - to be used for media (image, audio, video) titles; X-YaCy-Media-Keywords - to be used for media (image, audio, video) keywords. - Both fields are written to the document fields title and keywords and are also searched during image search. - To make the usage of arbitrary HTTP header fields (including these new fields) possible in the /api/push_p.json servlet, a new POST argument is also introduced to push HTTP header fields. The new POST attribute is named "responseHeader-X" (where X is the counter). This attribute may be used several times as a multi-attribute; each occurrence can be filled with one HTTP header line. - See /api/push_p.html for examples.
This commit is contained in:
parent
49886fab08
commit
36e623d8bf
|
@ -26,19 +26,28 @@
|
|||
<dd>#[count]#</dd>
|
||||
|
||||
<dt>Data</dt>
|
||||
<dd><input name="data-#[count]#" type="file"></dd>
|
||||
<dd>data-#[count]#=<input name="data-#[count]#" type="file"></dd>
|
||||
|
||||
<dt>URL</dt>
|
||||
<dd><input name="url-#[count]#" type="text" value="http://nowhere.cc/example.txt" size="80" maxlength="512"></dd>
|
||||
|
||||
<dt>Last-Modified</dt><!-- see: http://tools.ietf.org/html/rfc2616#section-14.29 -->
|
||||
<dd><input name="lastModified-#[count]#" type="text" value="Tue, 15 Nov 1994 12:45:26 GMT" size="30" maxlength="40"></dd>
|
||||
|
||||
<dt>Content-Type</dt><!-- see: http://www.iana.org/assignments/media-types/media-types.xhtml -->
|
||||
<dd><input name="contentType-#[count]#" type="text" value="text/plain" size="30" maxlength="80"></dd>
|
||||
<dd>url-#[count]#=<input name="url-#[count]#" type="text" value="http://nowhere.cc/example.txt" size="80" maxlength="512"></dd>
|
||||
|
||||
<dt>Collection</dt>
|
||||
<dd><input name="collection-#[count]#" type="text" value="push" size="30" maxlength="512"></dd>
|
||||
<dd>collection-#[count]#=<input name="collection-#[count]#" type="text" value="push" size="80" maxlength="512"></dd>
|
||||
|
||||
<dt>Last-Modified</dt><!-- see: http://tools.ietf.org/html/rfc2616#section-14.29 -->
|
||||
<!--<dd><input name="lastModified-#[count]#" type="text" value="Tue, 15 Nov 1994 12:45:26 GMT" size="30" maxlength="40"></dd>-->
|
||||
<dd>responseHeader-#[count]#=<input name="responseHeader-#[count]#" type="text" value="Last-Modified:Tue, 15 Nov 1994 12:45:26 GMT" size="80" maxlength="80"></dd>
|
||||
|
||||
<dt>Content-Type</dt><!-- see: http://www.iana.org/assignments/media-types/media-types.xhtml -->
|
||||
<!--<dd><input name="contentType-#[count]#" type="text" value="text/plain" size="30" maxlength="80"></dd>-->
|
||||
<dd>responseHeader-#[count]#=<input name="responseHeader-#[count]#" type="text" value="Content-Type:text/plain" size="80" maxlength="80"></dd>
|
||||
|
||||
<dt></dt><dd>The following attributes are only used for media type content</dd>
|
||||
<dt>Media-Title</dt>
|
||||
<dd>responseHeader-#[count]#=<input name="responseHeader-#[count]#" type="text" value="X-YaCy-Media-Title:Hello Pictureworld" size="80" maxlength="200"></dd>
|
||||
|
||||
<dt>Media-Keywords ()</dt>
|
||||
<dd>responseHeader-#[count]#=<input name="responseHeader-#[count]#" type="text" value="X-YaCy-Media-Keywords:uno dos tres cuatro cinco" size="80" maxlength="200"></dd>
|
||||
</dl>
|
||||
</dd>
|
||||
#{/input}#
|
||||
|
|
|
@ -83,6 +83,15 @@ public class push_p {
|
|||
responseHeader.put(HeaderFramework.LAST_MODIFIED, lastModified);
|
||||
responseHeader.put(HeaderFramework.CONTENT_TYPE, contentType);
|
||||
responseHeader.put(HeaderFramework.CONTENT_LENGTH, Long.toString(data.length));
|
||||
// add generic fields
|
||||
String[] responseHeaderMap = post.getParams("responseHeader-" + i); // strings with key-value pairs; separated by ':'
|
||||
for (String kv: responseHeaderMap) {
|
||||
int p = kv.indexOf(':');
|
||||
if (p < 0) continue;
|
||||
String key = kv.substring(0, p).trim();
|
||||
String value = kv.substring(p + 1).trim();
|
||||
responseHeader.put(key, value);
|
||||
}
|
||||
CrawlProfile profile = sb.crawler.getPushCrawlProfile(collection);
|
||||
|
||||
// create requests and artificial response
|
||||
|
|
|
@ -107,10 +107,12 @@ public class HeaderFramework extends TreeMap<String, String> implements Map<Stri
|
|||
public static final String X_ROBOTS_TAG = "X-Robots-Tag"; // see http://googleblog.blogspot.com/2007/07/robots-exclusion-protocol-now-with-even.html
|
||||
public static final String X_ROBOTS = "X-Robots";
|
||||
|
||||
public static final String X_YACY_INDEX_CONTROL = "X-YACY-Index-Control";
|
||||
public static final String X_YACY_INDEX_CONTROL = "X-YaCy-Index-Control";
|
||||
//public static final String X_YACY_PREVIOUS_REQUEST_LINE = "X-Previous-Request-Line";
|
||||
public static final String X_YACY_KEEP_ALIVE_REQUEST_COUNT = "X-Keep-Alive-Request-Count";
|
||||
public static final String X_YACY_ORIGINAL_REQUEST_LINE = "X-Original-Request-Line";
|
||||
public static final String X_YACY_MEDIA_TITLE = "X-YaCy-Media-Title"; // can be attached to media files which do not have metadata; this will be used as title
|
||||
public static final String X_YACY_MEDIA_KEYWORDS = "X-YaCy-Media-Keywords"; // can be attached to media files which do not have metadata; this will be used as keywords (space-separated list of words)
|
||||
|
||||
public static final String SET_COOKIE = "Set-Cookie";
|
||||
public static final String SET_COOKIE2 = "Set-Cookie2";
|
||||
|
|
|
@ -421,9 +421,15 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
|||
}
|
||||
add(doc, CollectionSchema.collection_sxt, cs);
|
||||
}
|
||||
|
||||
char doctype = Response.docType(responseHeader.getContentType());
|
||||
List<String> titles = document.titles();
|
||||
if (allAttr || contains(CollectionSchema.title)) {
|
||||
if (doctype == Response.DT_IMAGE || doctype == Response.DT_AUDIO || doctype == Response.DT_MOVIE) {
|
||||
String mediatitle = responseHeader.get(HeaderFramework.X_YACY_MEDIA_TITLE, "");
|
||||
if (mediatitle.length() > 0) {
|
||||
if (titles.size() == 0) titles.add(mediatitle); else titles.set(0, mediatitle);
|
||||
}
|
||||
}
|
||||
add(doc, CollectionSchema.title, titles);
|
||||
if ((allAttr || contains(CollectionSchema.title_exact_signature_l)) && titles.size() > 0) {
|
||||
add(doc, CollectionSchema.title_exact_signature_l, EnhancedTextProfileSignature.getSignatureLong(titles.get(0)));
|
||||
|
@ -473,7 +479,13 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
|||
if (document.getDate().before(lastModified)) lastModified = document.getDate();
|
||||
add(doc, CollectionSchema.last_modified, lastModified);
|
||||
}
|
||||
if (allAttr || contains(CollectionSchema.keywords)) add(doc, CollectionSchema.keywords, document.dc_subject(' '));
|
||||
if (allAttr || contains(CollectionSchema.keywords)) {
|
||||
String keywords = document.dc_subject(' ');
|
||||
if (doctype == Response.DT_IMAGE || doctype == Response.DT_AUDIO || doctype == Response.DT_MOVIE) {
|
||||
keywords = responseHeader.get(HeaderFramework.X_YACY_MEDIA_KEYWORDS, keywords);
|
||||
}
|
||||
add(doc, CollectionSchema.keywords, keywords);
|
||||
}
|
||||
if (allAttr || contains(CollectionSchema.synonyms_sxt)) {
|
||||
List<String> synonyms = condenser.synonyms();
|
||||
add(doc, CollectionSchema.synonyms_sxt, synonyms);
|
||||
|
|
Loading…
Reference in New Issue
Block a user