mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
replaced the single-text description solr field with a multi-value
description_txt text field
This commit is contained in:
parent
4c242f9af9
commit
cf12835f20
|
@ -123,8 +123,8 @@ coordinate_p
|
|||
## content of author-tag, texgen
|
||||
author
|
||||
|
||||
## content of description-tag, text
|
||||
description
|
||||
## content of description-tag(s), text
|
||||
description_txt
|
||||
|
||||
## the 64 bit hash of the org.apache.solr.update.processor.Lookup3Signature of description, used to compute description_unique_b
|
||||
#description_exact_signature_l
|
||||
|
|
|
@ -324,7 +324,7 @@ public class Load_RSS_p {
|
|||
if (author == null || author.isEmpty()) author = channel == null ? "" : channel.getCopyright();
|
||||
Date pubDate = channel == null ? null : channel.getPubDate();
|
||||
prop.putHTML("showitems_author", author == null ? "" : author);
|
||||
prop.putHTML("showitems_description", channel == null ? "" : channel.getDescription());
|
||||
prop.putHTML("showitems_description", channel == null ? "" : channel.getDescriptions().toString());
|
||||
prop.putHTML("showitems_language", channel == null ? "" : channel.getLanguage());
|
||||
prop.putHTML("showitems_date", (pubDate == null) ? "" : DateFormat.getDateTimeInstance().format(pubDate));
|
||||
prop.putHTML("showitems_ttl", channel == null ? "" : channel.getTTL());
|
||||
|
@ -355,7 +355,7 @@ public class Load_RSS_p {
|
|||
prop.putHTML("showitems_item_" + i + "_author", author == null ? "" : author);
|
||||
prop.putHTML("showitems_item_" + i + "_title", item.getTitle());
|
||||
prop.putHTML("showitems_item_" + i + "_link", messageurl.toNormalform(true));
|
||||
prop.putHTML("showitems_item_" + i + "_description", item.getDescription());
|
||||
prop.putHTML("showitems_item_" + i + "_description", item.getDescriptions().toString());
|
||||
prop.putHTML("showitems_item_" + i + "_language", item.getLanguage());
|
||||
prop.putHTML("showitems_item_" + i + "_date", (pubDate == null) ? "" : DateFormat.getDateTimeInstance().format(pubDate));
|
||||
i++;
|
||||
|
|
|
@ -60,9 +60,10 @@ public class feed {
|
|||
if (feed == null || feed.isEmpty()) continue channelIteration;
|
||||
|
||||
RSSMessage message = feed.getChannel();
|
||||
// getChannel() may return null (see the null check just below), so guard before dereferencing;
// fall back to an empty description when the channel has none.
String description = (message == null || message.getDescriptions().isEmpty()) ? "" : message.getDescriptions().get(0);
|
||||
if (message != null) {
|
||||
prop.putXML("channel_title", message.getTitle());
|
||||
prop.putXML("channel_description", message.getDescription());
|
||||
prop.putXML("channel_description", description);
|
||||
prop.put("channel_pubDate", message.getPubDate());
|
||||
}
|
||||
while (messageMaxCount > 0 && !feed.isEmpty()) {
|
||||
|
@ -71,7 +72,7 @@ public class feed {
|
|||
|
||||
// create RSS entry
|
||||
prop.putXML("item_" + messageCount + "_title", channelName + ": " + message.getTitle());
|
||||
prop.putXML("item_" + messageCount + "_description", message.getDescription());
|
||||
// use the description of the current item, not the channel-level description computed before the loop
prop.putXML("item_" + messageCount + "_description", message.getDescriptions().size() > 0 ? message.getDescriptions().get(0) : "");
|
||||
prop.putXML("item_" + messageCount + "_link", message.getLink());
|
||||
prop.put("item_" + messageCount + "_pubDate", message.getPubDate());
|
||||
prop.putXML("item_" + messageCount + "_guid", message.getGuid());
|
||||
|
|
|
@ -121,7 +121,7 @@ public class getpageinfo {
|
|||
}
|
||||
prop.put("tags", count);
|
||||
// put description
|
||||
prop.putXML("desc", removelinebreaks(scraper.dc_description()));
|
||||
prop.putXML("desc", removelinebreaks(scraper.dc_description().length > 0 ? scraper.dc_description()[0] : ""));
|
||||
// put language
|
||||
final Set<String> languages = scraper.getContentLanguages();
|
||||
prop.putXML("lang", (languages == null || languages.size() == 0) ? "unknown" : languages.iterator().next());
|
||||
|
|
|
@ -121,7 +121,7 @@ public class getpageinfo_p {
|
|||
}
|
||||
prop.put("tags", count);
|
||||
// put description
|
||||
prop.putXML("desc", scraper.dc_description());
|
||||
prop.putXML("desc", scraper.dc_description().length > 0 ? scraper.dc_description()[0] : "");
|
||||
// put language
|
||||
final Set<String> languages = scraper.getContentLanguages();
|
||||
prop.putXML("lang", (languages == null || languages.size() == 0) ? "unknown" : languages.iterator().next());
|
||||
|
|
|
@ -155,14 +155,14 @@ public class searchresult {
|
|||
CollectionSchema.id.getSolrFieldName() + ',' +
|
||||
CollectionSchema.sku.getSolrFieldName() + ',' +
|
||||
CollectionSchema.title.getSolrFieldName() + ',' +
|
||||
CollectionSchema.description.getSolrFieldName() + ',' +
|
||||
CollectionSchema.description_txt.getSolrFieldName() + ',' +
|
||||
CollectionSchema.load_date_dt.getSolrFieldName() + ',' +
|
||||
CollectionSchema.last_modified.getSolrFieldName() + ',' +
|
||||
CollectionSchema.size_i.getSolrFieldName());
|
||||
post.put("hl", "true");
|
||||
post.put("hl.q", originalQuery);
|
||||
post.put("hl.fl", CollectionSchema.h1_txt.getSolrFieldName() + "," + CollectionSchema.h2_txt.getSolrFieldName() + "," + CollectionSchema.text_t.getSolrFieldName());
|
||||
post.put("hl.alternateField", CollectionSchema.description.getSolrFieldName());
|
||||
post.put("hl.alternateField", CollectionSchema.description_txt.getSolrFieldName());
|
||||
post.put("hl.simple.pre", "<b>");
|
||||
post.put("hl.simple.post", "</b>");
|
||||
post.put("hl.fragsize", Integer.toString(SearchEvent.SNIPPET_MAX_LENGTH));
|
||||
|
|
|
@ -202,7 +202,7 @@ public class select {
|
|||
// add options for snippet generation
|
||||
if (!post.containsKey("hl.q")) post.put("hl.q", q);
|
||||
if (!post.containsKey("hl.fl")) post.put("hl.fl", CollectionSchema.h1_txt.getSolrFieldName() + "," + CollectionSchema.h2_txt.getSolrFieldName() + "," + CollectionSchema.text_t.getSolrFieldName());
|
||||
if (!post.containsKey("hl.alternateField")) post.put("hl.alternateField", CollectionSchema.description.getSolrFieldName());
|
||||
if (!post.containsKey("hl.alternateField")) post.put("hl.alternateField", CollectionSchema.description_txt.getSolrFieldName());
|
||||
if (!post.containsKey("hl.simple.pre")) post.put("hl.simple.pre", "<b>");
|
||||
if (!post.containsKey("hl.simple.post")) post.put("hl.simple.post", "</b>");
|
||||
if (!post.containsKey("hl.fragsize")) post.put("hl.fragsize", Integer.toString(SearchEvent.SNIPPET_MAX_LENGTH));
|
||||
|
|
|
@ -104,7 +104,7 @@ public class yacysearch_location {
|
|||
prop.put("kml_placemark_" + placemarkCounter + "_author", message.getAuthor());
|
||||
prop.put("kml_placemark_" + placemarkCounter + "_copyright", message.getCopyright());
|
||||
prop.put("kml_placemark_" + placemarkCounter + "_subject", message.getSubject());
|
||||
prop.put("kml_placemark_" + placemarkCounter + "_description", message.getDescription());
|
||||
prop.put("kml_placemark_" + placemarkCounter + "_description", message.getDescriptions().size() > 0 ? message.getDescriptions().get(0) : "");
|
||||
prop.put("kml_placemark_" + placemarkCounter + "_date", message.getPubDate());
|
||||
prop.putXML("kml_placemark_" + placemarkCounter + "_url", message.getLink());
|
||||
prop.put("kml_placemark_" + placemarkCounter + "_pointname", message.getTitle());
|
||||
|
|
|
@ -27,6 +27,7 @@
|
|||
package net.yacy.cora.document;
|
||||
|
||||
import java.util.Date;
|
||||
import java.util.List;
|
||||
|
||||
public interface Hit {
|
||||
|
||||
|
@ -70,7 +71,7 @@ public interface Hit {
|
|||
|
||||
public String getLanguage();
|
||||
|
||||
public String getDescription();
|
||||
public List<String> getDescriptions();
|
||||
|
||||
public Date getPubDate();
|
||||
|
||||
|
|
|
@ -25,11 +25,13 @@
|
|||
package net.yacy.cora.document;
|
||||
|
||||
import java.text.ParseException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Comparator;
|
||||
import java.util.Date;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
|
@ -158,8 +160,11 @@ public class RSSMessage implements Hit, Comparable<RSSMessage>, Comparator<RSSMe
|
|||
}
|
||||
|
||||
@Override
|
||||
public String getDescription() {
|
||||
return Token.description.valueFrom(this.map, "");
|
||||
public List<String> getDescriptions() {
|
||||
List<String> ds = new ArrayList<String>();
|
||||
String d = Token.description.valueFrom(this.map, "");
|
||||
if (d.length() > 0) ds.add(d);
|
||||
return ds;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -216,7 +221,7 @@ public class RSSMessage implements Hit, Comparable<RSSMessage>, Comparator<RSSMe
|
|||
String guid = Token.guid.valueFrom(this.map, "");
|
||||
if ((guid.isEmpty() || guid.startsWith(artificialGuidPrefix)) &&
|
||||
(this.map.containsKey("title") || this.map.containsKey("description") || this.map.containsKey("link"))) {
|
||||
guid = calculatedGuidPrefix + Integer.toHexString(getTitle().hashCode() + getDescription().hashCode() + getLink().hashCode());
|
||||
guid = calculatedGuidPrefix + Integer.toHexString(getTitle().hashCode() + getDescriptions().hashCode() + getLink().hashCode());
|
||||
this.map.put("guid", guid);
|
||||
}
|
||||
return guid;
|
||||
|
|
|
@ -169,13 +169,19 @@ public class EnhancedTextProfileSignature extends Lookup3Signature {
|
|||
return t2.cnt - t1.cnt;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public static long getSignatureLong(String text) {
|
||||
Lookup3Signature sig = new Lookup3Signature();
|
||||
sig.add(text);
|
||||
return getSignatureLong(sig);
|
||||
}
|
||||
|
||||
public static long getSignatureLong(String[] texts) {
|
||||
Lookup3Signature sig = new Lookup3Signature();
|
||||
for (String s: texts) sig.add(s);
|
||||
return getSignatureLong(sig);
|
||||
}
|
||||
|
||||
public static long getSignatureLong(Lookup3Signature sig) {
|
||||
byte[] hash = sig.getSignature();
|
||||
long l = 0;
|
||||
|
|
|
@ -22,6 +22,7 @@ package net.yacy.cora.federate.solr.responsewriter;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.Writer;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Date;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
|
@ -84,7 +85,7 @@ public class GSAResponseWriter implements QueryResponseWriter {
|
|||
|
||||
// pre-select a set of YaCy schema fields for the solr searcher which should cause a better caching
|
||||
private static final CollectionSchema[] extrafields = new CollectionSchema[]{
|
||||
CollectionSchema.id, CollectionSchema.sku, CollectionSchema.title, CollectionSchema.description,
|
||||
CollectionSchema.id, CollectionSchema.sku, CollectionSchema.title, CollectionSchema.description_txt,
|
||||
CollectionSchema.last_modified, CollectionSchema.load_date_dt, CollectionSchema.size_i, CollectionSchema.language_s
|
||||
};
|
||||
|
||||
|
@ -235,7 +236,7 @@ public class GSAResponseWriter implements QueryResponseWriter {
|
|||
// write the R header for a search result
|
||||
writer.write("<R N=\"" + (resHead.offset + i + 1) + "\"" + (i == 1 ? " L=\"2\"" : "") + (mime != null && mime.length() > 0 ? " MIME=\"" + mime + "\"" : "") + ">"); writer.write(lb);
|
||||
//List<String> texts = new ArrayList<String>();
|
||||
String description = "";
|
||||
List<String> descriptions = new ArrayList<String>();
|
||||
int size = 0;
|
||||
boolean title_written = false; // the solr index may contain several; we take only the first which should be the visible tag in <title></title>
|
||||
for (IndexableField value: fields) {
|
||||
|
@ -264,9 +265,9 @@ public class GSAResponseWriter implements QueryResponseWriter {
|
|||
title_written = true;
|
||||
continue;
|
||||
}
|
||||
if (CollectionSchema.description.getSolrFieldName().equals(fieldName)) {
|
||||
description = value.stringValue();
|
||||
//texts.add(description);
|
||||
if (CollectionSchema.description_txt.getSolrFieldName().equals(fieldName)) {
|
||||
descriptions.add(value.stringValue());
|
||||
//texts.adds(description);
|
||||
continue;
|
||||
}
|
||||
if (CollectionSchema.last_modified.getSolrFieldName().equals(fieldName)) {
|
||||
|
@ -290,8 +291,8 @@ public class GSAResponseWriter implements QueryResponseWriter {
|
|||
}
|
||||
// compute snippet from texts
|
||||
List<String> snippet = urlhash == null ? null : snippets.get(urlhash);
|
||||
OpensearchResponseWriter.solitaireTag(writer, GSAToken.S.name(), snippet == null || snippet.size() == 0 ? description : snippet.get(0));
|
||||
OpensearchResponseWriter.solitaireTag(writer, GSAToken.GD.name(), description);
|
||||
// without a snippet, fall back to the first description; if there is none either, write an empty
// string (snippet is null or empty in this branch, so it must not be dereferenced)
OpensearchResponseWriter.solitaireTag(writer, GSAToken.S.name(), snippet == null || snippet.size() == 0 ? (descriptions.size() > 0 ? descriptions.get(0) : "") : snippet.get(0));
|
||||
OpensearchResponseWriter.solitaireTag(writer, GSAToken.GD.name(), descriptions.size() > 0 ? descriptions.get(0) : "");
|
||||
writer.write("<HAS><L/><C SZ=\""); writer.write(Integer.toString(size / 1024)); writer.write("k\" CID=\""); writer.write(urlhash); writer.write("\" ENC=\"UTF-8\"/></HAS>");
|
||||
if (YaCyVer == null) YaCyVer = yacyVersion.thisVersion().getName() + "/" + Switchboard.getSwitchboard().peers.mySeed().hash;
|
||||
OpensearchResponseWriter.solitaireTag(writer, GSAToken.ENT_SOURCE.name(), YaCyVer);
|
||||
|
|
|
@ -59,7 +59,7 @@ public class OpensearchResponseWriter implements QueryResponseWriter {
|
|||
|
||||
// pre-select a set of YaCy schema fields for the solr searcher which should cause a better caching
|
||||
private static final CollectionSchema[] extrafields = new CollectionSchema[]{
|
||||
CollectionSchema.id, CollectionSchema.title, CollectionSchema.description, CollectionSchema.text_t,
|
||||
CollectionSchema.id, CollectionSchema.title, CollectionSchema.description_txt, CollectionSchema.text_t,
|
||||
CollectionSchema.h1_txt, CollectionSchema.h2_txt, CollectionSchema.h3_txt, CollectionSchema.h4_txt, CollectionSchema.h5_txt, CollectionSchema.h6_txt,
|
||||
};
|
||||
static final Set<String> SOLR_FIELDS = new HashSet<String>();
|
||||
|
@ -163,7 +163,8 @@ public class OpensearchResponseWriter implements QueryResponseWriter {
|
|||
List<IndexableField> fields = doc.getFields();
|
||||
int fieldc = fields.size();
|
||||
List<String> texts = new ArrayList<String>();
|
||||
String description = "", title = "";
|
||||
List<String> descriptions = new ArrayList<String>();
|
||||
String title = "";
|
||||
for (int j = 0; j < fieldc; j++) {
|
||||
IndexableField value = fields.get(j);
|
||||
String fieldName = value.name();
|
||||
|
@ -204,8 +205,9 @@ public class OpensearchResponseWriter implements QueryResponseWriter {
|
|||
solitaireTag(writer, RSSMessage.Token.pubDate.name(), HeaderFramework.formatRFC1123(d));
|
||||
continue;
|
||||
}
|
||||
if (CollectionSchema.description.getSolrFieldName().equals(fieldName)) {
|
||||
description = value.stringValue();
|
||||
if (CollectionSchema.description_txt.getSolrFieldName().equals(fieldName)) {
|
||||
String description = value.stringValue();
|
||||
descriptions.add(description);
|
||||
solitaireTag(writer, DublinCore.Description.getURIref(), description);
|
||||
texts.add(description);
|
||||
continue;
|
||||
|
@ -233,10 +235,17 @@ public class OpensearchResponseWriter implements QueryResponseWriter {
|
|||
solitaireTag(writer, RSSMessage.Token.title.name(), title.length() == 0 ? (texts.size() == 0 ? "" : texts.get(0)) : title);
|
||||
List<String> snippet = urlhash == null ? null : snippets.get(urlhash);
|
||||
String tagname = RSSMessage.Token.description.name();
|
||||
writer.write("<"); writer.write(tagname); writer.write('>');
|
||||
XML.escapeCharData(snippet == null || snippet.size() == 0 ? description : snippet.get(0), writer);
|
||||
writer.write("</"); writer.write(tagname); writer.write(">\n");
|
||||
|
||||
if (snippet == null || snippet.size() == 0) {
|
||||
for (String d: descriptions) {
|
||||
writer.write("<"); writer.write(tagname); writer.write('>');
|
||||
XML.escapeCharData(snippet == null || snippet.size() == 0 ? d : snippet.get(0), writer);
|
||||
writer.write("</"); writer.write(tagname); writer.write(">\n");
|
||||
}
|
||||
} else {
|
||||
writer.write("<"); writer.write(tagname); writer.write('>');
|
||||
XML.escapeCharData(snippet.get(0), writer);
|
||||
writer.write("</"); writer.write(tagname); writer.write(">\n");
|
||||
}
|
||||
// open: where do we get the subject?
|
||||
//solitaireTag(writer, DublinCore.Subject.getURIref(), ""); // TODO: fill with actual data
|
||||
|
||||
|
|
|
@ -136,7 +136,8 @@ public class YJsonResponseWriter implements QueryResponseWriter {
|
|||
List<String> texts = new ArrayList<String>();
|
||||
MultiProtocolURI url = null;
|
||||
String urlhash = null;
|
||||
String description = "", title = "";
|
||||
List<String> descriptions = new ArrayList<String>();
|
||||
String title = "";
|
||||
StringBuilder path = new StringBuilder(80);
|
||||
for (int j = 0; j < fieldc; j++) {
|
||||
IndexableField value = fields.get(j);
|
||||
|
@ -166,8 +167,9 @@ public class YJsonResponseWriter implements QueryResponseWriter {
|
|||
texts.add(title);
|
||||
continue;
|
||||
}
|
||||
if (CollectionSchema.description.getSolrFieldName().equals(fieldName)) {
|
||||
description = value.stringValue();
|
||||
if (CollectionSchema.description_txt.getSolrFieldName().equals(fieldName)) {
|
||||
String description = value.stringValue();
|
||||
descriptions.add(description);
|
||||
texts.add(description);
|
||||
continue;
|
||||
}
|
||||
|
@ -212,7 +214,7 @@ public class YJsonResponseWriter implements QueryResponseWriter {
|
|||
solitaireTag(writer, "path", path.toString());
|
||||
solitaireTag(writer, "title", title.length() == 0 ? (texts.size() == 0 ? path.toString() : texts.get(0)) : title);
|
||||
List<String> snippet = urlhash == null ? null : snippets.get(urlhash);
|
||||
writer.write("\"description\":\""); writer.write(serverObjects.toJSON(snippet == null || snippet.size() == 0 ? description : snippet.get(0))); writer.write("\"\n}\n");
|
||||
writer.write("\"description\":\""); writer.write(serverObjects.toJSON(snippet == null || snippet.size() == 0 ? (descriptions.size() > 0 ? descriptions.get(0) : "") : snippet.get(0))); writer.write("\"\n}\n");
|
||||
if (i < responseCount - 1) {
|
||||
writer.write(",\n".toCharArray());
|
||||
}
|
||||
|
|
|
@ -527,7 +527,7 @@ public class CrawlQueues {
|
|||
ASCII.getBytes(hash),
|
||||
url,
|
||||
(referrer == null) ? null : referrer.hash(),
|
||||
item.getDescription(),
|
||||
item.getDescriptions().size() > 0 ? item.getDescriptions().get(0) : "",
|
||||
loaddate,
|
||||
this.sb.crawler.defaultRemoteProfile.handle(),
|
||||
0,
|
||||
|
|
|
@ -90,11 +90,11 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
|
|||
final Map<String, Word> words = new Condenser(document, true, true, LibraryProvider.dymLib, LibraryProvider.synonyms, false).words();
|
||||
|
||||
// generate potential tags from document title, description and subject
|
||||
final int bufferSize = document.dc_title().length() + document.dc_description().length() + document.dc_subject(' ').length() + 32;
|
||||
final int bufferSize = document.dc_title().length() + document.dc_description().length + document.dc_subject(' ').length() + 32;
|
||||
final StringBuilder buffer = new StringBuilder(bufferSize);
|
||||
final StringBuilder pwords = new StringBuilder(1000);
|
||||
buffer.append(document.dc_title().toLowerCase());
|
||||
buffer.append(document.dc_description().toLowerCase());
|
||||
for (String s:document.dc_description()) buffer.append(s.toLowerCase());
|
||||
buffer.append(document.dc_subject(' ').toLowerCase());
|
||||
final WordTokenizer tokens = new WordTokenizer(new SentenceReader(buffer.toString()), LibraryProvider.dymLib);
|
||||
try {
|
||||
|
|
|
@ -166,7 +166,7 @@ public class YMarkEntry extends TreeMap<String, String> {
|
|||
public YMarkEntry(final DCEntry dc) {
|
||||
super();
|
||||
for (BOOKMARK b : BOOKMARK.values()) {
|
||||
if(dc.containsKey(b.dc_attrb)) {
|
||||
if (dc.getMap().containsKey(b.dc_attrb)) {
|
||||
this.put(b.key(), dc.get(b.dc_attrb));
|
||||
}
|
||||
}
|
||||
|
@ -218,7 +218,7 @@ public class YMarkEntry extends TreeMap<String, String> {
|
|||
final DCEntry dc = new DCEntry();
|
||||
for (BOOKMARK b : BOOKMARK.values()) {
|
||||
if(!b.dc_attrb.isEmpty() && this.containsKey(b.key())) {
|
||||
dc.put(b.dc_attrb, this.get(b.key()));
|
||||
dc.getMap().put(b.dc_attrb, new String[]{this.get(b.key())});
|
||||
}
|
||||
}
|
||||
return dc;
|
||||
|
|
|
@ -132,7 +132,7 @@ public class YMarkMetadata {
|
|||
metadata.put(METADATA.CREATOR, this.document.dc_creator());
|
||||
metadata.put(METADATA.KEYWORDS, this.document.dc_subject(' '));
|
||||
metadata.put(METADATA.PUBLISHER, this.document.dc_publisher());
|
||||
metadata.put(METADATA.DESCRIPTION, this.document.dc_description());
|
||||
metadata.put(METADATA.DESCRIPTION, this.document.dc_description().length > 0 ? this.document.dc_description()[0] : "");
|
||||
metadata.put(METADATA.MIMETYPE, this.document.dc_format());
|
||||
metadata.put(METADATA.LANGUAGE, this.document.dc_language());
|
||||
metadata.put(METADATA.CHARSET, this.document.getCharset());
|
||||
|
|
|
@ -129,7 +129,9 @@ public final class Condenser {
|
|||
// phrase 99 is taken from the media Link url and anchor description
|
||||
// phrase 100 and above are lines from the text
|
||||
insertTextToWords(new SentenceReader(document.dc_title()), 1, WordReferenceRow.flag_app_dc_title, this.RESULT_FLAGS, true, meaningLib);
|
||||
insertTextToWords(new SentenceReader(document.dc_description()), 3, WordReferenceRow.flag_app_dc_description, this.RESULT_FLAGS, true, meaningLib);
|
||||
for (String description: document.dc_description()) {
|
||||
insertTextToWords(new SentenceReader(description), 3, WordReferenceRow.flag_app_dc_description, this.RESULT_FLAGS, true, meaningLib);
|
||||
}
|
||||
insertTextToWords(new SentenceReader(document.dc_creator()), 4, WordReferenceRow.flag_app_dc_creator, this.RESULT_FLAGS, true, meaningLib);
|
||||
insertTextToWords(new SentenceReader(document.dc_publisher()), 5, WordReferenceRow.flag_app_dc_creator, this.RESULT_FLAGS, true, meaningLib);
|
||||
insertTextToWords(new SentenceReader(document.dc_subject(' ')), 6, WordReferenceRow.flag_app_dc_description, this.RESULT_FLAGS, true, meaningLib);
|
||||
|
|
|
@ -75,8 +75,8 @@ public class Document {
|
|||
private List<String> titles; // the document titles, taken from title and/or h1 tag; shall appear as headline of search result
|
||||
private final StringBuilder creator; // author or copyright
|
||||
private final String publisher; // publisher
|
||||
private final List<String> sections; // if present: more titles/headlines appearing in the document
|
||||
private final StringBuilder description; // an abstract, if present: short content description
|
||||
private final List<String> sections; // if present: more titles/headlines appearing in the document
|
||||
private final List<String> descriptions; // an abstract, if present: short content description
|
||||
private Object text; // the clear text, all that is visible
|
||||
private final Map<DigestURI, Properties> anchors; // all links embedded as clickeable entities (anchor tags)
|
||||
private final Map<DigestURI, String> rss; // all embedded rss feeds
|
||||
|
@ -101,7 +101,7 @@ public class Document {
|
|||
final String[] keywords,
|
||||
final List<String> titles,
|
||||
final String author, final String publisher,
|
||||
final String[] sections, final String abstrct,
|
||||
final String[] sections, final List<String> abstrcts,
|
||||
final double lon, final double lat,
|
||||
final Object text,
|
||||
final Map<DigestURI, Properties> anchors,
|
||||
|
@ -118,7 +118,7 @@ public class Document {
|
|||
this.creator = (author == null) ? new StringBuilder(0) : new StringBuilder(author);
|
||||
this.sections = new LinkedList<String>() ;
|
||||
if (sections != null) this.sections.addAll(Arrays.asList(sections));
|
||||
this.description = (abstrct == null) ? new StringBuilder(0) : new StringBuilder(abstrct);
|
||||
// defensive copy: this list is appended to when sub-documents are merged in, so it must be
// mutable and must not alias the list handed in by the caller
this.descriptions = (abstrcts == null) ? new ArrayList<String>() : new ArrayList<String>(abstrcts);
|
||||
if (lat >= -90.0d && lat <= 90.0d && lon >= -180.0d && lon <= 180.0d) {
|
||||
this.lon = lon;
|
||||
this.lat = lat;
|
||||
|
@ -288,10 +288,9 @@ dc_rights
|
|||
return sb.substring(0, sb.length() - 1);
|
||||
}
|
||||
|
||||
public String dc_description() {
|
||||
if (this.description == null)
|
||||
return dc_title();
|
||||
return this.description.toString();
|
||||
public String[] dc_description() {
|
||||
if (descriptions == null) return new String[0];
|
||||
return this.descriptions.toArray(new String[this.descriptions.size()]);
|
||||
}
|
||||
|
||||
public String dc_publisher() {
|
||||
|
@ -646,9 +645,7 @@ dc_rights
|
|||
this.sections.addAll(doc.sections);
|
||||
this.titles.addAll(doc.titles());
|
||||
this.keywords.addAll(doc.getKeywords());
|
||||
|
||||
if (this.description.length() > 0) this.description.append('\n');
|
||||
this.description.append(doc.dc_description());
|
||||
for (String d: doc.dc_description()) this.descriptions.add(d);
|
||||
|
||||
if (!(this.text instanceof ByteArrayOutputStream)) {
|
||||
this.text = new ByteArrayOutputStream();
|
||||
|
@ -779,7 +776,7 @@ dc_rights
|
|||
final StringBuilder authors = new StringBuilder(80);
|
||||
final StringBuilder publishers = new StringBuilder(80);
|
||||
final StringBuilder subjects = new StringBuilder(80);
|
||||
final StringBuilder description = new StringBuilder(80);
|
||||
final List<String> descriptions = new ArrayList<String>();
|
||||
final Collection<String> titles = new LinkedHashSet<String>();
|
||||
final Collection<String> sectionTitles = new LinkedHashSet<String>();
|
||||
final Map<DigestURI, Properties> anchors = new HashMap<DigestURI, Properties>();
|
||||
|
@ -810,9 +807,7 @@ dc_rights
|
|||
|
||||
titles.addAll(doc.titles());
|
||||
sectionTitles.addAll(Arrays.asList(doc.getSectionTitles()));
|
||||
|
||||
if (description.length() > 0) description.append("\n");
|
||||
description.append(doc.dc_description());
|
||||
for (String d: doc.dc_description()) descriptions.add(d);
|
||||
|
||||
if (doc.getTextLength() > 0) {
|
||||
if (docTextLength > 0) content.write('\n');
|
||||
|
@ -851,7 +846,7 @@ dc_rights
|
|||
authors.toString(),
|
||||
publishers.toString(),
|
||||
sectionTitles.toArray(new String[sectionTitles.size()]),
|
||||
description.toString(),
|
||||
descriptions,
|
||||
lon, lat,
|
||||
content.getBytes(),
|
||||
anchors,
|
||||
|
|
|
@ -37,12 +37,14 @@ import java.util.List;
|
|||
import java.util.Locale;
|
||||
import java.util.TreeMap;
|
||||
|
||||
import org.apache.solr.common.params.MultiMapSolrParams;
|
||||
|
||||
import net.yacy.cora.date.ISO8601Formatter;
|
||||
import net.yacy.cora.util.ConcurrentLog;
|
||||
import net.yacy.document.Document;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
|
||||
public class DCEntry extends TreeMap<String, String> {
|
||||
public class DCEntry extends MultiMapSolrParams {
|
||||
|
||||
private static final long serialVersionUID = -2050291583515701559L;
|
||||
|
||||
|
@ -55,7 +57,7 @@ public class DCEntry extends TreeMap<String, String> {
|
|||
public static final DCEntry poison = new DCEntry();
|
||||
|
||||
public DCEntry() {
|
||||
super((Collator) insensitiveCollator.clone());
|
||||
super(new TreeMap<String, String[]>((Collator) insensitiveCollator.clone()));
|
||||
}
|
||||
|
||||
public DCEntry(
|
||||
|
@ -67,14 +69,14 @@ public class DCEntry extends TreeMap<String, String> {
|
|||
double lat,
|
||||
double lon
|
||||
) {
|
||||
super((Collator) insensitiveCollator.clone());
|
||||
this.put("dc:identifier", url.toNormalform(true));
|
||||
this.put("dc:date", ISO8601Formatter.FORMATTER.format(date));
|
||||
this.put("dc:title", title);
|
||||
this.put("dc:creator", author);
|
||||
this.put("dc:description", body);
|
||||
this.put("geo:lat", Double.toString(lat));
|
||||
this.put("geo:long", Double.toString(lon));
|
||||
super(new TreeMap<String, String[]>((Collator) insensitiveCollator.clone()));
|
||||
this.getMap().put("dc:identifier", new String[]{url.toNormalform(true)});
|
||||
this.getMap().put("dc:date", new String[]{ISO8601Formatter.FORMATTER.format(date)});
|
||||
this.getMap().put("dc:title", new String[]{title});
|
||||
this.getMap().put("dc:creator", new String[]{author});
|
||||
this.getMap().put("dc:description", new String[]{body});
|
||||
this.getMap().put("geo:lat", new String[]{Double.toString(lat)});
|
||||
this.getMap().put("geo:long", new String[]{Double.toString(lon)});
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -222,14 +224,12 @@ public class DCEntry extends TreeMap<String, String> {
|
|||
return t;
|
||||
}
|
||||
|
||||
public String getDescription() {
|
||||
String t = this.get("body");
|
||||
if (t == null) t = this.get("dc:description");
|
||||
if (t == null) t = this.get("dc:subject");
|
||||
if (t == null) t = this.get("categories");
|
||||
t = stripCDATA(t);
|
||||
if (t == null) return "";
|
||||
return t;
|
||||
public List<String> getDescriptions() {
|
||||
String[] t = this.getParams("dc:description");
|
||||
List<String> descriptions = new ArrayList<String>();
|
||||
if (t == null) return descriptions;
|
||||
for (String s: t) descriptions.add(stripCDATA(s));
|
||||
return descriptions;
|
||||
}
|
||||
|
||||
public String[] getSubject() {
|
||||
|
@ -280,9 +280,9 @@ public class DCEntry extends TreeMap<String, String> {
|
|||
getCreator(),
|
||||
getPublisher(),
|
||||
null,
|
||||
"",
|
||||
getDescriptions(),
|
||||
getLon(), getLat(),
|
||||
getDescription(),
|
||||
"",
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
|
|
|
@ -169,7 +169,7 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
|
|||
//System.out.println("BUFFER-SIZE=" + buffer.length());
|
||||
final String value = buffer.toString().trim();
|
||||
if (this.elementName != null) {
|
||||
this.surrogate.put(this.elementName, value);
|
||||
this.surrogate.getMap().put(this.elementName, new String[]{value});
|
||||
}
|
||||
this.buffer.setLength(0);
|
||||
this.parsingValue = false;
|
||||
|
@ -179,9 +179,9 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
|
|||
value.replaceAll(";", ",");
|
||||
String oldcontent = this.surrogate.get(this.elementName);
|
||||
if (oldcontent == null) {
|
||||
this.surrogate.put(this.elementName, value);
|
||||
this.surrogate.getMap().put(this.elementName, new String[]{value});
|
||||
} else {
|
||||
this.surrogate.put(this.elementName, oldcontent + ";" + value);
|
||||
this.surrogate.getMap().put(this.elementName, new String[]{oldcontent + ";" + value});
|
||||
}
|
||||
}
|
||||
this.buffer.setLength(0);
|
||||
|
@ -222,7 +222,7 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
|
|||
System.out.println("Publisher: " + s.getPublisher());
|
||||
System.out.println("URL: " + s.getIdentifier(true));
|
||||
System.out.println("Language: " + s.getLanguage());
|
||||
System.out.println("Body: " + s.getDescription());
|
||||
System.out.println("Body: " + s.getDescriptions().toString());
|
||||
}
|
||||
} catch (final IOException e) {
|
||||
ConcurrentLog.logException(e);
|
||||
|
|
|
@ -124,44 +124,36 @@ public class audioTagParser extends AbstractParser implements Parser {
|
|||
titles.add(filename);
|
||||
|
||||
// text
|
||||
final List<String> descriptions = new ArrayList<String>(7);
|
||||
final StringBuilder text = new StringBuilder(500);
|
||||
final char space = ' ';
|
||||
text.append(tag.getFirst(FieldKey.ARTIST));
|
||||
text.append(space);
|
||||
text.append(tag.getFirst(FieldKey.ALBUM));
|
||||
text.append(space);
|
||||
text.append(tag.getFirst(FieldKey.TITLE));
|
||||
text.append(space);
|
||||
text.append(tag.getFirst(FieldKey.COMMENT));
|
||||
text.append(space);
|
||||
text.append(tag.getFirst(FieldKey.LYRICS));
|
||||
text.append(space);
|
||||
text.append(tag.getFirst(FieldKey.TAGS));
|
||||
text.append(space);
|
||||
text.append(tag.getFirst(FieldKey.GENRE));
|
||||
text.append(space);
|
||||
String field = tag.getFirst(FieldKey.ARTIST);
|
||||
descriptions.add(FieldKey.ARTIST.name() + ": " + field);
|
||||
text.append(field); text.append(space);
|
||||
field = tag.getFirst(FieldKey.ALBUM);
|
||||
descriptions.add(FieldKey.ALBUM.name() + ": " + field);
|
||||
text.append(field); text.append(space);
|
||||
field = tag.getFirst(FieldKey.TITLE);
|
||||
descriptions.add(FieldKey.TITLE.name() + ": " + field);
|
||||
text.append(field); text.append(space);
|
||||
field = tag.getFirst(FieldKey.COMMENT);
|
||||
descriptions.add(FieldKey.COMMENT.name() + ": " + field);
|
||||
text.append(field); text.append(space);
|
||||
field = tag.getFirst(FieldKey.LYRICS);
|
||||
descriptions.add(FieldKey.LYRICS.name() + ": " + field);
|
||||
text.append(field); text.append(space);
|
||||
field = tag.getFirst(FieldKey.TAGS);
|
||||
descriptions.add(FieldKey.TAGS.name() + ": " + field);
|
||||
text.append(field); text.append(space);
|
||||
field = tag.getFirst(FieldKey.GENRE);
|
||||
descriptions.add(FieldKey.GENRE.name() + ": " + field);
|
||||
text.append(field); text.append(space);
|
||||
text.append(location.toTokens());
|
||||
|
||||
// dc:subject
|
||||
final String[] subject = new String[1];
|
||||
subject[0] = tag.getFirst(FieldKey.GENRE);
|
||||
|
||||
// description
|
||||
final StringBuilder desc = new StringBuilder(500);
|
||||
final String sep = " - ";
|
||||
int count = desc.length();
|
||||
desc.append(tag.getFirst(FieldKey.ARTIST));
|
||||
if(desc.length() > count) {
|
||||
desc.append(sep);
|
||||
count = desc.length();
|
||||
}
|
||||
desc.append(tag.getFirst(FieldKey.ALBUM));
|
||||
if(desc.length() > count) {
|
||||
desc.append(sep);
|
||||
count = desc.length();
|
||||
}
|
||||
desc.append(tag.getFirst(FieldKey.TITLE));
|
||||
|
||||
docs = new Document[]{new Document(
|
||||
location,
|
||||
mime,
|
||||
|
@ -173,7 +165,7 @@ public class audioTagParser extends AbstractParser implements Parser {
|
|||
tag.getFirst(FieldKey.ARTIST), // author
|
||||
location.getHost(), // publisher
|
||||
null, // sections
|
||||
desc.toString(), // abstrct
|
||||
descriptions, // abstrct
|
||||
0.0f, 0.0f, // lon, lat
|
||||
text.toString(), // text
|
||||
null,
|
||||
|
|
|
@ -590,12 +590,6 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
}
|
||||
}
|
||||
|
||||
if (this.titles.size() == 0) {
|
||||
// take description tag
|
||||
s = getDescription();
|
||||
if (!s.isEmpty()) this.titles.add(s);
|
||||
}
|
||||
|
||||
// extract headline from file name
|
||||
ArrayList<String> t = new ArrayList<String>();
|
||||
t.addAll(this.titles);
|
||||
|
@ -768,11 +762,13 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
return false;
|
||||
}
|
||||
|
||||
public String getDescription() {
|
||||
public List<String> getDescriptions() {
|
||||
String s = this.metas.get("description");
|
||||
if (s == null) s = this.metas.get("dc.description");
|
||||
if (s == null) return EMPTY_STRING;
|
||||
return s;
|
||||
List<String> descriptions = new ArrayList<String>();
|
||||
if (s == null) return descriptions;
|
||||
descriptions.add(s);
|
||||
return descriptions;
|
||||
}
|
||||
|
||||
public String getContentType() {
|
||||
|
|
|
@ -135,7 +135,7 @@ public class htmlParser extends AbstractParser implements Parser {
|
|||
scraper.getAuthor(),
|
||||
scraper.getPublisher(),
|
||||
sections,
|
||||
scraper.getDescription(),
|
||||
scraper.getDescriptions(),
|
||||
scraper.getLon(), scraper.getLat(),
|
||||
scraper.getText(),
|
||||
scraper.getAnchors(),
|
||||
|
|
|
@ -34,9 +34,11 @@ import java.io.FileNotFoundException;
|
|||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.net.MalformedURLException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Properties;
|
||||
import java.util.Set;
|
||||
|
||||
|
@ -100,7 +102,7 @@ public class genericImageParser extends AbstractParser implements Parser {
|
|||
String title = null;
|
||||
String author = null;
|
||||
String keywords = null;
|
||||
String description = null;
|
||||
List<String> descriptions = new ArrayList<String>();
|
||||
String filename = location.getFileName();
|
||||
String ext = MultiProtocolURI.getFileExtension(filename);
|
||||
double gpslat = 0;
|
||||
|
@ -179,10 +181,11 @@ public class genericImageParser extends AbstractParser implements Parser {
|
|||
if (keywords == null || keywords.isEmpty()) keywords = props.get("Category");
|
||||
if (keywords == null || keywords.isEmpty()) keywords = props.get("Supplemental Category(s)");
|
||||
|
||||
description = props.get("Caption/Abstract");
|
||||
if (description == null || description.isEmpty()) description = props.get("Country/Primary Location");
|
||||
if (description == null || description.isEmpty()) description = props.get("Province/State");
|
||||
if (description == null || description.isEmpty()) description = props.get("Copyright Notice");
|
||||
String description;
|
||||
description = props.get("Caption/Abstract"); if (description != null && description.length() > 0) descriptions.add("Abstract: " + description);
|
||||
description = props.get("Country/Primary Location"); if (description != null && description.length() > 0) descriptions.add("Location: " + description);
|
||||
description = props.get("Province/State"); if (description != null && description.length() > 0) descriptions.add("State: " + description);
|
||||
description = props.get("Copyright Notice"); if (description != null && description.length() > 0) descriptions.add("Copyright: " + description);
|
||||
|
||||
} catch (final JpegProcessingException e) {
|
||||
//Log.logException(e);
|
||||
|
@ -212,7 +215,7 @@ public class genericImageParser extends AbstractParser implements Parser {
|
|||
author == null ? "" : author, // author
|
||||
location.getHost(), // Publisher
|
||||
new String[]{}, // sections
|
||||
description == null ? "" : description, // description
|
||||
descriptions, // description
|
||||
gpslon, gpslat, // location
|
||||
infoString, // content text
|
||||
anchors, // anchors
|
||||
|
|
|
@ -29,8 +29,10 @@ package net.yacy.document.parser;
|
|||
|
||||
import java.io.File;
|
||||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Enumeration;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.zip.ZipEntry;
|
||||
import java.util.zip.ZipFile;
|
||||
|
@ -176,6 +178,8 @@ public class odtParser extends AbstractParser implements Parser {
|
|||
// create the parser document
|
||||
Document[] docs = null;
|
||||
final byte[] contentBytes = (writer == null) ? null : UTF8.getBytes(writer.toString());
|
||||
List<String> descriptions = new ArrayList<String>();
|
||||
if (docDescription != null && docDescription.length() > 0) descriptions.add(docDescription);
|
||||
docs = new Document[]{new Document(
|
||||
location,
|
||||
mimeType,
|
||||
|
@ -187,7 +191,7 @@ public class odtParser extends AbstractParser implements Parser {
|
|||
docAuthor,
|
||||
"",
|
||||
null,
|
||||
docDescription,
|
||||
descriptions,
|
||||
0.0f, 0.0f,
|
||||
contentBytes,
|
||||
null,
|
||||
|
|
|
@ -29,8 +29,10 @@ package net.yacy.document.parser;
|
|||
|
||||
import java.io.File;
|
||||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Enumeration;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.zip.ZipEntry;
|
||||
import java.util.zip.ZipFile;
|
||||
|
@ -161,6 +163,8 @@ public class ooxmlParser extends AbstractParser implements Parser {
|
|||
// create the parser document
|
||||
Document[] docs = null;
|
||||
final byte[] contentBytes = (writer == null) ? null : UTF8.getBytes(writer.toString());
|
||||
List<String> descriptions = new ArrayList<String>();
|
||||
if (docDescription != null && docDescription.length() > 0) descriptions.add(docDescription);
|
||||
docs = new Document[]{new Document(
|
||||
location,
|
||||
mimeType,
|
||||
|
@ -172,7 +176,7 @@ public class ooxmlParser extends AbstractParser implements Parser {
|
|||
docAuthor,
|
||||
"",
|
||||
null,
|
||||
docDescription,
|
||||
descriptions,
|
||||
0.0f, 0.0f,
|
||||
contentBytes,
|
||||
null,
|
||||
|
|
|
@ -59,7 +59,7 @@ public class rdfParser extends AbstractParser implements Parser {
|
|||
|
||||
String all = "rdfdatasource";
|
||||
doc = new Document(url, mimeType, charset, null, null, null, singleList(""), "",
|
||||
"", null, "", 0, 0, all, null, null, null, false);
|
||||
"", null, new ArrayList<String>(0), 0, 0, all, null, null, null, false);
|
||||
|
||||
docs.add(doc);
|
||||
|
||||
|
|
|
@ -12,6 +12,7 @@ import java.io.InputStreamReader;
|
|||
import java.io.Reader;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
|
@ -78,7 +79,7 @@ public class RDFaParser extends AbstractParser implements Parser {
|
|||
}
|
||||
|
||||
Document doc = new Document(url, mimeType, charset, null, null, null, singleList(""), "",
|
||||
"", null, "", 0, 0, null, null, null, null, false);
|
||||
"", null, new ArrayList<String>(0), 0, 0, null, null, null, null, false);
|
||||
|
||||
try {
|
||||
if (allTriples.length > 0)
|
||||
|
@ -137,7 +138,7 @@ public class RDFaParser extends AbstractParser implements Parser {
|
|||
}
|
||||
|
||||
Document doc = new Document(url, mimeType, charset, null, null, null, singleList(""), "",
|
||||
"", null, "", 0, 0, all, null, null, null, false);
|
||||
"", null, new ArrayList<String>(0), 0, 0, all, null, null, null, false);
|
||||
return doc;
|
||||
}
|
||||
|
||||
|
|
|
@ -95,7 +95,7 @@ public class rssParser extends AbstractParser implements Parser {
|
|||
item.getAuthor(),
|
||||
item.getCopyright(),
|
||||
new String[0],
|
||||
item.getDescription(),
|
||||
item.getDescriptions(),
|
||||
item.getLon(),
|
||||
item.getLat(),
|
||||
null,
|
||||
|
|
|
@ -90,7 +90,7 @@ public class sitemapParser extends AbstractParser implements Parser {
|
|||
"",
|
||||
"",
|
||||
new String[0],
|
||||
"",
|
||||
new ArrayList<String>(),
|
||||
0.0f, 0.0f,
|
||||
null,
|
||||
null,
|
||||
|
|
|
@ -29,7 +29,9 @@ package net.yacy.document.parser;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Properties;
|
||||
|
||||
|
@ -76,7 +78,7 @@ public class swfParser extends AbstractParser implements Parser {
|
|||
String urlnr = null;
|
||||
final String linebreak = System.getProperty("line.separator");
|
||||
final String[] sections = null;
|
||||
final String abstrct = null;
|
||||
final List<String> abstrct = new ArrayList<String>();
|
||||
//TreeSet images = null;
|
||||
final Map<DigestURI, Properties> anchors = new HashMap<DigestURI, Properties>();
|
||||
int urls = 0;
|
||||
|
|
|
@ -32,9 +32,11 @@ import java.io.IOException;
|
|||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.net.MalformedURLException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Properties;
|
||||
|
||||
import net.yacy.cora.document.UTF8;
|
||||
|
@ -207,6 +209,7 @@ public class vcfParser extends AbstractParser implements Parser {
|
|||
|
||||
final String[] sections = parsedNames.toArray(new String[parsedNames.size()]);
|
||||
final byte[] text = UTF8.getBytes(parsedDataText.toString());
|
||||
final List<String> descriptions = new ArrayList<String>(1); descriptions.add("vCard");
|
||||
return new Document[]{new Document(
|
||||
url, // url of the source document
|
||||
mimeType, // the documents mime type
|
||||
|
@ -218,7 +221,7 @@ public class vcfParser extends AbstractParser implements Parser {
|
|||
"", // TODO: AUTHOR
|
||||
"", // the publisher
|
||||
sections, // an array of section headlines
|
||||
"vCard", // an abstract
|
||||
descriptions, // an abstract
|
||||
0.0f, 0.0f,
|
||||
text, // the parsed document text
|
||||
anchors, // a map of extracted anchors
|
||||
|
|
|
@ -28,6 +28,8 @@
|
|||
package net.yacy.document.parser;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import net.yacy.cora.util.ConcurrentLog;
|
||||
import net.yacy.document.AbstractParser;
|
||||
|
@ -90,16 +92,10 @@ public class vsdParser extends AbstractParser implements Parser {
|
|||
title = summary.getTitle();
|
||||
}
|
||||
|
||||
String abstrct = null;
|
||||
abstrct = ((contents.length() > 80)? contents.substring(0, 80) : contents.trim()).
|
||||
replaceAll("\r\n"," ").
|
||||
replaceAll("\n"," ").
|
||||
replaceAll("\r"," ").
|
||||
replaceAll("\t"," ");
|
||||
List<String> abstrct = new ArrayList<String>();
|
||||
if (contents.length() > 0) abstrct.add(((contents.length() > 80) ? contents.substring(0, 80) : contents.trim()).replaceAll("\r\n"," ").replaceAll("\n"," ").replaceAll("\r"," ").replaceAll("\t"," "));
|
||||
|
||||
if (title == null) {
|
||||
title = abstrct;
|
||||
}
|
||||
if (title == null) title = location.toNormalform(true);
|
||||
|
||||
// As the result of parsing this function must return a plasmaParserDocument object
|
||||
return new Document[]{new Document(
|
||||
|
|
|
@ -312,8 +312,8 @@ public class URIMetadataNode {
|
|||
return getString(CollectionSchema.text_t);
|
||||
}
|
||||
|
||||
public String getDescription() {
|
||||
return getString(CollectionSchema.description);
|
||||
public ArrayList<String> getDescription() {
|
||||
return getStringList(CollectionSchema.description_txt);
|
||||
}
|
||||
|
||||
public boolean isOlder(URIMetadataRow other) {
|
||||
|
|
|
@ -2916,7 +2916,7 @@ public final class Switchboard extends serverSwitch {
|
|||
}
|
||||
|
||||
final String title = scraper == null ? url.toNormalform(true) : scraper.dc_title();
|
||||
final String description = scraper.dc_description();
|
||||
final String description = scraper.dc_description().length > 0 ? scraper.dc_description()[0] : "";
|
||||
|
||||
// add the url to the crawl stack
|
||||
this.crawler.removePassive(handle); // if there is an old entry, delete it
|
||||
|
|
|
@ -806,10 +806,11 @@ public final class Fulltext {
|
|||
} else {
|
||||
BlockingQueue<SolrDocument> docs = Fulltext.this.getDefaultConnector().concurrentDocumentsByQuery(CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", 0, 100000000, 10 * 60 * 60 * 1000, 100,
|
||||
CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName(), CollectionSchema.title.getSolrFieldName(),
|
||||
CollectionSchema.author.getSolrFieldName(), CollectionSchema.description.getSolrFieldName(), CollectionSchema.size_i.getSolrFieldName(), CollectionSchema.last_modified.getSolrFieldName());
|
||||
CollectionSchema.author.getSolrFieldName(), CollectionSchema.description_txt.getSolrFieldName(), CollectionSchema.size_i.getSolrFieldName(), CollectionSchema.last_modified.getSolrFieldName());
|
||||
SolrDocument doc;
|
||||
ArrayList<?> title;
|
||||
String url, author, description, hash;
|
||||
String url, author, hash;
|
||||
String[] descriptions;
|
||||
Integer size;
|
||||
Date date;
|
||||
while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
|
||||
|
@ -817,7 +818,7 @@ public final class Fulltext {
|
|||
url = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName());
|
||||
title = (ArrayList<?>) doc.getFieldValue(CollectionSchema.title.getSolrFieldName());
|
||||
author = (String) doc.getFieldValue(CollectionSchema.author.getSolrFieldName());
|
||||
description = (String) doc.getFieldValue(CollectionSchema.description.getSolrFieldName());
|
||||
descriptions = (String[]) doc.getFieldValue(CollectionSchema.description_txt.getSolrFieldName());
|
||||
size = (Integer) doc.getFieldValue(CollectionSchema.size_i.getSolrFieldName());
|
||||
date = (Date) doc.getFieldValue(CollectionSchema.last_modified.getSolrFieldName());
|
||||
if (this.pattern != null && !this.pattern.matcher(url).matches()) continue;
|
||||
|
@ -832,7 +833,9 @@ public final class Fulltext {
|
|||
if (title != null) pw.println("<title>" + CharacterCoding.unicode2xml((String) title.iterator().next(), true) + "</title>");
|
||||
pw.println("<link>" + MultiProtocolURI.escape(url) + "</link>");
|
||||
if (author != null && !author.isEmpty()) pw.println("<author>" + CharacterCoding.unicode2xml(author, true) + "</author>");
|
||||
if (description != null && !description.isEmpty()) pw.println("<description>" + CharacterCoding.unicode2xml(description, true) + "</description>");
|
||||
if (descriptions != null && descriptions.length > 0) {
|
||||
for (String d: descriptions) pw.println("<description>" + CharacterCoding.unicode2xml(d, true) + "</description>");
|
||||
}
|
||||
if (date != null) pw.println("<pubDate>" + HeaderFramework.formatRFC1123(date) + "</pubDate>");
|
||||
if (size != null) pw.println("<yacy:size>" + size.intValue() + "</yacy:size>");
|
||||
pw.println("<guid isPermaLink=\"false\">" + hash + "</guid>");
|
||||
|
|
|
@ -625,7 +625,7 @@ public class Segment {
|
|||
if (this.fulltext.getDefaultConfiguration().contains(CollectionSchema.host_id_s)) {
|
||||
uniquecheck: for (CollectionSchema[] checkfields: new CollectionSchema[][]{
|
||||
{CollectionSchema.title, CollectionSchema.title_exact_signature_l, CollectionSchema.title_unique_b},
|
||||
{CollectionSchema.description, CollectionSchema.description_exact_signature_l, CollectionSchema.description_unique_b}}) {
|
||||
{CollectionSchema.description_txt, CollectionSchema.description_exact_signature_l, CollectionSchema.description_unique_b}}) {
|
||||
CollectionSchema checkfield = checkfields[0];
|
||||
CollectionSchema signaturefield = checkfields[1];
|
||||
CollectionSchema uniquefield = checkfields[2];
|
||||
|
|
|
@ -245,16 +245,16 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
|||
add(doc, CollectionSchema.title_words_val, cv);
|
||||
}
|
||||
|
||||
String description = md.snippet(); if (description == null) description = "";
|
||||
if (allAttr || contains(CollectionSchema.description)) add(doc, CollectionSchema.description, description);
|
||||
if (allAttr || contains(CollectionSchema.description_count_i)) add(doc, CollectionSchema.description_count_i, 1);
|
||||
String description = md.snippet();
|
||||
boolean description_exist = description != null;
|
||||
if (description == null) description = "";
|
||||
if (allAttr || contains(CollectionSchema.description_txt)) add(doc, CollectionSchema.description_txt, description_exist ? new String[]{description} : new String[0]);
|
||||
if (allAttr || contains(CollectionSchema.description_count_i)) add(doc, CollectionSchema.description_count_i, description_exist ? 1 : 0);
|
||||
if (allAttr || contains(CollectionSchema.description_chars_val)) {
|
||||
Integer[] cv = new Integer[]{new Integer(description.length())};
|
||||
add(doc, CollectionSchema.description_chars_val, cv);
|
||||
add(doc, CollectionSchema.description_chars_val, description_exist ? new Integer[]{new Integer(description.length())} : new Integer[0]);
|
||||
}
|
||||
if (allAttr || contains(CollectionSchema.description_words_val)) {
|
||||
Integer[] cv = new Integer[]{new Integer(CommonPattern.SPACE.split(description).length)};
|
||||
add(doc, CollectionSchema.description_words_val, cv);
|
||||
add(doc, CollectionSchema.description_words_val, description_exist ? new Integer[]{new Integer(description.length() == 0 ? 0 : CommonPattern.SPACE.split(description).length)} : new Integer[0]);
|
||||
}
|
||||
|
||||
String filename = digestURI.getFileName();
|
||||
|
@ -424,23 +424,21 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
|
|||
add(doc, CollectionSchema.title_words_val, cv);
|
||||
}
|
||||
|
||||
String description = document.dc_description();
|
||||
List<String> descriptions = new ArrayList<String>();
|
||||
for (String s: CommonPattern.NEWLINE.split(description)) descriptions.add(s);
|
||||
if (allAttr || contains(CollectionSchema.description)) {
|
||||
add(doc, CollectionSchema.description, description);
|
||||
if ((allAttr || contains(CollectionSchema.description_exact_signature_l)) && description != null && description.length() > 0) {
|
||||
add(doc, CollectionSchema.description_exact_signature_l, EnhancedTextProfileSignature.getSignatureLong(description));
|
||||
String[] descriptions = document.dc_description();
|
||||
if (allAttr || contains(CollectionSchema.description_txt)) {
|
||||
add(doc, CollectionSchema.description_txt, descriptions);
|
||||
if ((allAttr || contains(CollectionSchema.description_exact_signature_l)) && descriptions != null && descriptions.length > 0) {
|
||||
add(doc, CollectionSchema.description_exact_signature_l, EnhancedTextProfileSignature.getSignatureLong(descriptions));
|
||||
}
|
||||
}
|
||||
if (allAttr || contains(CollectionSchema.description_count_i)) add(doc, CollectionSchema.description_count_i, descriptions.size());
|
||||
if (allAttr || contains(CollectionSchema.description_count_i)) add(doc, CollectionSchema.description_count_i, descriptions.length);
|
||||
if (allAttr || contains(CollectionSchema.description_chars_val)) {
|
||||
ArrayList<Integer> cv = new ArrayList<Integer>(descriptions.size());
|
||||
ArrayList<Integer> cv = new ArrayList<Integer>(descriptions.length);
|
||||
for (String s: descriptions) cv.add(new Integer(s.length()));
|
||||
add(doc, CollectionSchema.description_chars_val, cv);
|
||||
}
|
||||
if (allAttr || contains(CollectionSchema.description_words_val)) {
|
||||
ArrayList<Integer> cv = new ArrayList<Integer>(descriptions.size());
|
||||
ArrayList<Integer> cv = new ArrayList<Integer>(descriptions.length);
|
||||
for (String s: descriptions) cv.add(new Integer(CommonPattern.SPACE.split(s).length));
|
||||
add(doc, CollectionSchema.description_words_val, cv);
|
||||
}
|
||||
|
|
|
@ -75,7 +75,7 @@ public enum CollectionSchema implements SchemaDeclaration {
|
|||
ip_s(SolrType.string, true, true, false, false, false, "ip of host of url (after DNS lookup)"),
|
||||
author(SolrType.text_general, true, true, false, false, true, "content of author-tag"),
|
||||
author_sxt(SolrType.string, true, true, true, false, false, "content of author-tag as copy-field from author. This is used for facet generation"),
|
||||
description(SolrType.text_general, true, true, false, false, true, "content of description-tag"),
|
||||
description_txt(SolrType.text_general, true, true, true, false, true, "content of description-tag(s)"),
|
||||
description_exact_signature_l(SolrType.num_long, true, true, false, false, false, "the 64 bit hash of the org.apache.solr.update.processor.Lookup3Signature of description, used to compute description_unique_b"),
|
||||
description_unique_b(SolrType.bool, true, true, false, false, false, "flag shows if description is unique in the whole index; if yes and another document appears with same description, the unique-flag is set to false"),
|
||||
keywords(SolrType.text_general, true, true, false, false, true, "content of keywords tag; words are separated by space"),
|
||||
|
|
|
@ -489,7 +489,8 @@ public final class HTTPDFileHandler {
|
|||
File f;
|
||||
String size;
|
||||
long sz;
|
||||
String headline, author, description, publisher;
|
||||
String headline, author, publisher;
|
||||
List<String> descriptions;
|
||||
int images, links;
|
||||
ContentScraper scraper;
|
||||
for (final String element : list) {
|
||||
|
@ -503,14 +504,14 @@ public final class HTTPDFileHandler {
|
|||
headline = t.size() > 0 ? t.iterator().next() : "";
|
||||
author = scraper.getAuthor();
|
||||
publisher = scraper.getPublisher();
|
||||
description = scraper.getDescription();
|
||||
descriptions = scraper.getDescriptions();
|
||||
images = scraper.getImages().size();
|
||||
links = scraper.getAnchors().size();
|
||||
} else {
|
||||
headline = null;
|
||||
author = null;
|
||||
publisher = null;
|
||||
description = null;
|
||||
descriptions = null;
|
||||
images = 0;
|
||||
links = 0;
|
||||
}
|
||||
|
@ -527,7 +528,11 @@ public final class HTTPDFileHandler {
|
|||
aBuffer.append("<a href=\"" + path + element + "\">" + element + "</a><br/>");
|
||||
if (author != null && author.length() > 0) aBuffer.append("Author: " + author + "<br/>");
|
||||
if (publisher != null && publisher.length() > 0) aBuffer.append("Publisher: " + publisher + "<br/>");
|
||||
if (description != null && description.length() > 0) aBuffer.append("Description: " + description + "<br/>");
|
||||
if (descriptions != null && descriptions.size() > 0) {
|
||||
for (String d: descriptions) {
|
||||
aBuffer.append("Description: " + d + "<br/>");
|
||||
}
|
||||
}
|
||||
aBuffer.append(GenericFormatter.SHORT_DAY_FORMATTER.format(new Date(f.lastModified())) + ", " + size + ((images > 0) ? ", " + images + " images" : "") + ((links > 0) ? ", " + links + " links" : "") + "<br/></li>\n");
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue
Block a user