replaced the single-text description solr field with a multi-value description_txt text field
This commit is contained in:
Michael Peter Christen 2013-07-30 12:48:57 +02:00
parent 4c242f9af9
commit cf12835f20
42 changed files with 207 additions and 178 deletions

View File

@ -123,8 +123,8 @@ coordinate_p
## content of author-tag, texgen
author
## content of description-tag, text
description
## content of description-tag(s), text
description_txt
## the 64 bit hash of the org.apache.solr.update.processor.Lookup3Signature of description, used to compute description_unique_b
#description_exact_signature_l

View File

@ -324,7 +324,7 @@ public class Load_RSS_p {
if (author == null || author.isEmpty()) author = channel == null ? "" : channel.getCopyright();
Date pubDate = channel == null ? null : channel.getPubDate();
prop.putHTML("showitems_author", author == null ? "" : author);
prop.putHTML("showitems_description", channel == null ? "" : channel.getDescription());
prop.putHTML("showitems_description", channel == null ? "" : channel.getDescriptions().toString());
prop.putHTML("showitems_language", channel == null ? "" : channel.getLanguage());
prop.putHTML("showitems_date", (pubDate == null) ? "" : DateFormat.getDateTimeInstance().format(pubDate));
prop.putHTML("showitems_ttl", channel == null ? "" : channel.getTTL());
@ -355,7 +355,7 @@ public class Load_RSS_p {
prop.putHTML("showitems_item_" + i + "_author", author == null ? "" : author);
prop.putHTML("showitems_item_" + i + "_title", item.getTitle());
prop.putHTML("showitems_item_" + i + "_link", messageurl.toNormalform(true));
prop.putHTML("showitems_item_" + i + "_description", item.getDescription());
prop.putHTML("showitems_item_" + i + "_description", item.getDescriptions().toString());
prop.putHTML("showitems_item_" + i + "_language", item.getLanguage());
prop.putHTML("showitems_item_" + i + "_date", (pubDate == null) ? "" : DateFormat.getDateTimeInstance().format(pubDate));
i++;

View File

@ -60,9 +60,10 @@ public class feed {
if (feed == null || feed.isEmpty()) continue channelIteration;
RSSMessage message = feed.getChannel();
String description = message.getDescriptions().size() > 0 ? message.getDescriptions().get(0) : "";
if (message != null) {
prop.putXML("channel_title", message.getTitle());
prop.putXML("channel_description", message.getDescription());
prop.putXML("channel_description", description);
prop.put("channel_pubDate", message.getPubDate());
}
while (messageMaxCount > 0 && !feed.isEmpty()) {
@ -71,7 +72,7 @@ public class feed {
// create RSS entry
prop.putXML("item_" + messageCount + "_title", channelName + ": " + message.getTitle());
prop.putXML("item_" + messageCount + "_description", message.getDescription());
prop.putXML("item_" + messageCount + "_description", description);
prop.putXML("item_" + messageCount + "_link", message.getLink());
prop.put("item_" + messageCount + "_pubDate", message.getPubDate());
prop.putXML("item_" + messageCount + "_guid", message.getGuid());

View File

@ -121,7 +121,7 @@ public class getpageinfo {
}
prop.put("tags", count);
// put description
prop.putXML("desc", removelinebreaks(scraper.dc_description()));
prop.putXML("desc", removelinebreaks(scraper.dc_description().length > 0 ? scraper.dc_description()[0] : ""));
// put language
final Set<String> languages = scraper.getContentLanguages();
prop.putXML("lang", (languages == null || languages.size() == 0) ? "unknown" : languages.iterator().next());

View File

@ -121,7 +121,7 @@ public class getpageinfo_p {
}
prop.put("tags", count);
// put description
prop.putXML("desc", scraper.dc_description());
prop.putXML("desc", scraper.dc_description().length > 0 ? scraper.dc_description()[0] : "");
// put language
final Set<String> languages = scraper.getContentLanguages();
prop.putXML("lang", (languages == null || languages.size() == 0) ? "unknown" : languages.iterator().next());

View File

@ -155,14 +155,14 @@ public class searchresult {
CollectionSchema.id.getSolrFieldName() + ',' +
CollectionSchema.sku.getSolrFieldName() + ',' +
CollectionSchema.title.getSolrFieldName() + ',' +
CollectionSchema.description.getSolrFieldName() + ',' +
CollectionSchema.description_txt.getSolrFieldName() + ',' +
CollectionSchema.load_date_dt.getSolrFieldName() + ',' +
CollectionSchema.last_modified.getSolrFieldName() + ',' +
CollectionSchema.size_i.getSolrFieldName());
post.put("hl", "true");
post.put("hl.q", originalQuery);
post.put("hl.fl", CollectionSchema.h1_txt.getSolrFieldName() + "," + CollectionSchema.h2_txt.getSolrFieldName() + "," + CollectionSchema.text_t.getSolrFieldName());
post.put("hl.alternateField", CollectionSchema.description.getSolrFieldName());
post.put("hl.alternateField", CollectionSchema.description_txt.getSolrFieldName());
post.put("hl.simple.pre", "<b>");
post.put("hl.simple.post", "</b>");
post.put("hl.fragsize", Integer.toString(SearchEvent.SNIPPET_MAX_LENGTH));

View File

@ -202,7 +202,7 @@ public class select {
// add options for snippet generation
if (!post.containsKey("hl.q")) post.put("hl.q", q);
if (!post.containsKey("hl.fl")) post.put("hl.fl", CollectionSchema.h1_txt.getSolrFieldName() + "," + CollectionSchema.h2_txt.getSolrFieldName() + "," + CollectionSchema.text_t.getSolrFieldName());
if (!post.containsKey("hl.alternateField")) post.put("hl.alternateField", CollectionSchema.description.getSolrFieldName());
if (!post.containsKey("hl.alternateField")) post.put("hl.alternateField", CollectionSchema.description_txt.getSolrFieldName());
if (!post.containsKey("hl.simple.pre")) post.put("hl.simple.pre", "<b>");
if (!post.containsKey("hl.simple.post")) post.put("hl.simple.post", "</b>");
if (!post.containsKey("hl.fragsize")) post.put("hl.fragsize", Integer.toString(SearchEvent.SNIPPET_MAX_LENGTH));

View File

@ -104,7 +104,7 @@ public class yacysearch_location {
prop.put("kml_placemark_" + placemarkCounter + "_author", message.getAuthor());
prop.put("kml_placemark_" + placemarkCounter + "_copyright", message.getCopyright());
prop.put("kml_placemark_" + placemarkCounter + "_subject", message.getSubject());
prop.put("kml_placemark_" + placemarkCounter + "_description", message.getDescription());
prop.put("kml_placemark_" + placemarkCounter + "_description", message.getDescriptions().size() > 0 ? message.getDescriptions().get(0) : "");
prop.put("kml_placemark_" + placemarkCounter + "_date", message.getPubDate());
prop.putXML("kml_placemark_" + placemarkCounter + "_url", message.getLink());
prop.put("kml_placemark_" + placemarkCounter + "_pointname", message.getTitle());

View File

@ -27,6 +27,7 @@
package net.yacy.cora.document;
import java.util.Date;
import java.util.List;
public interface Hit {
@ -70,7 +71,7 @@ public interface Hit {
public String getLanguage();
public String getDescription();
public List<String> getDescriptions();
public Date getPubDate();

View File

@ -25,11 +25,13 @@
package net.yacy.cora.document;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
@ -158,8 +160,11 @@ public class RSSMessage implements Hit, Comparable<RSSMessage>, Comparator<RSSMe
}
@Override
public String getDescription() {
return Token.description.valueFrom(this.map, "");
public List<String> getDescriptions() {
List<String> ds = new ArrayList<String>();
String d = Token.description.valueFrom(this.map, "");
if (d.length() > 0) ds.add(d);
return ds;
}
@Override
@ -216,7 +221,7 @@ public class RSSMessage implements Hit, Comparable<RSSMessage>, Comparator<RSSMe
String guid = Token.guid.valueFrom(this.map, "");
if ((guid.isEmpty() || guid.startsWith(artificialGuidPrefix)) &&
(this.map.containsKey("title") || this.map.containsKey("description") || this.map.containsKey("link"))) {
guid = calculatedGuidPrefix + Integer.toHexString(getTitle().hashCode() + getDescription().hashCode() + getLink().hashCode());
guid = calculatedGuidPrefix + Integer.toHexString(getTitle().hashCode() + getDescriptions().hashCode() + getLink().hashCode());
this.map.put("guid", guid);
}
return guid;

View File

@ -169,13 +169,19 @@ public class EnhancedTextProfileSignature extends Lookup3Signature {
return t2.cnt - t1.cnt;
}
}
public static long getSignatureLong(String text) {
Lookup3Signature sig = new Lookup3Signature();
sig.add(text);
return getSignatureLong(sig);
}
public static long getSignatureLong(String[] texts) {
Lookup3Signature sig = new Lookup3Signature();
for (String s: texts) sig.add(s);
return getSignatureLong(sig);
}
public static long getSignatureLong(Lookup3Signature sig) {
byte[] hash = sig.getSignature();
long l = 0;

View File

@ -22,6 +22,7 @@ package net.yacy.cora.federate.solr.responsewriter;
import java.io.IOException;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
@ -84,7 +85,7 @@ public class GSAResponseWriter implements QueryResponseWriter {
// pre-select a set of YaCy schema fields for the solr searcher which should cause a better caching
private static final CollectionSchema[] extrafields = new CollectionSchema[]{
CollectionSchema.id, CollectionSchema.sku, CollectionSchema.title, CollectionSchema.description,
CollectionSchema.id, CollectionSchema.sku, CollectionSchema.title, CollectionSchema.description_txt,
CollectionSchema.last_modified, CollectionSchema.load_date_dt, CollectionSchema.size_i, CollectionSchema.language_s
};
@ -235,7 +236,7 @@ public class GSAResponseWriter implements QueryResponseWriter {
// write the R header for a search result
writer.write("<R N=\"" + (resHead.offset + i + 1) + "\"" + (i == 1 ? " L=\"2\"" : "") + (mime != null && mime.length() > 0 ? " MIME=\"" + mime + "\"" : "") + ">"); writer.write(lb);
//List<String> texts = new ArrayList<String>();
String description = "";
List<String> descriptions = new ArrayList<String>();
int size = 0;
boolean title_written = false; // the solr index may contain several; we take only the first which should be the visible tag in <title></title>
for (IndexableField value: fields) {
@ -264,9 +265,9 @@ public class GSAResponseWriter implements QueryResponseWriter {
title_written = true;
continue;
}
if (CollectionSchema.description.getSolrFieldName().equals(fieldName)) {
description = value.stringValue();
//texts.add(description);
if (CollectionSchema.description_txt.getSolrFieldName().equals(fieldName)) {
descriptions.add(value.stringValue());
//texts.adds(description);
continue;
}
if (CollectionSchema.last_modified.getSolrFieldName().equals(fieldName)) {
@ -290,8 +291,8 @@ public class GSAResponseWriter implements QueryResponseWriter {
}
// compute snippet from texts
List<String> snippet = urlhash == null ? null : snippets.get(urlhash);
OpensearchResponseWriter.solitaireTag(writer, GSAToken.S.name(), snippet == null || snippet.size() == 0 ? description : snippet.get(0));
OpensearchResponseWriter.solitaireTag(writer, GSAToken.GD.name(), description);
OpensearchResponseWriter.solitaireTag(writer, GSAToken.S.name(), snippet == null || snippet.size() == 0 ? (descriptions.size() > 0 ? descriptions.get(0) : snippet.get(0)) : snippet.get(0));
OpensearchResponseWriter.solitaireTag(writer, GSAToken.GD.name(), descriptions.size() > 0 ? descriptions.get(0) : "");
writer.write("<HAS><L/><C SZ=\""); writer.write(Integer.toString(size / 1024)); writer.write("k\" CID=\""); writer.write(urlhash); writer.write("\" ENC=\"UTF-8\"/></HAS>");
if (YaCyVer == null) YaCyVer = yacyVersion.thisVersion().getName() + "/" + Switchboard.getSwitchboard().peers.mySeed().hash;
OpensearchResponseWriter.solitaireTag(writer, GSAToken.ENT_SOURCE.name(), YaCyVer);

View File

@ -59,7 +59,7 @@ public class OpensearchResponseWriter implements QueryResponseWriter {
// pre-select a set of YaCy schema fields for the solr searcher which should cause a better caching
private static final CollectionSchema[] extrafields = new CollectionSchema[]{
CollectionSchema.id, CollectionSchema.title, CollectionSchema.description, CollectionSchema.text_t,
CollectionSchema.id, CollectionSchema.title, CollectionSchema.description_txt, CollectionSchema.text_t,
CollectionSchema.h1_txt, CollectionSchema.h2_txt, CollectionSchema.h3_txt, CollectionSchema.h4_txt, CollectionSchema.h5_txt, CollectionSchema.h6_txt,
};
static final Set<String> SOLR_FIELDS = new HashSet<String>();
@ -163,7 +163,8 @@ public class OpensearchResponseWriter implements QueryResponseWriter {
List<IndexableField> fields = doc.getFields();
int fieldc = fields.size();
List<String> texts = new ArrayList<String>();
String description = "", title = "";
List<String> descriptions = new ArrayList<String>();
String title = "";
for (int j = 0; j < fieldc; j++) {
IndexableField value = fields.get(j);
String fieldName = value.name();
@ -204,8 +205,9 @@ public class OpensearchResponseWriter implements QueryResponseWriter {
solitaireTag(writer, RSSMessage.Token.pubDate.name(), HeaderFramework.formatRFC1123(d));
continue;
}
if (CollectionSchema.description.getSolrFieldName().equals(fieldName)) {
description = value.stringValue();
if (CollectionSchema.description_txt.getSolrFieldName().equals(fieldName)) {
String description = value.stringValue();
descriptions.add(description);
solitaireTag(writer, DublinCore.Description.getURIref(), description);
texts.add(description);
continue;
@ -233,10 +235,17 @@ public class OpensearchResponseWriter implements QueryResponseWriter {
solitaireTag(writer, RSSMessage.Token.title.name(), title.length() == 0 ? (texts.size() == 0 ? "" : texts.get(0)) : title);
List<String> snippet = urlhash == null ? null : snippets.get(urlhash);
String tagname = RSSMessage.Token.description.name();
writer.write("<"); writer.write(tagname); writer.write('>');
XML.escapeCharData(snippet == null || snippet.size() == 0 ? description : snippet.get(0), writer);
writer.write("</"); writer.write(tagname); writer.write(">\n");
if (snippet == null || snippet.size() == 0) {
for (String d: descriptions) {
writer.write("<"); writer.write(tagname); writer.write('>');
XML.escapeCharData(snippet == null || snippet.size() == 0 ? d : snippet.get(0), writer);
writer.write("</"); writer.write(tagname); writer.write(">\n");
}
} else {
writer.write("<"); writer.write(tagname); writer.write('>');
XML.escapeCharData(snippet.get(0), writer);
writer.write("</"); writer.write(tagname); writer.write(">\n");
}
// open: where do we get the subject?
//solitaireTag(writer, DublinCore.Subject.getURIref(), ""); // TODO: fill with actual data

View File

@ -136,7 +136,8 @@ public class YJsonResponseWriter implements QueryResponseWriter {
List<String> texts = new ArrayList<String>();
MultiProtocolURI url = null;
String urlhash = null;
String description = "", title = "";
List<String> descriptions = new ArrayList<String>();
String title = "";
StringBuilder path = new StringBuilder(80);
for (int j = 0; j < fieldc; j++) {
IndexableField value = fields.get(j);
@ -166,8 +167,9 @@ public class YJsonResponseWriter implements QueryResponseWriter {
texts.add(title);
continue;
}
if (CollectionSchema.description.getSolrFieldName().equals(fieldName)) {
description = value.stringValue();
if (CollectionSchema.description_txt.getSolrFieldName().equals(fieldName)) {
String description = value.stringValue();
descriptions.add(description);
texts.add(description);
continue;
}
@ -212,7 +214,7 @@ public class YJsonResponseWriter implements QueryResponseWriter {
solitaireTag(writer, "path", path.toString());
solitaireTag(writer, "title", title.length() == 0 ? (texts.size() == 0 ? path.toString() : texts.get(0)) : title);
List<String> snippet = urlhash == null ? null : snippets.get(urlhash);
writer.write("\"description\":\""); writer.write(serverObjects.toJSON(snippet == null || snippet.size() == 0 ? description : snippet.get(0))); writer.write("\"\n}\n");
writer.write("\"description\":\""); writer.write(serverObjects.toJSON(snippet == null || snippet.size() == 0 ? (descriptions.size() > 0 ? descriptions.get(0) : "") : snippet.get(0))); writer.write("\"\n}\n");
if (i < responseCount - 1) {
writer.write(",\n".toCharArray());
}

View File

@ -527,7 +527,7 @@ public class CrawlQueues {
ASCII.getBytes(hash),
url,
(referrer == null) ? null : referrer.hash(),
item.getDescription(),
item.getDescriptions().size() > 0 ? item.getDescriptions().get(0) : "",
loaddate,
this.sb.crawler.defaultRemoteProfile.handle(),
0,

View File

@ -90,11 +90,11 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
final Map<String, Word> words = new Condenser(document, true, true, LibraryProvider.dymLib, LibraryProvider.synonyms, false).words();
// generate potential tags from document title, description and subject
final int bufferSize = document.dc_title().length() + document.dc_description().length() + document.dc_subject(' ').length() + 32;
final int bufferSize = document.dc_title().length() + document.dc_description().length + document.dc_subject(' ').length() + 32;
final StringBuilder buffer = new StringBuilder(bufferSize);
final StringBuilder pwords = new StringBuilder(1000);
buffer.append(document.dc_title().toLowerCase());
buffer.append(document.dc_description().toLowerCase());
for (String s:document.dc_description()) buffer.append(s.toLowerCase());
buffer.append(document.dc_subject(' ').toLowerCase());
final WordTokenizer tokens = new WordTokenizer(new SentenceReader(buffer.toString()), LibraryProvider.dymLib);
try {

View File

@ -166,7 +166,7 @@ public class YMarkEntry extends TreeMap<String, String> {
public YMarkEntry(final DCEntry dc) {
super();
for (BOOKMARK b : BOOKMARK.values()) {
if(dc.containsKey(b.dc_attrb)) {
if (dc.getMap().containsKey(b.dc_attrb)) {
this.put(b.key(), dc.get(b.dc_attrb));
}
}
@ -218,7 +218,7 @@ public class YMarkEntry extends TreeMap<String, String> {
final DCEntry dc = new DCEntry();
for (BOOKMARK b : BOOKMARK.values()) {
if(!b.dc_attrb.isEmpty() && this.containsKey(b.key())) {
dc.put(b.dc_attrb, this.get(b.key()));
dc.getMap().put(b.dc_attrb, new String[]{this.get(b.key())});
}
}
return dc;

View File

@ -132,7 +132,7 @@ public class YMarkMetadata {
metadata.put(METADATA.CREATOR, this.document.dc_creator());
metadata.put(METADATA.KEYWORDS, this.document.dc_subject(' '));
metadata.put(METADATA.PUBLISHER, this.document.dc_publisher());
metadata.put(METADATA.DESCRIPTION, this.document.dc_description());
metadata.put(METADATA.DESCRIPTION, this.document.dc_description().length > 0 ? this.document.dc_description()[0] : "");
metadata.put(METADATA.MIMETYPE, this.document.dc_format());
metadata.put(METADATA.LANGUAGE, this.document.dc_language());
metadata.put(METADATA.CHARSET, this.document.getCharset());

View File

@ -129,7 +129,9 @@ public final class Condenser {
// phrase 99 is taken from the media Link url and anchor description
// phrase 100 and above are lines from the text
insertTextToWords(new SentenceReader(document.dc_title()), 1, WordReferenceRow.flag_app_dc_title, this.RESULT_FLAGS, true, meaningLib);
insertTextToWords(new SentenceReader(document.dc_description()), 3, WordReferenceRow.flag_app_dc_description, this.RESULT_FLAGS, true, meaningLib);
for (String description: document.dc_description()) {
insertTextToWords(new SentenceReader(description), 3, WordReferenceRow.flag_app_dc_description, this.RESULT_FLAGS, true, meaningLib);
}
insertTextToWords(new SentenceReader(document.dc_creator()), 4, WordReferenceRow.flag_app_dc_creator, this.RESULT_FLAGS, true, meaningLib);
insertTextToWords(new SentenceReader(document.dc_publisher()), 5, WordReferenceRow.flag_app_dc_creator, this.RESULT_FLAGS, true, meaningLib);
insertTextToWords(new SentenceReader(document.dc_subject(' ')), 6, WordReferenceRow.flag_app_dc_description, this.RESULT_FLAGS, true, meaningLib);

View File

@ -75,8 +75,8 @@ public class Document {
private List<String> titles; // the document titles, taken from title and/or h1 tag; shall appear as headline of search result
private final StringBuilder creator; // author or copyright
private final String publisher; // publisher
private final List<String> sections; // if present: more titles/headlines appearing in the document
private final StringBuilder description; // an abstract, if present: short content description
private final List<String> sections; // if present: more titles/headlines appearing in the document
private final List<String> descriptions; // an abstract, if present: short content description
private Object text; // the clear text, all that is visible
private final Map<DigestURI, Properties> anchors; // all links embedded as clickeable entities (anchor tags)
private final Map<DigestURI, String> rss; // all embedded rss feeds
@ -101,7 +101,7 @@ public class Document {
final String[] keywords,
final List<String> titles,
final String author, final String publisher,
final String[] sections, final String abstrct,
final String[] sections, final List<String> abstrcts,
final double lon, final double lat,
final Object text,
final Map<DigestURI, Properties> anchors,
@ -118,7 +118,7 @@ public class Document {
this.creator = (author == null) ? new StringBuilder(0) : new StringBuilder(author);
this.sections = new LinkedList<String>() ;
if (sections != null) this.sections.addAll(Arrays.asList(sections));
this.description = (abstrct == null) ? new StringBuilder(0) : new StringBuilder(abstrct);
this.descriptions = (abstrcts == null) ? new ArrayList<String>() : abstrcts;
if (lat >= -90.0d && lat <= 90.0d && lon >= -180.0d && lon <= 180.0d) {
this.lon = lon;
this.lat = lat;
@ -288,10 +288,9 @@ dc_rights
return sb.substring(0, sb.length() - 1);
}
public String dc_description() {
if (this.description == null)
return dc_title();
return this.description.toString();
public String[] dc_description() {
if (descriptions == null) return new String[0];
return this.descriptions.toArray(new String[this.descriptions.size()]);
}
public String dc_publisher() {
@ -646,9 +645,7 @@ dc_rights
this.sections.addAll(doc.sections);
this.titles.addAll(doc.titles());
this.keywords.addAll(doc.getKeywords());
if (this.description.length() > 0) this.description.append('\n');
this.description.append(doc.dc_description());
for (String d: doc.dc_description()) this.descriptions.add(d);
if (!(this.text instanceof ByteArrayOutputStream)) {
this.text = new ByteArrayOutputStream();
@ -779,7 +776,7 @@ dc_rights
final StringBuilder authors = new StringBuilder(80);
final StringBuilder publishers = new StringBuilder(80);
final StringBuilder subjects = new StringBuilder(80);
final StringBuilder description = new StringBuilder(80);
final List<String> descriptions = new ArrayList<String>();
final Collection<String> titles = new LinkedHashSet<String>();
final Collection<String> sectionTitles = new LinkedHashSet<String>();
final Map<DigestURI, Properties> anchors = new HashMap<DigestURI, Properties>();
@ -810,9 +807,7 @@ dc_rights
titles.addAll(doc.titles());
sectionTitles.addAll(Arrays.asList(doc.getSectionTitles()));
if (description.length() > 0) description.append("\n");
description.append(doc.dc_description());
for (String d: doc.dc_description()) descriptions.add(d);
if (doc.getTextLength() > 0) {
if (docTextLength > 0) content.write('\n');
@ -851,7 +846,7 @@ dc_rights
authors.toString(),
publishers.toString(),
sectionTitles.toArray(new String[sectionTitles.size()]),
description.toString(),
descriptions,
lon, lat,
content.getBytes(),
anchors,

View File

@ -37,12 +37,14 @@ import java.util.List;
import java.util.Locale;
import java.util.TreeMap;
import org.apache.solr.common.params.MultiMapSolrParams;
import net.yacy.cora.date.ISO8601Formatter;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.Document;
import net.yacy.kelondro.data.meta.DigestURI;
public class DCEntry extends TreeMap<String, String> {
public class DCEntry extends MultiMapSolrParams {
private static final long serialVersionUID = -2050291583515701559L;
@ -55,7 +57,7 @@ public class DCEntry extends TreeMap<String, String> {
public static final DCEntry poison = new DCEntry();
public DCEntry() {
super((Collator) insensitiveCollator.clone());
super(new TreeMap<String, String[]>((Collator) insensitiveCollator.clone()));
}
public DCEntry(
@ -67,14 +69,14 @@ public class DCEntry extends TreeMap<String, String> {
double lat,
double lon
) {
super((Collator) insensitiveCollator.clone());
this.put("dc:identifier", url.toNormalform(true));
this.put("dc:date", ISO8601Formatter.FORMATTER.format(date));
this.put("dc:title", title);
this.put("dc:creator", author);
this.put("dc:description", body);
this.put("geo:lat", Double.toString(lat));
this.put("geo:long", Double.toString(lon));
super(new TreeMap<String, String[]>((Collator) insensitiveCollator.clone()));
this.getMap().put("dc:identifier", new String[]{url.toNormalform(true)});
this.getMap().put("dc:date", new String[]{ISO8601Formatter.FORMATTER.format(date)});
this.getMap().put("dc:title", new String[]{title});
this.getMap().put("dc:creator", new String[]{author});
this.getMap().put("dc:description", new String[]{body});
this.getMap().put("geo:lat", new String[]{Double.toString(lat)});
this.getMap().put("geo:long", new String[]{Double.toString(lon)});
}
/*
@ -222,14 +224,12 @@ public class DCEntry extends TreeMap<String, String> {
return t;
}
public String getDescription() {
String t = this.get("body");
if (t == null) t = this.get("dc:description");
if (t == null) t = this.get("dc:subject");
if (t == null) t = this.get("categories");
t = stripCDATA(t);
if (t == null) return "";
return t;
public List<String> getDescriptions() {
String[] t = this.getParams("dc:description");
List<String> descriptions = new ArrayList<String>();
if (t == null) return descriptions;
for (String s: t) descriptions.add(stripCDATA(s));
return descriptions;
}
public String[] getSubject() {
@ -280,9 +280,9 @@ public class DCEntry extends TreeMap<String, String> {
getCreator(),
getPublisher(),
null,
"",
getDescriptions(),
getLon(), getLat(),
getDescription(),
"",
null,
null,
null,

View File

@ -169,7 +169,7 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
//System.out.println("BUFFER-SIZE=" + buffer.length());
final String value = buffer.toString().trim();
if (this.elementName != null) {
this.surrogate.put(this.elementName, value);
this.surrogate.getMap().put(this.elementName, new String[]{value});
}
this.buffer.setLength(0);
this.parsingValue = false;
@ -179,9 +179,9 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
value.replaceAll(";", ",");
String oldcontent = this.surrogate.get(this.elementName);
if (oldcontent == null) {
this.surrogate.put(this.elementName, value);
this.surrogate.getMap().put(this.elementName, new String[]{value});
} else {
this.surrogate.put(this.elementName, oldcontent + ";" + value);
this.surrogate.getMap().put(this.elementName, new String[]{oldcontent + ";" + value});
}
}
this.buffer.setLength(0);
@ -222,7 +222,7 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
System.out.println("Publisher: " + s.getPublisher());
System.out.println("URL: " + s.getIdentifier(true));
System.out.println("Language: " + s.getLanguage());
System.out.println("Body: " + s.getDescription());
System.out.println("Body: " + s.getDescriptions().toString());
}
} catch (final IOException e) {
ConcurrentLog.logException(e);

View File

@ -124,44 +124,36 @@ public class audioTagParser extends AbstractParser implements Parser {
titles.add(filename);
// text
final List<String> descriptions = new ArrayList<String>(7);
final StringBuilder text = new StringBuilder(500);
final char space = ' ';
text.append(tag.getFirst(FieldKey.ARTIST));
text.append(space);
text.append(tag.getFirst(FieldKey.ALBUM));
text.append(space);
text.append(tag.getFirst(FieldKey.TITLE));
text.append(space);
text.append(tag.getFirst(FieldKey.COMMENT));
text.append(space);
text.append(tag.getFirst(FieldKey.LYRICS));
text.append(space);
text.append(tag.getFirst(FieldKey.TAGS));
text.append(space);
text.append(tag.getFirst(FieldKey.GENRE));
text.append(space);
String field = tag.getFirst(FieldKey.ARTIST);
descriptions.add(FieldKey.ARTIST.name() + ": " + field);
text.append(field); text.append(space);
field = tag.getFirst(FieldKey.ALBUM);
descriptions.add(FieldKey.ALBUM.name() + ": " + field);
text.append(field); text.append(space);
field = tag.getFirst(FieldKey.TITLE);
descriptions.add(FieldKey.TITLE.name() + ": " + field);
text.append(field); text.append(space);
field = tag.getFirst(FieldKey.COMMENT);
descriptions.add(FieldKey.COMMENT.name() + ": " + field);
text.append(field); text.append(space);
field = tag.getFirst(FieldKey.LYRICS);
descriptions.add(FieldKey.LYRICS.name() + ": " + field);
text.append(field); text.append(space);
field = tag.getFirst(FieldKey.TAGS);
descriptions.add(FieldKey.TAGS.name() + ": " + field);
text.append(field); text.append(space);
field = tag.getFirst(FieldKey.GENRE);
descriptions.add(FieldKey.GENRE.name() + ": " + field);
text.append(field); text.append(space);
text.append(location.toTokens());
// dc:subject
final String[] subject = new String[1];
subject[0] = tag.getFirst(FieldKey.GENRE);
// description
final StringBuilder desc = new StringBuilder(500);
final String sep = " - ";
int count = desc.length();
desc.append(tag.getFirst(FieldKey.ARTIST));
if(desc.length() > count) {
desc.append(sep);
count = desc.length();
}
desc.append(tag.getFirst(FieldKey.ALBUM));
if(desc.length() > count) {
desc.append(sep);
count = desc.length();
}
desc.append(tag.getFirst(FieldKey.TITLE));
docs = new Document[]{new Document(
location,
mime,
@ -173,7 +165,7 @@ public class audioTagParser extends AbstractParser implements Parser {
tag.getFirst(FieldKey.ARTIST), // author
location.getHost(), // publisher
null, // sections
desc.toString(), // abstrct
descriptions, // abstrct
0.0f, 0.0f, // lon, lat
text.toString(), // text
null,

View File

@ -590,12 +590,6 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
}
if (this.titles.size() == 0) {
// take description tag
s = getDescription();
if (!s.isEmpty()) this.titles.add(s);
}
// extract headline from file name
ArrayList<String> t = new ArrayList<String>();
t.addAll(this.titles);
@ -768,11 +762,13 @@ public class ContentScraper extends AbstractScraper implements Scraper {
return false;
}
public String getDescription() {
public List<String> getDescriptions() {
String s = this.metas.get("description");
if (s == null) s = this.metas.get("dc.description");
if (s == null) return EMPTY_STRING;
return s;
List<String> descriptions = new ArrayList<String>();
if (s == null) return descriptions;
descriptions.add(s);
return descriptions;
}
public String getContentType() {

View File

@ -135,7 +135,7 @@ public class htmlParser extends AbstractParser implements Parser {
scraper.getAuthor(),
scraper.getPublisher(),
sections,
scraper.getDescription(),
scraper.getDescriptions(),
scraper.getLon(), scraper.getLat(),
scraper.getText(),
scraper.getAnchors(),

View File

@ -34,9 +34,11 @@ import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Properties;
import java.util.Set;
@ -100,7 +102,7 @@ public class genericImageParser extends AbstractParser implements Parser {
String title = null;
String author = null;
String keywords = null;
String description = null;
List<String> descriptions = new ArrayList<String>();
String filename = location.getFileName();
String ext = MultiProtocolURI.getFileExtension(filename);
double gpslat = 0;
@ -179,10 +181,11 @@ public class genericImageParser extends AbstractParser implements Parser {
if (keywords == null || keywords.isEmpty()) keywords = props.get("Category");
if (keywords == null || keywords.isEmpty()) keywords = props.get("Supplemental Category(s)");
description = props.get("Caption/Abstract");
if (description == null || description.isEmpty()) description = props.get("Country/Primary Location");
if (description == null || description.isEmpty()) description = props.get("Province/State");
if (description == null || description.isEmpty()) description = props.get("Copyright Notice");
String description;
description = props.get("Caption/Abstract"); if (description != null && description.length() > 0) descriptions.add("Abstract: " + description);
description = props.get("Country/Primary Location"); if (description != null && description.length() > 0) descriptions.add("Location: " + description);
description = props.get("Province/State"); if (description != null && description.length() > 0) descriptions.add("State: " + description);
description = props.get("Copyright Notice"); if (description != null && description.length() > 0) descriptions.add("Copyright: " + description);
} catch (final JpegProcessingException e) {
//Log.logException(e);
@ -212,7 +215,7 @@ public class genericImageParser extends AbstractParser implements Parser {
author == null ? "" : author, // author
location.getHost(), // Publisher
new String[]{}, // sections
description == null ? "" : description, // description
descriptions, // description
gpslon, gpslat, // location
infoString, // content text
anchors, // anchors

View File

@ -29,8 +29,10 @@ package net.yacy.document.parser;
import java.io.File;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
@ -176,6 +178,8 @@ public class odtParser extends AbstractParser implements Parser {
// create the parser document
Document[] docs = null;
final byte[] contentBytes = (writer == null) ? null : UTF8.getBytes(writer.toString());
List<String> descriptions = new ArrayList<String>();
if (docDescription != null && docDescription.length() > 0) descriptions.add(docDescription);
docs = new Document[]{new Document(
location,
mimeType,
@ -187,7 +191,7 @@ public class odtParser extends AbstractParser implements Parser {
docAuthor,
"",
null,
docDescription,
descriptions,
0.0f, 0.0f,
contentBytes,
null,

View File

@ -29,8 +29,10 @@ package net.yacy.document.parser;
import java.io.File;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
@ -161,6 +163,8 @@ public class ooxmlParser extends AbstractParser implements Parser {
// create the parser document
Document[] docs = null;
final byte[] contentBytes = (writer == null) ? null : UTF8.getBytes(writer.toString());
List<String> descriptions = new ArrayList<String>();
if (docDescription != null && docDescription.length() > 0) descriptions.add(docDescription);
docs = new Document[]{new Document(
location,
mimeType,
@ -172,7 +176,7 @@ public class ooxmlParser extends AbstractParser implements Parser {
docAuthor,
"",
null,
docDescription,
descriptions,
0.0f, 0.0f,
contentBytes,
null,

View File

@ -59,7 +59,7 @@ public class rdfParser extends AbstractParser implements Parser {
String all = "rdfdatasource";
doc = new Document(url, mimeType, charset, null, null, null, singleList(""), "",
"", null, "", 0, 0, all, null, null, null, false);
"", null, new ArrayList<String>(0), 0, 0, all, null, null, null, false);
docs.add(doc);

View File

@ -12,6 +12,7 @@ import java.io.InputStreamReader;
import java.io.Reader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Set;
@ -78,7 +79,7 @@ public class RDFaParser extends AbstractParser implements Parser {
}
Document doc = new Document(url, mimeType, charset, null, null, null, singleList(""), "",
"", null, "", 0, 0, null, null, null, null, false);
"", null, new ArrayList<String>(0), 0, 0, null, null, null, null, false);
try {
if (allTriples.length > 0)
@ -137,7 +138,7 @@ public class RDFaParser extends AbstractParser implements Parser {
}
Document doc = new Document(url, mimeType, charset, null, null, null, singleList(""), "",
"", null, "", 0, 0, all, null, null, null, false);
"", null, new ArrayList<String>(0), 0, 0, all, null, null, null, false);
return doc;
}

View File

@ -95,7 +95,7 @@ public class rssParser extends AbstractParser implements Parser {
item.getAuthor(),
item.getCopyright(),
new String[0],
item.getDescription(),
item.getDescriptions(),
item.getLon(),
item.getLat(),
null,

View File

@ -90,7 +90,7 @@ public class sitemapParser extends AbstractParser implements Parser {
"",
"",
new String[0],
"",
new ArrayList<String>(),
0.0f, 0.0f,
null,
null,

View File

@ -29,7 +29,9 @@ package net.yacy.document.parser;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
@ -76,7 +78,7 @@ public class swfParser extends AbstractParser implements Parser {
String urlnr = null;
final String linebreak = System.getProperty("line.separator");
final String[] sections = null;
final String abstrct = null;
final List<String> abstrct = new ArrayList<String>();
//TreeSet images = null;
final Map<DigestURI, Properties> anchors = new HashMap<DigestURI, Properties>();
int urls = 0;

View File

@ -32,9 +32,11 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Properties;
import net.yacy.cora.document.UTF8;
@ -207,6 +209,7 @@ public class vcfParser extends AbstractParser implements Parser {
final String[] sections = parsedNames.toArray(new String[parsedNames.size()]);
final byte[] text = UTF8.getBytes(parsedDataText.toString());
final List<String> descriptions = new ArrayList<String>(1); descriptions.add("vCard");
return new Document[]{new Document(
url, // url of the source document
mimeType, // the documents mime type
@ -218,7 +221,7 @@ public class vcfParser extends AbstractParser implements Parser {
"", // TODO: AUTHOR
"", // the publisher
sections, // an array of section headlines
"vCard", // an abstract
descriptions, // an abstract
0.0f, 0.0f,
text, // the parsed document text
anchors, // a map of extracted anchors

View File

@ -28,6 +28,8 @@
package net.yacy.document.parser;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.AbstractParser;
@ -90,16 +92,10 @@ public class vsdParser extends AbstractParser implements Parser {
title = summary.getTitle();
}
String abstrct = null;
abstrct = ((contents.length() > 80)? contents.substring(0, 80) : contents.trim()).
replaceAll("\r\n"," ").
replaceAll("\n"," ").
replaceAll("\r"," ").
replaceAll("\t"," ");
List<String> abstrct = new ArrayList<String>();
if (contents.length() > 0) abstrct.add(((contents.length() > 80) ? contents.substring(0, 80) : contents.trim()).replaceAll("\r\n"," ").replaceAll("\n"," ").replaceAll("\r"," ").replaceAll("\t"," "));
if (title == null) {
title = abstrct;
}
if (title == null) title = location.toNormalform(true);
// As the result of parsing this function must return a plasmaParserDocument object
return new Document[]{new Document(

View File

@ -312,8 +312,8 @@ public class URIMetadataNode {
return getString(CollectionSchema.text_t);
}
public String getDescription() {
return getString(CollectionSchema.description);
public ArrayList<String> getDescription() {
return getStringList(CollectionSchema.description_txt);
}
public boolean isOlder(URIMetadataRow other) {

View File

@ -2916,7 +2916,7 @@ public final class Switchboard extends serverSwitch {
}
final String title = scraper == null ? url.toNormalform(true) : scraper.dc_title();
final String description = scraper.dc_description();
final String description = scraper.dc_description().length > 0 ? scraper.dc_description()[0] : "";
// add the url to the crawl stack
this.crawler.removePassive(handle); // if there is an old entry, delete it

View File

@ -806,10 +806,11 @@ public final class Fulltext {
} else {
BlockingQueue<SolrDocument> docs = Fulltext.this.getDefaultConnector().concurrentDocumentsByQuery(CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", 0, 100000000, 10 * 60 * 60 * 1000, 100,
CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName(), CollectionSchema.title.getSolrFieldName(),
CollectionSchema.author.getSolrFieldName(), CollectionSchema.description.getSolrFieldName(), CollectionSchema.size_i.getSolrFieldName(), CollectionSchema.last_modified.getSolrFieldName());
CollectionSchema.author.getSolrFieldName(), CollectionSchema.description_txt.getSolrFieldName(), CollectionSchema.size_i.getSolrFieldName(), CollectionSchema.last_modified.getSolrFieldName());
SolrDocument doc;
ArrayList<?> title;
String url, author, description, hash;
String url, author, hash;
String[] descriptions;
Integer size;
Date date;
while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
@ -817,7 +818,7 @@ public final class Fulltext {
url = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName());
title = (ArrayList<?>) doc.getFieldValue(CollectionSchema.title.getSolrFieldName());
author = (String) doc.getFieldValue(CollectionSchema.author.getSolrFieldName());
description = (String) doc.getFieldValue(CollectionSchema.description.getSolrFieldName());
descriptions = (String[]) doc.getFieldValue(CollectionSchema.description_txt.getSolrFieldName());
size = (Integer) doc.getFieldValue(CollectionSchema.size_i.getSolrFieldName());
date = (Date) doc.getFieldValue(CollectionSchema.last_modified.getSolrFieldName());
if (this.pattern != null && !this.pattern.matcher(url).matches()) continue;
@ -832,7 +833,9 @@ public final class Fulltext {
if (title != null) pw.println("<title>" + CharacterCoding.unicode2xml((String) title.iterator().next(), true) + "</title>");
pw.println("<link>" + MultiProtocolURI.escape(url) + "</link>");
if (author != null && !author.isEmpty()) pw.println("<author>" + CharacterCoding.unicode2xml(author, true) + "</author>");
if (description != null && !description.isEmpty()) pw.println("<description>" + CharacterCoding.unicode2xml(description, true) + "</description>");
if (descriptions != null && descriptions.length > 0) {
for (String d: descriptions) pw.println("<description>" + CharacterCoding.unicode2xml(d, true) + "</description>");
}
if (date != null) pw.println("<pubDate>" + HeaderFramework.formatRFC1123(date) + "</pubDate>");
if (size != null) pw.println("<yacy:size>" + size.intValue() + "</yacy:size>");
pw.println("<guid isPermaLink=\"false\">" + hash + "</guid>");

View File

@ -625,7 +625,7 @@ public class Segment {
if (this.fulltext.getDefaultConfiguration().contains(CollectionSchema.host_id_s)) {
uniquecheck: for (CollectionSchema[] checkfields: new CollectionSchema[][]{
{CollectionSchema.title, CollectionSchema.title_exact_signature_l, CollectionSchema.title_unique_b},
{CollectionSchema.description, CollectionSchema.description_exact_signature_l, CollectionSchema.description_unique_b}}) {
{CollectionSchema.description_txt, CollectionSchema.description_exact_signature_l, CollectionSchema.description_unique_b}}) {
CollectionSchema checkfield = checkfields[0];
CollectionSchema signaturefield = checkfields[1];
CollectionSchema uniquefield = checkfields[2];

View File

@ -245,16 +245,16 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
add(doc, CollectionSchema.title_words_val, cv);
}
String description = md.snippet(); if (description == null) description = "";
if (allAttr || contains(CollectionSchema.description)) add(doc, CollectionSchema.description, description);
if (allAttr || contains(CollectionSchema.description_count_i)) add(doc, CollectionSchema.description_count_i, 1);
String description = md.snippet();
boolean description_exist = description != null;
if (description == null) description = "";
if (allAttr || contains(CollectionSchema.description_txt)) add(doc, CollectionSchema.description_txt, description_exist ? new String[]{description} : new String[0]);
if (allAttr || contains(CollectionSchema.description_count_i)) add(doc, CollectionSchema.description_count_i, description_exist ? 1 : 0);
if (allAttr || contains(CollectionSchema.description_chars_val)) {
Integer[] cv = new Integer[]{new Integer(description.length())};
add(doc, CollectionSchema.description_chars_val, cv);
add(doc, CollectionSchema.description_chars_val, description_exist ? new Integer[]{new Integer(description.length())} : new Integer[0]);
}
if (allAttr || contains(CollectionSchema.description_words_val)) {
Integer[] cv = new Integer[]{new Integer(CommonPattern.SPACE.split(description).length)};
add(doc, CollectionSchema.description_words_val, cv);
add(doc, CollectionSchema.description_words_val, description_exist ? new Integer[]{new Integer(description.length() == 0 ? 0 : CommonPattern.SPACE.split(description).length)} : new Integer[0]);
}
String filename = digestURI.getFileName();
@ -424,23 +424,21 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
add(doc, CollectionSchema.title_words_val, cv);
}
String description = document.dc_description();
List<String> descriptions = new ArrayList<String>();
for (String s: CommonPattern.NEWLINE.split(description)) descriptions.add(s);
if (allAttr || contains(CollectionSchema.description)) {
add(doc, CollectionSchema.description, description);
if ((allAttr || contains(CollectionSchema.description_exact_signature_l)) && description != null && description.length() > 0) {
add(doc, CollectionSchema.description_exact_signature_l, EnhancedTextProfileSignature.getSignatureLong(description));
String[] descriptions = document.dc_description();
if (allAttr || contains(CollectionSchema.description_txt)) {
add(doc, CollectionSchema.description_txt, descriptions);
if ((allAttr || contains(CollectionSchema.description_exact_signature_l)) && descriptions != null && descriptions.length > 0) {
add(doc, CollectionSchema.description_exact_signature_l, EnhancedTextProfileSignature.getSignatureLong(descriptions));
}
}
if (allAttr || contains(CollectionSchema.description_count_i)) add(doc, CollectionSchema.description_count_i, descriptions.size());
if (allAttr || contains(CollectionSchema.description_count_i)) add(doc, CollectionSchema.description_count_i, descriptions.length);
if (allAttr || contains(CollectionSchema.description_chars_val)) {
ArrayList<Integer> cv = new ArrayList<Integer>(descriptions.size());
ArrayList<Integer> cv = new ArrayList<Integer>(descriptions.length);
for (String s: descriptions) cv.add(new Integer(s.length()));
add(doc, CollectionSchema.description_chars_val, cv);
}
if (allAttr || contains(CollectionSchema.description_words_val)) {
ArrayList<Integer> cv = new ArrayList<Integer>(descriptions.size());
ArrayList<Integer> cv = new ArrayList<Integer>(descriptions.length);
for (String s: descriptions) cv.add(new Integer(CommonPattern.SPACE.split(s).length));
add(doc, CollectionSchema.description_words_val, cv);
}

View File

@ -75,7 +75,7 @@ public enum CollectionSchema implements SchemaDeclaration {
ip_s(SolrType.string, true, true, false, false, false, "ip of host of url (after DNS lookup)"),
author(SolrType.text_general, true, true, false, false, true, "content of author-tag"),
author_sxt(SolrType.string, true, true, true, false, false, "content of author-tag as copy-field from author. This is used for facet generation"),
description(SolrType.text_general, true, true, false, false, true, "content of description-tag"),
description_txt(SolrType.text_general, true, true, true, false, true, "content of description-tag(s)"),
description_exact_signature_l(SolrType.num_long, true, true, false, false, false, "the 64 bit hash of the org.apache.solr.update.processor.Lookup3Signature of description, used to compute description_unique_b"),
description_unique_b(SolrType.bool, true, true, false, false, false, "flag shows if description is unique in the whole index; if yes and another document appears with same description, the unique-flag is set to false"),
keywords(SolrType.text_general, true, true, false, false, true, "content of keywords tag; words are separated by space"),

View File

@ -489,7 +489,8 @@ public final class HTTPDFileHandler {
File f;
String size;
long sz;
String headline, author, description, publisher;
String headline, author, publisher;
List<String> descriptions;
int images, links;
ContentScraper scraper;
for (final String element : list) {
@ -503,14 +504,14 @@ public final class HTTPDFileHandler {
headline = t.size() > 0 ? t.iterator().next() : "";
author = scraper.getAuthor();
publisher = scraper.getPublisher();
description = scraper.getDescription();
descriptions = scraper.getDescriptions();
images = scraper.getImages().size();
links = scraper.getAnchors().size();
} else {
headline = null;
author = null;
publisher = null;
description = null;
descriptions = null;
images = 0;
links = 0;
}
@ -527,7 +528,11 @@ public final class HTTPDFileHandler {
aBuffer.append("<a href=\"" + path + element + "\">" + element + "</a><br/>");
if (author != null && author.length() > 0) aBuffer.append("Author: " + author + "<br/>");
if (publisher != null && publisher.length() > 0) aBuffer.append("Publisher: " + publisher + "<br/>");
if (description != null && description.length() > 0) aBuffer.append("Description: " + description + "<br/>");
if (descriptions != null && descriptions.size() > 0) {
for (String d: descriptions) {
aBuffer.append("Description: " + d + "<br/>");
}
}
aBuffer.append(GenericFormatter.SHORT_DAY_FORMATTER.format(new Date(f.lastModified())) + ", " + size + ((images > 0) ? ", " + images + " images" : "") + ((links > 0) ? ", " + links + " links" : "") + "<br/></li>\n");
}
}