enhanced snippets: remove lines that are identical to the title and

choose longer versions if possible. Prefer the description part.
This commit is contained in:
Michael Peter Christen 2014-05-06 16:48:50 +02:00
parent e84e07399a
commit 4e734815e8
7 changed files with 67 additions and 26 deletions

View File

@ -26,6 +26,7 @@ import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
@ -147,7 +148,7 @@ public class GSAResponseWriter implements QueryResponseWriter {
DocList response = ((ResultContext) rsp.getValues().get("response")).docs;
@SuppressWarnings("unchecked")
SimpleOrderedMap<Object> highlighting = (SimpleOrderedMap<Object>) rsp.getValues().get("highlighting");
Map<String, List<String>> snippets = OpensearchResponseWriter.highlighting(highlighting);
Map<String, LinkedHashSet<String>> snippets = OpensearchResponseWriter.highlighting(highlighting);
Map<Object,Object> context = request.getContext();
// parse response header
@ -241,6 +242,7 @@ public class GSAResponseWriter implements QueryResponseWriter {
List<String> collections = new ArrayList<String>();
int size = 0;
boolean title_written = false; // the solr index may contain several; we take only the first which should be the visible tag in <title></title>
String title = null;
for (IndexableField value: fields) {
String fieldName = value.name();
@ -262,7 +264,8 @@ public class GSAResponseWriter implements QueryResponseWriter {
continue;
}
if (CollectionSchema.title.getSolrFieldName().equals(fieldName) && !title_written) {
OpensearchResponseWriter.solitaireTag(writer, GSAToken.T.name(), highlight(value.stringValue(), query));
title = value.stringValue();
OpensearchResponseWriter.solitaireTag(writer, GSAToken.T.name(), highlight(title, query));
//texts.add(value.stringValue());
title_written = true;
continue;
@ -296,8 +299,9 @@ public class GSAResponseWriter implements QueryResponseWriter {
//System.out.println("superfluous field: " + fieldName + ": " + value.stringValue()); // this can be avoided setting the enableLazyFieldLoading = false in solrconfig.xml
}
// compute snippet from texts
List<String> snippet = urlhash == null ? null : snippets.get(urlhash);
OpensearchResponseWriter.solitaireTag(writer, GSAToken.S.name(), snippet == null || snippet.size() == 0 ? (descriptions.size() > 0 ? descriptions.get(0) : "") : snippet.get(0));
LinkedHashSet<String> snippet = urlhash == null ? null : snippets.get(urlhash);
OpensearchResponseWriter.removeSubsumedTitle(snippet, title);
OpensearchResponseWriter.solitaireTag(writer, GSAToken.S.name(), snippet == null || snippet.size() == 0 ? (descriptions.size() > 0 ? descriptions.get(0) : "") : OpensearchResponseWriter.getLargestSnippet(snippet));
OpensearchResponseWriter.solitaireTag(writer, GSAToken.GD.name(), descriptions.size() > 0 ? descriptions.get(0) : "");
String cols = collections.toString();
if (collections.size() > 0) OpensearchResponseWriter.solitaireTag(writer, "COLS" /*SPECIAL!*/, collections.size() > 1 ? cols.substring(1, cols.length() - 1).replaceAll(" ", "") : collections.get(0));

View File

@ -27,9 +27,12 @@ import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import net.yacy.cora.document.feed.RSSMessage;
import net.yacy.cora.document.id.MultiProtocolURL;
@ -114,7 +117,7 @@ public class OpensearchResponseWriter implements QueryResponseWriter {
SimpleOrderedMap<Object> facetFields = facetCounts == null || facetCounts.size() == 0 ? null : (SimpleOrderedMap<Object>) facetCounts.get("facet_fields");
@SuppressWarnings("unchecked")
SimpleOrderedMap<Object> highlighting = (SimpleOrderedMap<Object>) values.get("highlighting");
Map<String, List<String>> snippets = highlighting(highlighting);
Map<String, LinkedHashSet<String>> snippets = highlighting(highlighting);
// parse response header
ResHead resHead = new ResHead();
@ -233,17 +236,18 @@ public class OpensearchResponseWriter implements QueryResponseWriter {
// compute snippet from texts
solitaireTag(writer, RSSMessage.Token.title.name(), title.length() == 0 ? (texts.size() == 0 ? "" : texts.get(0)) : title);
List<String> snippet = urlhash == null ? null : snippets.get(urlhash);
LinkedHashSet<String> snippet = urlhash == null ? null : snippets.get(urlhash);
String tagname = RSSMessage.Token.description.name();
if (snippet == null || snippet.size() == 0) {
for (String d: descriptions) {
writer.write("<"); writer.write(tagname); writer.write('>');
XML.escapeCharData(snippet == null || snippet.size() == 0 ? d : snippet.get(0), writer);
XML.escapeCharData(d, writer);
writer.write("</"); writer.write(tagname); writer.write(">\n");
}
} else {
OpensearchResponseWriter.removeSubsumedTitle(snippet, title);
writer.write("<"); writer.write(tagname); writer.write('>');
XML.escapeCharData(snippet.get(0), writer);
XML.escapeCharData(OpensearchResponseWriter.getLargestSnippet(snippet), writer);
writer.write("</"); writer.write(tagname); writer.write(">\n");
}
// open: where do we get the subject?
@ -296,8 +300,8 @@ public class OpensearchResponseWriter implements QueryResponseWriter {
* @return a map from urlhashes to a list of snippets for that url
*/
@SuppressWarnings("unchecked")
public static Map<String, List<String>> highlighting(final SimpleOrderedMap<Object> val) {
Map<String, List<String>> snippets = new HashMap<String, List<String>>();
public static Map<String, LinkedHashSet<String>> highlighting(final SimpleOrderedMap<Object> val) {
Map<String, LinkedHashSet<String>> snippets = new HashMap<String, LinkedHashSet<String>>();
if (val == null) return snippets;
int sz = val.size();
Object v, vv;
@ -306,7 +310,7 @@ public class OpensearchResponseWriter implements QueryResponseWriter {
v = val.getVal(i);
if (v instanceof SimpleOrderedMap) {
int sz1 = ((SimpleOrderedMap<Object>) v).size();
List<String> t = new ArrayList<String>(sz1);
LinkedHashSet<String> t = new LinkedHashSet<String>();
for (int j = 0; j < sz1; j++) {
vv = ((SimpleOrderedMap<Object>) v).getVal(j);
if (vv instanceof String[]) {
@ -319,6 +323,30 @@ public class OpensearchResponseWriter implements QueryResponseWriter {
return snippets;
}
// pre-compiled pattern for the Solr highlighting markers inserted via hl.simple.pre/post
final static Pattern keymarks = Pattern.compile("<b>|</b>");

/**
 * Removes from the snippet set every snippet that merely duplicates the title:
 * a snippet is dropped when (after lower-casing and stripping the &lt;b&gt;/&lt;/b&gt;
 * highlighting markers) it is contained in the title, or the title is contained in it.
 * The comparison is case-insensitive; the set is modified in place.
 * @param snippets the snippet set to filter; may be null or empty (then nothing happens)
 * @param title the document title to compare against; may be null or empty (then nothing happens)
 */
public static void removeSubsumedTitle(LinkedHashSet<String> snippets, String title) {
    if (title == null || title.isEmpty() || snippets == null || snippets.isEmpty()) return;
    snippets.remove(title); // fast path: exact, non-normalized duplicate of the title
    final String titleLC = title.toLowerCase();
    final Iterator<String> i = snippets.iterator();
    while (i.hasNext()) {
        // normalize the snippet: lower-case, then strip highlighting markers
        final String snippetLC = keymarks.matcher(i.next().toLowerCase()).replaceAll("");
        // drop the snippet if either string subsumes the other
        if (titleLC.contains(snippetLC) || snippetLC.contains(titleLC)) i.remove();
    }
}
/**
 * Picks the longest snippet from the given set, considering only snippets that
 * contain a space at a position &gt; 0 (i.e. multi-word snippets; a single word or
 * a snippet starting with a space never qualifies).
 * @param snippets the candidate snippets; may be null or empty
 * @return the longest qualifying snippet, or null if the set is null/empty
 *         (NOTE(review): also null when no snippet contains an interior space —
 *         callers should be prepared for that; verify against call sites)
 */
public static String getLargestSnippet(LinkedHashSet<String> snippets) {
    if (snippets == null || snippets.isEmpty()) return null;
    String best = null;
    for (final String candidate : snippets) {
        if (candidate.indexOf(' ') <= 0) continue; // reject single words and leading-space snippets
        if (best == null || candidate.length() > best.length()) best = candidate;
    }
    return best;
}
/**
 * Writes an opening XML tag for the given tag name, followed by a newline.
 * @param writer the target writer
 * @param tag the tag name (written verbatim, not escaped)
 * @throws IOException if the writer fails
 */
public static void openTag(final Writer writer, final String tag) throws IOException {
    writer.write("<" + tag + ">\n");
}

View File

@ -26,6 +26,7 @@ import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
@ -98,7 +99,7 @@ public class YJsonResponseWriter implements QueryResponseWriter {
SimpleOrderedMap<Object> facetFields = facetCounts == null || facetCounts.size() == 0 ? null : (SimpleOrderedMap<Object>) facetCounts.get("facet_fields");
@SuppressWarnings("unchecked")
SimpleOrderedMap<Object> highlighting = (SimpleOrderedMap<Object>) values.get("highlighting");
Map<String, List<String>> snippets = OpensearchResponseWriter.highlighting(highlighting);
Map<String, LinkedHashSet<String>> snippets = OpensearchResponseWriter.highlighting(highlighting);
// parse response header
ResHead resHead = new ResHead();
@ -213,8 +214,9 @@ public class YJsonResponseWriter implements QueryResponseWriter {
// compute snippet from texts
solitaireTag(writer, "path", path.toString());
solitaireTag(writer, "title", title.length() == 0 ? (texts.size() == 0 ? path.toString() : texts.get(0)) : title);
List<String> snippet = urlhash == null ? null : snippets.get(urlhash);
writer.write("\"description\":\""); writer.write(serverObjects.toJSON(snippet == null || snippet.size() == 0 ? (descriptions.size() > 0 ? descriptions.get(0) : "") : snippet.get(0))); writer.write("\"\n}\n");
LinkedHashSet<String> snippet = urlhash == null ? null : snippets.get(urlhash);
OpensearchResponseWriter.removeSubsumedTitle(snippet, title);
writer.write("\"description\":\""); writer.write(serverObjects.toJSON(snippet == null || snippet.size() == 0 ? (descriptions.size() > 0 ? descriptions.get(0) : "") : OpensearchResponseWriter.getLargestSnippet(snippet))); writer.write("\"\n}\n");
if (i < responseCount - 1) {
writer.write(",\n".toCharArray());
}

View File

@ -181,7 +181,7 @@ public class GSAsearchServlet extends HttpServlet {
CollectionSchema.size_i.getSolrFieldName());
post.put("hl", "true");
post.put("hl.q", originalQuery);
post.put("hl.fl", CollectionSchema.h1_txt.getSolrFieldName() + "," + CollectionSchema.h2_txt.getSolrFieldName() + "," + CollectionSchema.text_t.getSolrFieldName());
post.put("hl.fl", CollectionSchema.description_txt + "," + CollectionSchema.h4_txt.getSolrFieldName() + "," + CollectionSchema.h3_txt.getSolrFieldName() + "," + CollectionSchema.h2_txt.getSolrFieldName() + "," + CollectionSchema.h1_txt.getSolrFieldName() + "," + CollectionSchema.text_t.getSolrFieldName());
post.put("hl.alternateField", CollectionSchema.description_txt.getSolrFieldName());
post.put("hl.simple.pre", "<b>");
post.put("hl.simple.post", "</b>");

View File

@ -180,7 +180,7 @@ public class SolrSelectServlet extends HttpServlet {
if ((responseWriter instanceof YJsonResponseWriter || responseWriter instanceof OpensearchResponseWriter) && "true".equals(mmsp.get("hl", "true"))) {
// add options for snippet generation
if (!mmsp.getMap().containsKey("hl.q")) mmsp.getMap().put("hl.q", new String[]{q});
if (!mmsp.getMap().containsKey("hl.fl")) mmsp.getMap().put("hl.fl", new String[]{CollectionSchema.h1_txt.getSolrFieldName() + "," + CollectionSchema.h2_txt.getSolrFieldName() + "," + CollectionSchema.text_t.getSolrFieldName()});
if (!mmsp.getMap().containsKey("hl.fl")) mmsp.getMap().put("hl.fl", new String[]{CollectionSchema.description_txt + "," + CollectionSchema.h4_txt.getSolrFieldName() + "," + CollectionSchema.h3_txt.getSolrFieldName() + "," + CollectionSchema.h2_txt.getSolrFieldName() + "," + CollectionSchema.h1_txt.getSolrFieldName() + "," + CollectionSchema.text_t.getSolrFieldName()});
if (!mmsp.getMap().containsKey("hl.alternateField")) mmsp.getMap().put("hl.alternateField", new String[]{CollectionSchema.description_txt.getSolrFieldName()});
if (!mmsp.getMap().containsKey("hl.simple.pre")) mmsp.getMap().put("hl.simple.pre", new String[]{"<b>"});
if (!mmsp.getMap().containsKey("hl.simple.post")) mmsp.getMap().put("hl.simple.post", new String[]{"</b>"});

View File

@ -53,6 +53,7 @@ import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
@ -71,6 +72,7 @@ import net.yacy.cora.federate.opensearch.SRURSSConnector;
import net.yacy.cora.federate.solr.connector.RemoteSolrConnector;
import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.cora.federate.solr.instance.RemoteInstance;
import net.yacy.cora.federate.solr.responsewriter.OpensearchResponseWriter;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.order.Digest;
@ -999,7 +1001,7 @@ public final class Protocol {
}
}
private final static CollectionSchema[] snippetFields = new CollectionSchema[]{CollectionSchema.h1_txt, CollectionSchema.h2_txt, CollectionSchema.text_t};
private final static CollectionSchema[] snippetFields = new CollectionSchema[]{CollectionSchema.description_txt, CollectionSchema.h4_txt, CollectionSchema.h3_txt, CollectionSchema.h2_txt, CollectionSchema.h1_txt, CollectionSchema.text_t};
protected static int solrQuery(
final SearchEvent event,
@ -1025,14 +1027,14 @@ public final class Protocol {
//solrQuery.setHighlightRequireFieldMatch();
solrQuery.setHighlightSimplePost("</b>");
solrQuery.setHighlightSimplePre("<b>");
solrQuery.setHighlightSnippets(1);
solrQuery.setHighlightSnippets(5);
for (CollectionSchema field: snippetFields) solrQuery.addHighlightField(field.getSolrFieldName());
} else {
solrQuery.setHighlight(false);
}
boolean localsearch = target == null || target.equals(event.peers.mySeed());
Map<String, ReversibleScoreMap<String>> facets = new HashMap<String, ReversibleScoreMap<String>>(event.query.facetfields.size());
Map<String, String> snippets = new HashMap<String, String>(); // this will be a list of urlhash-snippet entries
Map<String, LinkedHashSet<String>> snippets = new HashMap<String, LinkedHashSet<String>>(); // this will be a list of urlhash-snippet entries
final QueryResponse[] rsp = new QueryResponse[]{null};
final SolrDocumentList[] docList = new SolrDocumentList[]{null};
{// encapsulate expensive solr QueryResponse object
@ -1122,7 +1124,9 @@ public final class Protocol {
if (rs.containsKey(field.getSolrFieldName())) {
List<String> s = rs.get(field.getSolrFieldName());
if (s.size() > 0) {
snippets.put(re.getKey(), s.get(0));
LinkedHashSet<String> ls = new LinkedHashSet<String>();
ls.addAll(s);
snippets.put(re.getKey(), ls);
continue nextsnippet;
}
}

View File

@ -33,6 +33,7 @@ import java.util.ConcurrentModificationException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.SortedMap;
@ -49,6 +50,7 @@ import net.yacy.cora.document.analysis.Classification.ContentDomain;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.solr.responsewriter.OpensearchResponseWriter;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.federate.yacy.Distribution;
import net.yacy.cora.lod.vocabulary.Tagging;
@ -150,7 +152,7 @@ public final class SearchEvent {
private final boolean deleteIfSnippetFail;
private long urlRetrievalAllTime;
private long snippetComputationAllTime;
private ConcurrentHashMap<String, String> snippets;
private ConcurrentHashMap<String, LinkedHashSet<String>> snippets;
private final boolean remote;
private SortedMap<byte[], ReferenceContainer<WordReference>> localSearchInclusion;
private final ScoreMap<String> ref; // reference score computation for the commonSense heuristic
@ -234,7 +236,7 @@ public final class SearchEvent {
this.topicNavigatorCount = navcfg.contains("topics") ? MAX_TOPWORDS : 0;
this.languageNavigator = navcfg.contains("language") ? new ConcurrentScoreMap<String>() : null;
this.vocabularyNavigator = new ConcurrentHashMap<String, ScoreMap<String>>();
this.snippets = new ConcurrentHashMap<String, String>();
this.snippets = new ConcurrentHashMap<String, LinkedHashSet<String>>();
this.secondarySearchSuperviser = (this.query.getQueryGoal().getIncludeHashes().size() > 1) ? new SecondarySearchSuperviser(this) : null; // generate abstracts only for combined searches
if (this.secondarySearchSuperviser != null) this.secondarySearchSuperviser.start();
this.secondarySearchThreads = null;
@ -701,7 +703,7 @@ public final class SearchEvent {
public void addNodes(
final List<URIMetadataNode> nodeList,
final Map<String, ReversibleScoreMap<String>> facets, // a map from a field name to scored values
final Map<String, String> solrsnippets, // a map from urlhash to snippet text
final Map<String, LinkedHashSet<String>> solrsnippets, // a map from urlhash to snippet text
final boolean local,
final String resourceName,
final int fullResource) {
@ -1218,9 +1220,10 @@ public final class SearchEvent {
Element<URIMetadataNode> localEntryElement = this.nodeStack.sizeQueue() > 0 ? this.nodeStack.poll() : null;
URIMetadataNode node = localEntryElement == null ? null : localEntryElement.getElement();
if (node != null) {
String solrsnippet = this.snippets.remove(ASCII.String(node.hash())); // we can remove this because it's used only once
if (solrsnippet != null && solrsnippet.length() > 0) {
final TextSnippet snippet = new TextSnippet(node.hash(), solrsnippet, true, ResultClass.SOURCE_CACHE, "");
LinkedHashSet<String> solrsnippet = this.snippets.remove(ASCII.String(node.hash())); // we can remove this because it's used only once
if (solrsnippet != null && solrsnippet.size() > 0) {
OpensearchResponseWriter.removeSubsumedTitle(solrsnippet, node.dc_title());
final TextSnippet snippet = new TextSnippet(node.hash(), OpensearchResponseWriter.getLargestSnippet(solrsnippet), true, ResultClass.SOURCE_CACHE, "");
ResultEntry re = new ResultEntry(node, this.query.getSegment(), this.peers, snippet, null, 0);
addResult(re);
success = true;