mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
enhanced snippets: remove lines which are identical to the title and
choose longer versions if possible. Prefer the description part.
This commit is contained in:
parent
e84e07399a
commit
4e734815e8
|
@ -26,6 +26,7 @@ import java.util.ArrayList;
|
|||
import java.util.Date;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
@ -147,7 +148,7 @@ public class GSAResponseWriter implements QueryResponseWriter {
|
|||
DocList response = ((ResultContext) rsp.getValues().get("response")).docs;
|
||||
@SuppressWarnings("unchecked")
|
||||
SimpleOrderedMap<Object> highlighting = (SimpleOrderedMap<Object>) rsp.getValues().get("highlighting");
|
||||
Map<String, List<String>> snippets = OpensearchResponseWriter.highlighting(highlighting);
|
||||
Map<String, LinkedHashSet<String>> snippets = OpensearchResponseWriter.highlighting(highlighting);
|
||||
Map<Object,Object> context = request.getContext();
|
||||
|
||||
// parse response header
|
||||
|
@ -241,6 +242,7 @@ public class GSAResponseWriter implements QueryResponseWriter {
|
|||
List<String> collections = new ArrayList<String>();
|
||||
int size = 0;
|
||||
boolean title_written = false; // the solr index may contain several; we take only the first which should be the visible tag in <title></title>
|
||||
String title = null;
|
||||
for (IndexableField value: fields) {
|
||||
String fieldName = value.name();
|
||||
|
||||
|
@ -262,7 +264,8 @@ public class GSAResponseWriter implements QueryResponseWriter {
|
|||
continue;
|
||||
}
|
||||
if (CollectionSchema.title.getSolrFieldName().equals(fieldName) && !title_written) {
|
||||
OpensearchResponseWriter.solitaireTag(writer, GSAToken.T.name(), highlight(value.stringValue(), query));
|
||||
title = value.stringValue();
|
||||
OpensearchResponseWriter.solitaireTag(writer, GSAToken.T.name(), highlight(title, query));
|
||||
//texts.add(value.stringValue());
|
||||
title_written = true;
|
||||
continue;
|
||||
|
@ -296,8 +299,9 @@ public class GSAResponseWriter implements QueryResponseWriter {
|
|||
//System.out.println("superfluous field: " + fieldName + ": " + value.stringValue()); // this can be avoided setting the enableLazyFieldLoading = false in solrconfig.xml
|
||||
}
|
||||
// compute snippet from texts
|
||||
List<String> snippet = urlhash == null ? null : snippets.get(urlhash);
|
||||
OpensearchResponseWriter.solitaireTag(writer, GSAToken.S.name(), snippet == null || snippet.size() == 0 ? (descriptions.size() > 0 ? descriptions.get(0) : "") : snippet.get(0));
|
||||
LinkedHashSet<String> snippet = urlhash == null ? null : snippets.get(urlhash);
|
||||
OpensearchResponseWriter.removeSubsumedTitle(snippet, title);
|
||||
OpensearchResponseWriter.solitaireTag(writer, GSAToken.S.name(), snippet == null || snippet.size() == 0 ? (descriptions.size() > 0 ? descriptions.get(0) : "") : OpensearchResponseWriter.getLargestSnippet(snippet));
|
||||
OpensearchResponseWriter.solitaireTag(writer, GSAToken.GD.name(), descriptions.size() > 0 ? descriptions.get(0) : "");
|
||||
String cols = collections.toString();
|
||||
if (collections.size() > 0) OpensearchResponseWriter.solitaireTag(writer, "COLS" /*SPECIAL!*/, collections.size() > 1 ? cols.substring(1, cols.length() - 1).replaceAll(" ", "") : collections.get(0));
|
||||
|
|
|
@ -27,9 +27,12 @@ import java.util.ArrayList;
|
|||
import java.util.Date;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import net.yacy.cora.document.feed.RSSMessage;
|
||||
import net.yacy.cora.document.id.MultiProtocolURL;
|
||||
|
@ -114,7 +117,7 @@ public class OpensearchResponseWriter implements QueryResponseWriter {
|
|||
SimpleOrderedMap<Object> facetFields = facetCounts == null || facetCounts.size() == 0 ? null : (SimpleOrderedMap<Object>) facetCounts.get("facet_fields");
|
||||
@SuppressWarnings("unchecked")
|
||||
SimpleOrderedMap<Object> highlighting = (SimpleOrderedMap<Object>) values.get("highlighting");
|
||||
Map<String, List<String>> snippets = highlighting(highlighting);
|
||||
Map<String, LinkedHashSet<String>> snippets = highlighting(highlighting);
|
||||
|
||||
// parse response header
|
||||
ResHead resHead = new ResHead();
|
||||
|
@ -233,17 +236,18 @@ public class OpensearchResponseWriter implements QueryResponseWriter {
|
|||
|
||||
// compute snippet from texts
|
||||
solitaireTag(writer, RSSMessage.Token.title.name(), title.length() == 0 ? (texts.size() == 0 ? "" : texts.get(0)) : title);
|
||||
List<String> snippet = urlhash == null ? null : snippets.get(urlhash);
|
||||
LinkedHashSet<String> snippet = urlhash == null ? null : snippets.get(urlhash);
|
||||
String tagname = RSSMessage.Token.description.name();
|
||||
if (snippet == null || snippet.size() == 0) {
|
||||
for (String d: descriptions) {
|
||||
writer.write("<"); writer.write(tagname); writer.write('>');
|
||||
XML.escapeCharData(snippet == null || snippet.size() == 0 ? d : snippet.get(0), writer);
|
||||
XML.escapeCharData(d, writer);
|
||||
writer.write("</"); writer.write(tagname); writer.write(">\n");
|
||||
}
|
||||
} else {
|
||||
OpensearchResponseWriter.removeSubsumedTitle(snippet, title);
|
||||
writer.write("<"); writer.write(tagname); writer.write('>');
|
||||
XML.escapeCharData(snippet.get(0), writer);
|
||||
XML.escapeCharData(OpensearchResponseWriter.getLargestSnippet(snippet), writer);
|
||||
writer.write("</"); writer.write(tagname); writer.write(">\n");
|
||||
}
|
||||
// open: where do we get the subject?
|
||||
|
@ -296,8 +300,8 @@ public class OpensearchResponseWriter implements QueryResponseWriter {
|
|||
* @return a map from urlhashes to a list of snippets for that url
|
||||
*/
|
||||
@SuppressWarnings("unchecked")
|
||||
public static Map<String, List<String>> highlighting(final SimpleOrderedMap<Object> val) {
|
||||
Map<String, List<String>> snippets = new HashMap<String, List<String>>();
|
||||
public static Map<String, LinkedHashSet<String>> highlighting(final SimpleOrderedMap<Object> val) {
|
||||
Map<String, LinkedHashSet<String>> snippets = new HashMap<String, LinkedHashSet<String>>();
|
||||
if (val == null) return snippets;
|
||||
int sz = val.size();
|
||||
Object v, vv;
|
||||
|
@ -306,7 +310,7 @@ public class OpensearchResponseWriter implements QueryResponseWriter {
|
|||
v = val.getVal(i);
|
||||
if (v instanceof SimpleOrderedMap) {
|
||||
int sz1 = ((SimpleOrderedMap<Object>) v).size();
|
||||
List<String> t = new ArrayList<String>(sz1);
|
||||
LinkedHashSet<String> t = new LinkedHashSet<String>();
|
||||
for (int j = 0; j < sz1; j++) {
|
||||
vv = ((SimpleOrderedMap<Object>) v).getVal(j);
|
||||
if (vv instanceof String[]) {
|
||||
|
@ -319,6 +323,30 @@ public class OpensearchResponseWriter implements QueryResponseWriter {
|
|||
return snippets;
|
||||
}
|
||||
|
||||
final static Pattern keymarks = Pattern.compile("<b>|</b>");
|
||||
|
||||
public static void removeSubsumedTitle(LinkedHashSet<String> snippets, String title) {
|
||||
if (title == null || title.length() == 0 || snippets == null || snippets.size() == 0) return;
|
||||
snippets.remove(title);
|
||||
String tlc = title.toLowerCase();
|
||||
Iterator<String> i = snippets.iterator();
|
||||
while (i.hasNext()) {
|
||||
String s = i.next().toLowerCase();
|
||||
s = keymarks.matcher(s).replaceAll("");
|
||||
if (tlc.toLowerCase().indexOf(s) >= 0 || s.toLowerCase().indexOf(tlc) >= 0) i.remove();
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
public static String getLargestSnippet(LinkedHashSet<String> snippets) {
|
||||
if (snippets == null || snippets.size() == 0) return null;
|
||||
String l = null;
|
||||
for (String s: snippets) {
|
||||
if ((l == null || s.length() > l.length()) && s.indexOf(' ') > 0) l = s;
|
||||
}
|
||||
return l;
|
||||
}
|
||||
|
||||
public static void openTag(final Writer writer, final String tag) throws IOException {
|
||||
writer.write('<'); writer.write(tag); writer.write(">\n");
|
||||
}
|
||||
|
|
|
@ -26,6 +26,7 @@ import java.net.MalformedURLException;
|
|||
import java.util.ArrayList;
|
||||
import java.util.Date;
|
||||
import java.util.HashMap;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
|
@ -98,7 +99,7 @@ public class YJsonResponseWriter implements QueryResponseWriter {
|
|||
SimpleOrderedMap<Object> facetFields = facetCounts == null || facetCounts.size() == 0 ? null : (SimpleOrderedMap<Object>) facetCounts.get("facet_fields");
|
||||
@SuppressWarnings("unchecked")
|
||||
SimpleOrderedMap<Object> highlighting = (SimpleOrderedMap<Object>) values.get("highlighting");
|
||||
Map<String, List<String>> snippets = OpensearchResponseWriter.highlighting(highlighting);
|
||||
Map<String, LinkedHashSet<String>> snippets = OpensearchResponseWriter.highlighting(highlighting);
|
||||
|
||||
// parse response header
|
||||
ResHead resHead = new ResHead();
|
||||
|
@ -213,8 +214,9 @@ public class YJsonResponseWriter implements QueryResponseWriter {
|
|||
// compute snippet from texts
|
||||
solitaireTag(writer, "path", path.toString());
|
||||
solitaireTag(writer, "title", title.length() == 0 ? (texts.size() == 0 ? path.toString() : texts.get(0)) : title);
|
||||
List<String> snippet = urlhash == null ? null : snippets.get(urlhash);
|
||||
writer.write("\"description\":\""); writer.write(serverObjects.toJSON(snippet == null || snippet.size() == 0 ? (descriptions.size() > 0 ? descriptions.get(0) : "") : snippet.get(0))); writer.write("\"\n}\n");
|
||||
LinkedHashSet<String> snippet = urlhash == null ? null : snippets.get(urlhash);
|
||||
OpensearchResponseWriter.removeSubsumedTitle(snippet, title);
|
||||
writer.write("\"description\":\""); writer.write(serverObjects.toJSON(snippet == null || snippet.size() == 0 ? (descriptions.size() > 0 ? descriptions.get(0) : "") : OpensearchResponseWriter.getLargestSnippet(snippet))); writer.write("\"\n}\n");
|
||||
if (i < responseCount - 1) {
|
||||
writer.write(",\n".toCharArray());
|
||||
}
|
||||
|
|
|
@ -181,7 +181,7 @@ public class GSAsearchServlet extends HttpServlet {
|
|||
CollectionSchema.size_i.getSolrFieldName());
|
||||
post.put("hl", "true");
|
||||
post.put("hl.q", originalQuery);
|
||||
post.put("hl.fl", CollectionSchema.h1_txt.getSolrFieldName() + "," + CollectionSchema.h2_txt.getSolrFieldName() + "," + CollectionSchema.text_t.getSolrFieldName());
|
||||
// every highlight field is named through getSolrFieldName(), including description_txt, which is also resolved that way for hl.alternateField; do not rely on implicit string conversion
post.put("hl.fl", CollectionSchema.description_txt.getSolrFieldName() + "," + CollectionSchema.h4_txt.getSolrFieldName() + "," + CollectionSchema.h3_txt.getSolrFieldName() + "," + CollectionSchema.h2_txt.getSolrFieldName() + "," + CollectionSchema.h1_txt.getSolrFieldName() + "," + CollectionSchema.text_t.getSolrFieldName());
|
||||
post.put("hl.alternateField", CollectionSchema.description_txt.getSolrFieldName());
|
||||
post.put("hl.simple.pre", "<b>");
|
||||
post.put("hl.simple.post", "</b>");
|
||||
|
|
|
@ -180,7 +180,7 @@ public class SolrSelectServlet extends HttpServlet {
|
|||
if ((responseWriter instanceof YJsonResponseWriter || responseWriter instanceof OpensearchResponseWriter) && "true".equals(mmsp.get("hl", "true"))) {
|
||||
// add options for snippet generation
|
||||
if (!mmsp.getMap().containsKey("hl.q")) mmsp.getMap().put("hl.q", new String[]{q});
|
||||
if (!mmsp.getMap().containsKey("hl.fl")) mmsp.getMap().put("hl.fl", new String[]{CollectionSchema.h1_txt.getSolrFieldName() + "," + CollectionSchema.h2_txt.getSolrFieldName() + "," + CollectionSchema.text_t.getSolrFieldName()});
|
||||
// default highlight fields: every field is named through getSolrFieldName(), including description_txt, which is also resolved that way for hl.alternateField; do not rely on implicit string conversion
if (!mmsp.getMap().containsKey("hl.fl")) mmsp.getMap().put("hl.fl", new String[]{CollectionSchema.description_txt.getSolrFieldName() + "," + CollectionSchema.h4_txt.getSolrFieldName() + "," + CollectionSchema.h3_txt.getSolrFieldName() + "," + CollectionSchema.h2_txt.getSolrFieldName() + "," + CollectionSchema.h1_txt.getSolrFieldName() + "," + CollectionSchema.text_t.getSolrFieldName()});
|
||||
if (!mmsp.getMap().containsKey("hl.alternateField")) mmsp.getMap().put("hl.alternateField", new String[]{CollectionSchema.description_txt.getSolrFieldName()});
|
||||
if (!mmsp.getMap().containsKey("hl.simple.pre")) mmsp.getMap().put("hl.simple.pre", new String[]{"<b>"});
|
||||
if (!mmsp.getMap().containsKey("hl.simple.post")) mmsp.getMap().put("hl.simple.post", new String[]{"</b>"});
|
||||
|
|
|
@ -53,6 +53,7 @@ import java.util.Collection;
|
|||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.TreeMap;
|
||||
|
@ -71,6 +72,7 @@ import net.yacy.cora.federate.opensearch.SRURSSConnector;
|
|||
import net.yacy.cora.federate.solr.connector.RemoteSolrConnector;
|
||||
import net.yacy.cora.federate.solr.connector.SolrConnector;
|
||||
import net.yacy.cora.federate.solr.instance.RemoteInstance;
|
||||
import net.yacy.cora.federate.solr.responsewriter.OpensearchResponseWriter;
|
||||
import net.yacy.cora.federate.yacy.CacheStrategy;
|
||||
import net.yacy.cora.order.Base64Order;
|
||||
import net.yacy.cora.order.Digest;
|
||||
|
@ -999,7 +1001,7 @@ public final class Protocol {
|
|||
}
|
||||
}
|
||||
|
||||
private final static CollectionSchema[] snippetFields = new CollectionSchema[]{CollectionSchema.h1_txt, CollectionSchema.h2_txt, CollectionSchema.text_t};
|
||||
private final static CollectionSchema[] snippetFields = new CollectionSchema[]{CollectionSchema.description_txt, CollectionSchema.h4_txt, CollectionSchema.h3_txt, CollectionSchema.h2_txt, CollectionSchema.h1_txt, CollectionSchema.text_t};
|
||||
|
||||
protected static int solrQuery(
|
||||
final SearchEvent event,
|
||||
|
@ -1025,14 +1027,14 @@ public final class Protocol {
|
|||
//solrQuery.setHighlightRequireFieldMatch();
|
||||
solrQuery.setHighlightSimplePost("</b>");
|
||||
solrQuery.setHighlightSimplePre("<b>");
|
||||
solrQuery.setHighlightSnippets(1);
|
||||
solrQuery.setHighlightSnippets(5);
|
||||
for (CollectionSchema field: snippetFields) solrQuery.addHighlightField(field.getSolrFieldName());
|
||||
} else {
|
||||
solrQuery.setHighlight(false);
|
||||
}
|
||||
boolean localsearch = target == null || target.equals(event.peers.mySeed());
|
||||
Map<String, ReversibleScoreMap<String>> facets = new HashMap<String, ReversibleScoreMap<String>>(event.query.facetfields.size());
|
||||
Map<String, String> snippets = new HashMap<String, String>(); // this will be a list of urlhash-snippet entries
|
||||
Map<String, LinkedHashSet<String>> snippets = new HashMap<String, LinkedHashSet<String>>(); // this will be a list of urlhash-snippet entries
|
||||
final QueryResponse[] rsp = new QueryResponse[]{null};
|
||||
final SolrDocumentList[] docList = new SolrDocumentList[]{null};
|
||||
{// encapsulate expensive solr QueryResponse object
|
||||
|
@ -1122,7 +1124,9 @@ public final class Protocol {
|
|||
if (rs.containsKey(field.getSolrFieldName())) {
|
||||
List<String> s = rs.get(field.getSolrFieldName());
|
||||
if (s.size() > 0) {
|
||||
snippets.put(re.getKey(), s.get(0));
|
||||
LinkedHashSet<String> ls = new LinkedHashSet<String>();
|
||||
ls.addAll(s);
|
||||
snippets.put(re.getKey(), ls);
|
||||
continue nextsnippet;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -33,6 +33,7 @@ import java.util.ConcurrentModificationException;
|
|||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.SortedMap;
|
||||
|
@ -49,6 +50,7 @@ import net.yacy.cora.document.analysis.Classification.ContentDomain;
|
|||
import net.yacy.cora.document.encoding.ASCII;
|
||||
import net.yacy.cora.document.id.DigestURL;
|
||||
import net.yacy.cora.document.id.MultiProtocolURL;
|
||||
import net.yacy.cora.federate.solr.responsewriter.OpensearchResponseWriter;
|
||||
import net.yacy.cora.federate.yacy.CacheStrategy;
|
||||
import net.yacy.cora.federate.yacy.Distribution;
|
||||
import net.yacy.cora.lod.vocabulary.Tagging;
|
||||
|
@ -150,7 +152,7 @@ public final class SearchEvent {
|
|||
private final boolean deleteIfSnippetFail;
|
||||
private long urlRetrievalAllTime;
|
||||
private long snippetComputationAllTime;
|
||||
private ConcurrentHashMap<String, String> snippets;
|
||||
private ConcurrentHashMap<String, LinkedHashSet<String>> snippets;
|
||||
private final boolean remote;
|
||||
private SortedMap<byte[], ReferenceContainer<WordReference>> localSearchInclusion;
|
||||
private final ScoreMap<String> ref; // reference score computation for the commonSense heuristic
|
||||
|
@ -234,7 +236,7 @@ public final class SearchEvent {
|
|||
this.topicNavigatorCount = navcfg.contains("topics") ? MAX_TOPWORDS : 0;
|
||||
this.languageNavigator = navcfg.contains("language") ? new ConcurrentScoreMap<String>() : null;
|
||||
this.vocabularyNavigator = new ConcurrentHashMap<String, ScoreMap<String>>();
|
||||
this.snippets = new ConcurrentHashMap<String, String>();
|
||||
this.snippets = new ConcurrentHashMap<String, LinkedHashSet<String>>();
|
||||
this.secondarySearchSuperviser = (this.query.getQueryGoal().getIncludeHashes().size() > 1) ? new SecondarySearchSuperviser(this) : null; // generate abstracts only for combined searches
|
||||
if (this.secondarySearchSuperviser != null) this.secondarySearchSuperviser.start();
|
||||
this.secondarySearchThreads = null;
|
||||
|
@ -701,7 +703,7 @@ public final class SearchEvent {
|
|||
public void addNodes(
|
||||
final List<URIMetadataNode> nodeList,
|
||||
final Map<String, ReversibleScoreMap<String>> facets, // a map from a field name to scored values
|
||||
final Map<String, String> solrsnippets, // a map from urlhash to snippet text
|
||||
final Map<String, LinkedHashSet<String>> solrsnippets, // a map from urlhash to snippet text
|
||||
final boolean local,
|
||||
final String resourceName,
|
||||
final int fullResource) {
|
||||
|
@ -1218,9 +1220,10 @@ public final class SearchEvent {
|
|||
Element<URIMetadataNode> localEntryElement = this.nodeStack.sizeQueue() > 0 ? this.nodeStack.poll() : null;
|
||||
URIMetadataNode node = localEntryElement == null ? null : localEntryElement.getElement();
|
||||
if (node != null) {
|
||||
String solrsnippet = this.snippets.remove(ASCII.String(node.hash())); // we can remove this because it's used only once
|
||||
if (solrsnippet != null && solrsnippet.length() > 0) {
|
||||
final TextSnippet snippet = new TextSnippet(node.hash(), solrsnippet, true, ResultClass.SOURCE_CACHE, "");
|
||||
LinkedHashSet<String> solrsnippet = this.snippets.remove(ASCII.String(node.hash())); // we can remove this because it's used only once
|
||||
if (solrsnippet != null && solrsnippet.size() > 0) {
|
||||
OpensearchResponseWriter.removeSubsumedTitle(solrsnippet, node.dc_title());
|
||||
final TextSnippet snippet = new TextSnippet(node.hash(), OpensearchResponseWriter.getLargestSnippet(solrsnippet), true, ResultClass.SOURCE_CACHE, "");
|
||||
ResultEntry re = new ResultEntry(node, this.query.getSegment(), this.peers, snippet, null, 0);
|
||||
addResult(re);
|
||||
success = true;
|
||||
|
|
Loading…
Reference in New Issue
Block a user