enhanced snippets: remove lines that are identical to the title and

choose longer versions if possible. Prefer the description part.
This commit is contained in:
Michael Peter Christen 2014-05-06 16:48:50 +02:00
parent e84e07399a
commit 4e734815e8
7 changed files with 67 additions and 26 deletions

View File

@ -26,6 +26,7 @@ import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
@ -147,7 +148,7 @@ public class GSAResponseWriter implements QueryResponseWriter {
DocList response = ((ResultContext) rsp.getValues().get("response")).docs;
@SuppressWarnings("unchecked")
SimpleOrderedMap<Object> highlighting = (SimpleOrderedMap<Object>) rsp.getValues().get("highlighting");
Map<String, List<String>> snippets = OpensearchResponseWriter.highlighting(highlighting);
Map<String, LinkedHashSet<String>> snippets = OpensearchResponseWriter.highlighting(highlighting);
Map<Object,Object> context = request.getContext();
// parse response header
@ -241,6 +242,7 @@ public class GSAResponseWriter implements QueryResponseWriter {
List<String> collections = new ArrayList<String>();
int size = 0;
boolean title_written = false; // the solr index may contain several; we take only the first which should be the visible tag in <title></title>
String title = null;
for (IndexableField value: fields) {
String fieldName = value.name();
@ -262,7 +264,8 @@ public class GSAResponseWriter implements QueryResponseWriter {
continue;
}
if (CollectionSchema.title.getSolrFieldName().equals(fieldName) && !title_written) {
OpensearchResponseWriter.solitaireTag(writer, GSAToken.T.name(), highlight(value.stringValue(), query));
title = value.stringValue();
OpensearchResponseWriter.solitaireTag(writer, GSAToken.T.name(), highlight(title, query));
//texts.add(value.stringValue());
title_written = true;
continue;
@ -296,8 +299,9 @@ public class GSAResponseWriter implements QueryResponseWriter {
//System.out.println("superfluous field: " + fieldName + ": " + value.stringValue()); // this can be avoided setting the enableLazyFieldLoading = false in solrconfig.xml
}
// compute snippet from texts
List<String> snippet = urlhash == null ? null : snippets.get(urlhash);
OpensearchResponseWriter.solitaireTag(writer, GSAToken.S.name(), snippet == null || snippet.size() == 0 ? (descriptions.size() > 0 ? descriptions.get(0) : "") : snippet.get(0));
LinkedHashSet<String> snippet = urlhash == null ? null : snippets.get(urlhash);
OpensearchResponseWriter.removeSubsumedTitle(snippet, title);
OpensearchResponseWriter.solitaireTag(writer, GSAToken.S.name(), snippet == null || snippet.size() == 0 ? (descriptions.size() > 0 ? descriptions.get(0) : "") : OpensearchResponseWriter.getLargestSnippet(snippet));
OpensearchResponseWriter.solitaireTag(writer, GSAToken.GD.name(), descriptions.size() > 0 ? descriptions.get(0) : "");
String cols = collections.toString();
if (collections.size() > 0) OpensearchResponseWriter.solitaireTag(writer, "COLS" /*SPECIAL!*/, collections.size() > 1 ? cols.substring(1, cols.length() - 1).replaceAll(" ", "") : collections.get(0));

View File

@ -27,9 +27,12 @@ import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import net.yacy.cora.document.feed.RSSMessage;
import net.yacy.cora.document.id.MultiProtocolURL;
@ -114,7 +117,7 @@ public class OpensearchResponseWriter implements QueryResponseWriter {
SimpleOrderedMap<Object> facetFields = facetCounts == null || facetCounts.size() == 0 ? null : (SimpleOrderedMap<Object>) facetCounts.get("facet_fields");
@SuppressWarnings("unchecked")
SimpleOrderedMap<Object> highlighting = (SimpleOrderedMap<Object>) values.get("highlighting");
Map<String, List<String>> snippets = highlighting(highlighting);
Map<String, LinkedHashSet<String>> snippets = highlighting(highlighting);
// parse response header
ResHead resHead = new ResHead();
@ -233,17 +236,18 @@ public class OpensearchResponseWriter implements QueryResponseWriter {
// compute snippet from texts
solitaireTag(writer, RSSMessage.Token.title.name(), title.length() == 0 ? (texts.size() == 0 ? "" : texts.get(0)) : title);
List<String> snippet = urlhash == null ? null : snippets.get(urlhash);
LinkedHashSet<String> snippet = urlhash == null ? null : snippets.get(urlhash);
String tagname = RSSMessage.Token.description.name();
if (snippet == null || snippet.size() == 0) {
for (String d: descriptions) {
writer.write("<"); writer.write(tagname); writer.write('>');
XML.escapeCharData(snippet == null || snippet.size() == 0 ? d : snippet.get(0), writer);
XML.escapeCharData(d, writer);
writer.write("</"); writer.write(tagname); writer.write(">\n");
}
} else {
OpensearchResponseWriter.removeSubsumedTitle(snippet, title);
writer.write("<"); writer.write(tagname); writer.write('>');
XML.escapeCharData(snippet.get(0), writer);
XML.escapeCharData(OpensearchResponseWriter.getLargestSnippet(snippet), writer);
writer.write("</"); writer.write(tagname); writer.write(">\n");
}
// open: where do we get the subject?
@ -296,8 +300,8 @@ public class OpensearchResponseWriter implements QueryResponseWriter {
* @return a map from urlhashes to a list of snippets for that url
*/
@SuppressWarnings("unchecked")
public static Map<String, List<String>> highlighting(final SimpleOrderedMap<Object> val) {
Map<String, List<String>> snippets = new HashMap<String, List<String>>();
public static Map<String, LinkedHashSet<String>> highlighting(final SimpleOrderedMap<Object> val) {
Map<String, LinkedHashSet<String>> snippets = new HashMap<String, LinkedHashSet<String>>();
if (val == null) return snippets;
int sz = val.size();
Object v, vv;
@ -306,7 +310,7 @@ public class OpensearchResponseWriter implements QueryResponseWriter {
v = val.getVal(i);
if (v instanceof SimpleOrderedMap) {
int sz1 = ((SimpleOrderedMap<Object>) v).size();
List<String> t = new ArrayList<String>(sz1);
LinkedHashSet<String> t = new LinkedHashSet<String>();
for (int j = 0; j < sz1; j++) {
vv = ((SimpleOrderedMap<Object>) v).getVal(j);
if (vv instanceof String[]) {
@ -319,6 +323,30 @@ public class OpensearchResponseWriter implements QueryResponseWriter {
return snippets;
}
// pre-compiled pattern for the Solr highlighting markers inserted via hl.simple.pre/post
final static Pattern keymarks = Pattern.compile("<b>|</b>");

/**
 * Removes from the snippet set every snippet that merely duplicates the title:
 * a snippet is dropped when (after lower-casing and stripping the &lt;b&gt;/&lt;/b&gt;
 * highlighting markers) it is contained in the title, or the title is contained in it.
 * The comparison is case-insensitive; the set is modified in place.
 * @param snippets the snippet set to filter; may be null or empty (then nothing happens)
 * @param title the document title to compare against; may be null or empty (then nothing happens)
 */
public static void removeSubsumedTitle(LinkedHashSet<String> snippets, String title) {
    if (title == null || title.isEmpty() || snippets == null || snippets.isEmpty()) return;
    snippets.remove(title); // fast path: exact, non-normalized duplicate of the title
    final String titleLC = title.toLowerCase();
    final Iterator<String> i = snippets.iterator();
    while (i.hasNext()) {
        // normalize the snippet: lower-case, then strip highlighting markers
        final String snippetLC = keymarks.matcher(i.next().toLowerCase()).replaceAll("");
        // drop the snippet if either string subsumes the other
        if (titleLC.contains(snippetLC) || snippetLC.contains(titleLC)) i.remove();
    }
}
/**
 * Picks the longest snippet from the given set, considering only snippets that
 * contain a space at a position &gt; 0 (i.e. multi-word snippets; a single word or
 * a snippet starting with a space never qualifies).
 * @param snippets the candidate snippets; may be null or empty
 * @return the longest qualifying snippet, or null if the set is null/empty
 *         (NOTE(review): also null when no snippet contains an interior space —
 *         callers should be prepared for that; verify against call sites)
 */
public static String getLargestSnippet(LinkedHashSet<String> snippets) {
    if (snippets == null || snippets.isEmpty()) return null;
    String best = null;
    for (final String candidate : snippets) {
        if (candidate.indexOf(' ') <= 0) continue; // reject single words and leading-space snippets
        if (best == null || candidate.length() > best.length()) best = candidate;
    }
    return best;
}
/**
 * Writes an opening XML tag for the given tag name, followed by a newline.
 * @param writer the target writer
 * @param tag the tag name (written verbatim, not escaped)
 * @throws IOException if the writer fails
 */
public static void openTag(final Writer writer, final String tag) throws IOException {
    writer.write("<" + tag + ">\n");
}

View File

@ -26,6 +26,7 @@ import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
@ -98,7 +99,7 @@ public class YJsonResponseWriter implements QueryResponseWriter {
SimpleOrderedMap<Object> facetFields = facetCounts == null || facetCounts.size() == 0 ? null : (SimpleOrderedMap<Object>) facetCounts.get("facet_fields");
@SuppressWarnings("unchecked")
SimpleOrderedMap<Object> highlighting = (SimpleOrderedMap<Object>) values.get("highlighting");
Map<String, List<String>> snippets = OpensearchResponseWriter.highlighting(highlighting);
Map<String, LinkedHashSet<String>> snippets = OpensearchResponseWriter.highlighting(highlighting);
// parse response header
ResHead resHead = new ResHead();
@ -213,8 +214,9 @@ public class YJsonResponseWriter implements QueryResponseWriter {
// compute snippet from texts
solitaireTag(writer, "path", path.toString());
solitaireTag(writer, "title", title.length() == 0 ? (texts.size() == 0 ? path.toString() : texts.get(0)) : title);
List<String> snippet = urlhash == null ? null : snippets.get(urlhash);
writer.write("\"description\":\""); writer.write(serverObjects.toJSON(snippet == null || snippet.size() == 0 ? (descriptions.size() > 0 ? descriptions.get(0) : "") : snippet.get(0))); writer.write("\"\n}\n");
LinkedHashSet<String> snippet = urlhash == null ? null : snippets.get(urlhash);
OpensearchResponseWriter.removeSubsumedTitle(snippet, title);
writer.write("\"description\":\""); writer.write(serverObjects.toJSON(snippet == null || snippet.size() == 0 ? (descriptions.size() > 0 ? descriptions.get(0) : "") : OpensearchResponseWriter.getLargestSnippet(snippet))); writer.write("\"\n}\n");
if (i < responseCount - 1) {
writer.write(",\n".toCharArray());
}

View File

@ -181,7 +181,7 @@ public class GSAsearchServlet extends HttpServlet {
CollectionSchema.size_i.getSolrFieldName());
post.put("hl", "true");
post.put("hl.q", originalQuery);
post.put("hl.fl", CollectionSchema.h1_txt.getSolrFieldName() + "," + CollectionSchema.h2_txt.getSolrFieldName() + "," + CollectionSchema.text_t.getSolrFieldName());
post.put("hl.fl", CollectionSchema.description_txt + "," + CollectionSchema.h4_txt.getSolrFieldName() + "," + CollectionSchema.h3_txt.getSolrFieldName() + "," + CollectionSchema.h2_txt.getSolrFieldName() + "," + CollectionSchema.h1_txt.getSolrFieldName() + "," + CollectionSchema.text_t.getSolrFieldName());
post.put("hl.alternateField", CollectionSchema.description_txt.getSolrFieldName());
post.put("hl.simple.pre", "<b>");
post.put("hl.simple.post", "</b>");

View File

@ -180,7 +180,7 @@ public class SolrSelectServlet extends HttpServlet {
if ((responseWriter instanceof YJsonResponseWriter || responseWriter instanceof OpensearchResponseWriter) && "true".equals(mmsp.get("hl", "true"))) {
// add options for snippet generation
if (!mmsp.getMap().containsKey("hl.q")) mmsp.getMap().put("hl.q", new String[]{q});
if (!mmsp.getMap().containsKey("hl.fl")) mmsp.getMap().put("hl.fl", new String[]{CollectionSchema.h1_txt.getSolrFieldName() + "," + CollectionSchema.h2_txt.getSolrFieldName() + "," + CollectionSchema.text_t.getSolrFieldName()});
if (!mmsp.getMap().containsKey("hl.fl")) mmsp.getMap().put("hl.fl", new String[]{CollectionSchema.description_txt + "," + CollectionSchema.h4_txt.getSolrFieldName() + "," + CollectionSchema.h3_txt.getSolrFieldName() + "," + CollectionSchema.h2_txt.getSolrFieldName() + "," + CollectionSchema.h1_txt.getSolrFieldName() + "," + CollectionSchema.text_t.getSolrFieldName()});
if (!mmsp.getMap().containsKey("hl.alternateField")) mmsp.getMap().put("hl.alternateField", new String[]{CollectionSchema.description_txt.getSolrFieldName()});
if (!mmsp.getMap().containsKey("hl.simple.pre")) mmsp.getMap().put("hl.simple.pre", new String[]{"<b>"});
if (!mmsp.getMap().containsKey("hl.simple.post")) mmsp.getMap().put("hl.simple.post", new String[]{"</b>"});

View File

@ -53,6 +53,7 @@ import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
@ -71,6 +72,7 @@ import net.yacy.cora.federate.opensearch.SRURSSConnector;
import net.yacy.cora.federate.solr.connector.RemoteSolrConnector;
import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.cora.federate.solr.instance.RemoteInstance;
import net.yacy.cora.federate.solr.responsewriter.OpensearchResponseWriter;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.order.Digest;
@ -999,7 +1001,7 @@ public final class Protocol {
}
}
private final static CollectionSchema[] snippetFields = new CollectionSchema[]{CollectionSchema.h1_txt, CollectionSchema.h2_txt, CollectionSchema.text_t};
private final static CollectionSchema[] snippetFields = new CollectionSchema[]{CollectionSchema.description_txt, CollectionSchema.h4_txt, CollectionSchema.h3_txt, CollectionSchema.h2_txt, CollectionSchema.h1_txt, CollectionSchema.text_t};
protected static int solrQuery(
final SearchEvent event,
@ -1025,14 +1027,14 @@ public final class Protocol {
//solrQuery.setHighlightRequireFieldMatch();
solrQuery.setHighlightSimplePost("</b>");
solrQuery.setHighlightSimplePre("<b>");
solrQuery.setHighlightSnippets(1);
solrQuery.setHighlightSnippets(5);
for (CollectionSchema field: snippetFields) solrQuery.addHighlightField(field.getSolrFieldName());
} else {
solrQuery.setHighlight(false);
}
boolean localsearch = target == null || target.equals(event.peers.mySeed());
Map<String, ReversibleScoreMap<String>> facets = new HashMap<String, ReversibleScoreMap<String>>(event.query.facetfields.size());
Map<String, String> snippets = new HashMap<String, String>(); // this will be a list of urlhash-snippet entries
Map<String, LinkedHashSet<String>> snippets = new HashMap<String, LinkedHashSet<String>>(); // this will be a list of urlhash-snippet entries
final QueryResponse[] rsp = new QueryResponse[]{null};
final SolrDocumentList[] docList = new SolrDocumentList[]{null};
{// encapsulate expensive solr QueryResponse object
@ -1122,7 +1124,9 @@ public final class Protocol {
if (rs.containsKey(field.getSolrFieldName())) {
List<String> s = rs.get(field.getSolrFieldName());
if (s.size() > 0) {
snippets.put(re.getKey(), s.get(0));
LinkedHashSet<String> ls = new LinkedHashSet<String>();
ls.addAll(s);
snippets.put(re.getKey(), ls);
continue nextsnippet;
}
}

View File

@ -33,6 +33,7 @@ import java.util.ConcurrentModificationException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.SortedMap;
@ -49,6 +50,7 @@ import net.yacy.cora.document.analysis.Classification.ContentDomain;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.solr.responsewriter.OpensearchResponseWriter;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.federate.yacy.Distribution;
import net.yacy.cora.lod.vocabulary.Tagging;
@ -150,7 +152,7 @@ public final class SearchEvent {
private final boolean deleteIfSnippetFail;
private long urlRetrievalAllTime;
private long snippetComputationAllTime;
private ConcurrentHashMap<String, String> snippets;
private ConcurrentHashMap<String, LinkedHashSet<String>> snippets;
private final boolean remote;
private SortedMap<byte[], ReferenceContainer<WordReference>> localSearchInclusion;
private final ScoreMap<String> ref; // reference score computation for the commonSense heuristic
@ -234,7 +236,7 @@ public final class SearchEvent {
this.topicNavigatorCount = navcfg.contains("topics") ? MAX_TOPWORDS : 0;
this.languageNavigator = navcfg.contains("language") ? new ConcurrentScoreMap<String>() : null;
this.vocabularyNavigator = new ConcurrentHashMap<String, ScoreMap<String>>();
this.snippets = new ConcurrentHashMap<String, String>();
this.snippets = new ConcurrentHashMap<String, LinkedHashSet<String>>();
this.secondarySearchSuperviser = (this.query.getQueryGoal().getIncludeHashes().size() > 1) ? new SecondarySearchSuperviser(this) : null; // generate abstracts only for combined searches
if (this.secondarySearchSuperviser != null) this.secondarySearchSuperviser.start();
this.secondarySearchThreads = null;
@ -701,7 +703,7 @@ public final class SearchEvent {
public void addNodes(
final List<URIMetadataNode> nodeList,
final Map<String, ReversibleScoreMap<String>> facets, // a map from a field name to scored values
final Map<String, String> solrsnippets, // a map from urlhash to snippet text
final Map<String, LinkedHashSet<String>> solrsnippets, // a map from urlhash to snippet text
final boolean local,
final String resourceName,
final int fullResource) {
@ -1218,9 +1220,10 @@ public final class SearchEvent {
Element<URIMetadataNode> localEntryElement = this.nodeStack.sizeQueue() > 0 ? this.nodeStack.poll() : null;
URIMetadataNode node = localEntryElement == null ? null : localEntryElement.getElement();
if (node != null) {
String solrsnippet = this.snippets.remove(ASCII.String(node.hash())); // we can remove this because it's used only once
if (solrsnippet != null && solrsnippet.length() > 0) {
final TextSnippet snippet = new TextSnippet(node.hash(), solrsnippet, true, ResultClass.SOURCE_CACHE, "");
LinkedHashSet<String> solrsnippet = this.snippets.remove(ASCII.String(node.hash())); // we can remove this because it's used only once
if (solrsnippet != null && solrsnippet.size() > 0) {
OpensearchResponseWriter.removeSubsumedTitle(solrsnippet, node.dc_title());
final TextSnippet snippet = new TextSnippet(node.hash(), OpensearchResponseWriter.getLargestSnippet(solrsnippet), true, ResultClass.SOURCE_CACHE, "");
ResultEntry re = new ResultEntry(node, this.query.getSegment(), this.peers, snippet, null, 0);
addResult(re);
success = true;