From fd1776a3b063ddbcc5e1fcaa59aae92cc18e6524 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Wed, 12 Jun 2013 15:02:49 +0200 Subject: [PATCH] added a new 'Citations' function: each search result item can now be explored for citations within other documents. A click on the 'Citations' link shows an analysis with all text lines in the document each with a complete list of documents which contain the same line. A second section shows the linking documents in ascending order of number of citations from the original document. Because documents from different hosts are most interesting here, they are listed at the top of the page as possible 'copypasta' source. --- defaults/yacy.init | 1 + htroot/ConfigPortal.html | 1 + htroot/ConfigPortal.java | 3 + htroot/ConfigSearchPage_p.html | 2 + htroot/ConfigSearchPage_p.java | 3 + htroot/api/citation.html | 34 +++ htroot/api/citation.java | 199 ++++++++++++++++++ htroot/yacysearchitem.html | 1 + htroot/yacysearchitem.java | 2 + .../yacy/cora/document/MultiProtocolURI.java | 8 + .../GrepHTMLResponseWriter.java | 59 ++---- .../responsewriter/HTMLResponseWriter.java | 2 +- 12 files changed, 267 insertions(+), 48 deletions(-) create mode 100644 htroot/api/citation.html create mode 100644 htroot/api/citation.java diff --git a/defaults/yacy.init b/defaults/yacy.init index 51be0538a..a6a2a1024 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -769,6 +769,7 @@ search.result.show.date = true search.result.show.size = false search.result.show.metadata = false search.result.show.parser = false +search.result.show.citation = true search.result.show.pictures = false search.result.show.cache = true search.result.show.proxy = false diff --git a/htroot/ConfigPortal.html b/htroot/ConfigPortal.html index ad8487564..db03e3935 100644 --- a/htroot/ConfigPortal.html +++ b/htroot/ConfigPortal.html @@ -79,6 +79,7 @@ Size  Metadata  Parser  + Citations  Pictures  Cache Augmented Browsing diff --git a/htroot/ConfigPortal.java b/htroot/ConfigPortal.java index fab45a661..456e6d445 100644 --- a/htroot/ConfigPortal.java +++ b/htroot/ConfigPortal.java @@ -99,6 +99,7 @@ public class ConfigPortal { sb.setConfig("search.result.show.size", post.getBoolean("search.result.show.size")); sb.setConfig("search.result.show.metadata", post.getBoolean("search.result.show.metadata")); sb.setConfig("search.result.show.parser", post.getBoolean("search.result.show.parser")); + sb.setConfig("search.result.show.citation", post.getBoolean("search.result.show.citation")); sb.setConfig("search.result.show.pictures", post.getBoolean("search.result.show.pictures")); sb.setConfig("search.result.show.cache", post.getBoolean("search.result.show.cache")); sb.setConfig("search.result.show.proxy", post.getBoolean("search.result.show.proxy")); @@ -170,6 +171,7 @@ public class ConfigPortal { sb.setConfig("search.result.show.size", config.getProperty("search.result.show.size","false")); sb.setConfig("search.result.show.metadata", config.getProperty("search.result.show.metadata","false")); sb.setConfig("search.result.show.parser", config.getProperty("search.result.show.parser","false")); + sb.setConfig("search.result.show.citation", config.getProperty("search.result.show.citation","false")); sb.setConfig("search.result.show.pictures", config.getProperty("search.result.show.pictures","false")); sb.setConfig("search.result.show.cache", config.getProperty("search.result.show.cache","true")); sb.setConfig("search.result.show.proxy", config.getProperty("search.result.show.proxy","false")); @@ -205,6 +207,7 @@ public class ConfigPortal { prop.put("search.result.show.size", sb.getConfigBool("search.result.show.size", false) ? 1 : 0); prop.put("search.result.show.metadata", sb.getConfigBool("search.result.show.metadata", false) ? 1 : 0); prop.put("search.result.show.parser", sb.getConfigBool("search.result.show.parser", false) ? 1 : 0); + prop.put("search.result.show.citation", sb.getConfigBool("search.result.show.citation", false) ? 1 : 0); prop.put("search.result.show.pictures", sb.getConfigBool("search.result.show.pictures", false) ? 1 : 0); prop.put("search.result.show.cache", sb.getConfigBool("search.result.show.cache", false) ? 1 : 0); prop.put("search.result.show.proxy", sb.getConfigBool("search.result.show.proxy", false) ? 1 : 0); diff --git a/htroot/ConfigSearchPage_p.html b/htroot/ConfigSearchPage_p.html index d86c07b5b..e34a44627 100644 --- a/htroot/ConfigSearchPage_p.html +++ b/htroot/ConfigSearchPage_p.html @@ -179,6 +179,7 @@ $(function() {  | 42 kbyte  | Metadata  | Parser +  | Citation  | Pictures  | Cache  | Augmented Browsing @@ -190,6 +191,7 @@ $(function() { + diff --git a/htroot/ConfigSearchPage_p.java b/htroot/ConfigSearchPage_p.java index 7306b23ea..9f4ef6f7d 100644 --- a/htroot/ConfigSearchPage_p.java +++ b/htroot/ConfigSearchPage_p.java @@ -72,6 +72,7 @@ public class ConfigSearchPage_p { sb.setConfig("search.result.show.size", post.getBoolean("search.result.show.size")); sb.setConfig("search.result.show.metadata", post.getBoolean("search.result.show.metadata")); sb.setConfig("search.result.show.parser", post.getBoolean("search.result.show.parser")); + sb.setConfig("search.result.show.citation", post.getBoolean("search.result.show.citation")); sb.setConfig("search.result.show.pictures", post.getBoolean("search.result.show.pictures")); sb.setConfig("search.result.show.cache", post.getBoolean("search.result.show.cache")); sb.setConfig("search.result.show.proxy", post.getBoolean("search.result.show.proxy")); @@ -124,6 +125,7 @@ public class ConfigSearchPage_p { sb.setConfig("search.result.show.size", config.getProperty("search.result.show.size","false")); sb.setConfig("search.result.show.metadata", config.getProperty("search.result.show.metadata","false")); sb.setConfig("search.result.show.parser", config.getProperty("search.result.show.parser","false")); + sb.setConfig("search.result.show.citation", config.getProperty("search.result.show.citation","false")); sb.setConfig("search.result.show.pictures", config.getProperty("search.result.show.pictures","false")); sb.setConfig("search.result.show.cache", config.getProperty("search.result.show.cache","true")); sb.setConfig("search.result.show.proxy", config.getProperty("search.result.show.proxy","false")); @@ -150,6 +152,7 @@ public class ConfigSearchPage_p { prop.put("search.result.show.size", sb.getConfigBool("search.result.show.size", false) ? 1 : 0); prop.put("search.result.show.metadata", sb.getConfigBool("search.result.show.metadata", false) ? 1 : 0); prop.put("search.result.show.parser", sb.getConfigBool("search.result.show.parser", false) ? 1 : 0); + prop.put("search.result.show.citation", sb.getConfigBool("search.result.show.citation", false) ? 1 : 0); prop.put("search.result.show.pictures", sb.getConfigBool("search.result.show.pictures", false) ? 1 : 0); prop.put("search.result.show.cache", sb.getConfigBool("search.result.show.cache", false) ? 1 : 0); prop.put("search.result.show.proxy", sb.getConfigBool("search.result.show.proxy", false) ? 1 : 0); diff --git a/htroot/api/citation.html b/htroot/api/citation.html new file mode 100644 index 000000000..268ba99f8 --- /dev/null +++ b/htroot/api/citation.html @@ -0,0 +1,34 @@ + + + +YaCy '#[clientname]#': Document Citations for url #[url]# +#%env/templates/metas.template%# + + +#%env/templates/embeddedheader.template%# +

Document Citations for
#[url]#

+ #(similar)#:: +

Similar documents from different hosts:

+ + #(/similar)# +
+
+ +
#{sentences}# +
#[dt]#
#[dd]#
+ #{/sentences}#
+
+
+
+
+ +
#{citations}# +
#[dt]#
#[dd]#
+ #{/citations}#
+
+
+#%env/templates/embeddedfooter.template%# + + diff --git a/htroot/api/citation.java b/htroot/api/citation.java new file mode 100644 index 000000000..8cee1a124 --- /dev/null +++ b/htroot/api/citation.java @@ -0,0 +1,199 @@ +/** + * citation + * Copyright 2013 by Michael Peter Christen + * First released 12.6.2013 at http://yacy.net + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . + */ + + +import java.io.IOException; +import java.net.MalformedURLException; +import java.util.ArrayList; +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.Set; +import java.util.TreeSet; + +import org.apache.solr.common.SolrDocument; +import org.apache.solr.common.SolrDocumentList; + +import net.yacy.cora.document.ASCII; +import net.yacy.cora.federate.solr.connector.SolrConnector; +import net.yacy.cora.protocol.RequestHeader; +import net.yacy.cora.sorting.OrderedScoreMap; +import net.yacy.document.SentenceReader; +import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.search.Switchboard; +import net.yacy.search.index.Segment; +import net.yacy.search.schema.CollectionSchema; +import net.yacy.server.serverObjects; +import net.yacy.server.serverSwitch; + +public class citation { + + public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) { + // return variable that accumulates replacements + final Switchboard sb = (Switchboard) env; + final serverObjects prop = new serverObjects(); + final Segment segment = sb.index; + final SolrConnector connector = segment.fulltext().getDefaultConnector(); + + // avoid UNRESOLVED PATTERN + prop.put("url", ""); + prop.put("citations", 0); + prop.put("sentences", 0); + + DigestURI uri = null; + String url = ""; + String hash = ""; + int ch = 10; + if (post != null) { + if (post.containsKey("url")) { + url = post.get("url"); + if (!url.startsWith("http://") && + !url.startsWith("https://") && + !url.startsWith("ftp://") && + !url.startsWith("smb://") && + !url.startsWith("file://")) { + url = "http://" + url; + } + } + if (post.containsKey("hash")) { + hash = post.get("hash"); + } + if (post.containsKey("ch")) { + ch = post.getInt("ch", ch); + } + } + + if (url.length() > 0) { + try { + uri = new DigestURI(url, null); + hash = ASCII.String(uri.hash()); + } catch (MalformedURLException e) {} + } + if (uri == null && hash.length() > 0) { + uri = sb.getURL(ASCII.getBytes(hash)); + } + if (uri == null) return prop; // no proper url addressed + url = uri.toNormalform(true); + prop.put("url", url); + + // get the document from the index + SolrDocument doc; + try { + doc = segment.fulltext().getDefaultConnector().getDocumentById(hash, CollectionSchema.title.getSolrFieldName(), CollectionSchema.text_t.getSolrFieldName()); + } catch (IOException e1) { + return prop; + } + @SuppressWarnings("unchecked") + ArrayList title = (ArrayList) doc.getFieldValue(CollectionSchema.title.getSolrFieldName()); + String text = (String) doc.getFieldValue(CollectionSchema.text_t.getSolrFieldName()); + + ArrayList sentences = new ArrayList(); + if (title != null) for (String s: title) if (s.length() > 0) sentences.add(s); + SentenceReader sr = new SentenceReader(text); + StringBuilder line; + while (sr.hasNext()) { + line = sr.next(); + if (line.length() > 0) sentences.add(line.toString()); + } + + // for each line make a statistic about the number of occurrences somewhere else + OrderedScoreMap scores = new OrderedScoreMap(null); // accumulates scores for citating urls + LinkedHashMap> sentenceOcc = new LinkedHashMap>(); + for (String sentence: sentences) { + if (sentence == null || sentence.length() < 40) { + // do not count the very short sentences + sentenceOcc.put(sentence, null); + continue; + } + try { + sentence = sentence.replace('"', '\''); + SolrDocumentList doclist = connector.getDocumentListByQuery("text_t:\"" + sentence + "\"", 0, 100, CollectionSchema.sku.getSolrFieldName()); + int count = (int) doclist.getNumFound(); + if (count > 0) { + Set list = new TreeSet(); + for (SolrDocument d: doclist) { + String u = (String) d.getFieldValue(CollectionSchema.sku.getSolrFieldName()); + if (u == null || u.equals(url)) continue; + scores.inc(u); + try {list.add(new DigestURI(u, null));} catch (MalformedURLException e) {} + } + sentenceOcc.put(sentence, list); + } + } catch (Throwable ee) { + + } + } + sentences.clear(); // we do not need this again + + // iterate the sentences + int i = 0; + for (Map.Entry> se: sentenceOcc.entrySet()) { + prop.put("sentences_" + i + "_dt", i); + StringBuilder dd = new StringBuilder(se.getKey()); + Set app = se.getValue(); + if (app != null && app.size() > 0) { + dd.append("
appears in:"); + for (DigestURI u: app) { + if (u != null) { + dd.append(" ").append(u.getHost()).append(""); + } + } + } + prop.put("sentences_" + i + "_dd", dd.toString()); + i++; + } + prop.put("sentences", i); + + // iterate the citations in order of number of citations + i = 0; + for (String u: scores.keyList(false)) { + try { + DigestURI uu = new DigestURI(u, null); + prop.put("citations_" + i + "_dt", "" + u + ""); + StringBuilder dd = new StringBuilder(); + dd.append("makes ").append(Integer.toString(scores.get(u))).append(" citations: of ").append(url); + for (Map.Entry> se: sentenceOcc.entrySet()) { + Set occurls = se.getValue(); + if (occurls != null && occurls.contains(uu)) dd.append("
").append(se.getKey()).append(""); + } + prop.put("citations_" + i + "_dd", dd.toString()); + i++; + } catch (MalformedURLException e) {} + } + prop.put("citations", i); + + // find similar documents from different hosts + i = 0; + for (String u: scores.keyList(false)) { + if (scores.get(u) < ch) continue; + try { + DigestURI uu = new DigestURI(u, null); + if (uu.getOrganization().equals(uri.getOrganization())) continue; + prop.put("similar_links_" + i + "_url", u); + i++; + } catch (MalformedURLException e) {} + } + prop.put("similar_links", i); + prop.put("similar", i > 0 ? 1 : 0); + + // return rewrite properties + return prop; + } + +} diff --git a/htroot/yacysearchitem.html b/htroot/yacysearchitem.html index 9b30e35b5..58996c0e7 100644 --- a/htroot/yacysearchitem.html +++ b/htroot/yacysearchitem.html @@ -28,6 +28,7 @@ #(showSize)#:: | #[sizename]##(/showSize)# #(showMetadata)#:: | Metadata#(/showMetadata)# #(showParser)#:: | Parser#(/showParser)# + #(showCitation)#:: | Citations#(/showCitation)# #(showPictures)#:: | Pictures#(/showPictures)# #(showCache)#:: | Cache#(/showCache)# #(showProxy)#:: | Augmented Browsing#(/showProxy)# diff --git a/htroot/yacysearchitem.java b/htroot/yacysearchitem.java index 275d2e99e..80783193a 100644 --- a/htroot/yacysearchitem.java +++ b/htroot/yacysearchitem.java @@ -133,6 +133,7 @@ public class yacysearchitem { prop.put("content_showSize", sb.getConfigBool("search.result.show.size", true) ? 1 : 0); prop.put("content_showMetadata", sb.getConfigBool("search.result.show.metadata", true) ? 1 : 0); prop.put("content_showParser", sb.getConfigBool("search.result.show.parser", true) ? 1 : 0); + prop.put("content_showCitation", sb.getConfigBool("search.result.show.citation", true) ? 1 : 0); prop.put("content_showPictures", sb.getConfigBool("search.result.show.pictures", true) ? 1 : 0); prop.put("content_showCache", sb.getConfigBool("search.result.show.cache", true) && Cache.has(resultURL.hash()) ? 1 : 0); prop.put("content_showProxy", sb.getConfigBool("search.result.show.proxy", true) ? 1 : 0); @@ -198,6 +199,7 @@ public class yacysearchitem { prop.put("content_showProxy_link", resultUrlstring); prop.put("content_showHostBrowser_link", resultUrlstring); prop.put("content_showParser_urlhash", resulthashString); + prop.put("content_showCitation_urlhash", resulthashString); prop.put("content_showTags_urlhash", resulthashString); prop.put("content_urlhexhash", Seed.b64Hash2hexHash(resulthashString)); prop.putHTML("content_urlname", nxTools.shortenURLString(result.urlname(), MAX_URL_LENGTH)); diff --git a/source/net/yacy/cora/document/MultiProtocolURI.java b/source/net/yacy/cora/document/MultiProtocolURI.java index 979637254..d5e73af0b 100644 --- a/source/net/yacy/cora/document/MultiProtocolURI.java +++ b/source/net/yacy/cora/document/MultiProtocolURI.java @@ -755,6 +755,14 @@ public class MultiProtocolURI implements Serializable, Comparable\n"); + writer.write("\n\n\n"); writer.write("\n"); writer.write("\n"); SolrParams params = request.getOriginalParams(); - boolean discover = params.getBool("discover", false); String grep = params.get("grep"); String query = ""; String q = params.get("q"); if (q == null) q = ""; @@ -112,8 +106,6 @@ public class GrepHTMLResponseWriter implements QueryResponseWriter { NamedList paramsList = params.toNamedList(); paramsList.remove("wt"); String xmlquery = dqp.matcher("/solr/select?" + SolrParams.toSolrParams(paramsList).toString()).replaceAll("%22"); - writer.write("
\"API\"\n"); - writer.write("This search result can also be retrieved as XML. Click the API icon to see an example call to the search rss API.
\n"); DocList response = ((ResultContext) values.get("response")).docs; final int sz = response.size(); @@ -121,10 +113,10 @@ public class GrepHTMLResponseWriter implements QueryResponseWriter { SolrIndexSearcher searcher = request.getSearcher(); DocIterator iterator = response.iterator(); IndexSchema schema = request.getSchema(); - writer.write("Document Grep for query \"" + query + "\" and grep phrase \"" + grep + "\"\n\n"); - - LinkedHashMap> sentenceCache = new LinkedHashMap>(); - + String h1 = "Document Grep for query \"" + query + "\" and grep phrase \"" + grep + "\""; + writer.write("" + h1 + "\n\n

" + h1 + "

\n"); + writer.write("
\"API\"\n"); + writer.write("This search result can also be retrieved as XML. Click the API icon to see an example call to the search rss API.\n"); for (int i = 0; i < sz; i++) { int id = iterator.nextDoc(); Document doc = searcher.doc(id, DEFAULT_FIELD_LIST); @@ -141,24 +133,7 @@ public class GrepHTMLResponseWriter implements QueryResponseWriter { line = sr.next(); if (line.length() > 0) sentences.add(line.toString()); } - sentenceCache.put(sku, sentences); - } - - OrderedScoreMap scores = null; - if (discover) { - // for each line make a statistic about the number of occurrences somewhere else - SolrConnector connector = Switchboard.getSwitchboard().index.fulltext().getDefaultConnector(); - scores = new OrderedScoreMap(null); - for (Map.Entry> entry: sentenceCache.entrySet()) { - for (String line: entry.getValue()) { - long count = connector.getCountByQuery("text_t:\"" + line + "\""); - if (count > 0) scores.inc(entry.getKey()); - } - } - } - - for (Map.Entry> entry: sentenceCache.entrySet()) { - writeDoc(writer, entry.getKey(), entry.getValue(), grep, scores); + writeDoc(writer, sku, sentences, grep); } } else { writer.write("No Document Found\n\n"); @@ -167,7 +142,7 @@ public class GrepHTMLResponseWriter implements QueryResponseWriter { writer.write("\n"); } - private static final void writeDoc(Writer writer, String url, ArrayList sentences, String grep, OrderedScoreMap scores) throws IOException { + private static final void writeDoc(Writer writer, String url, ArrayList sentences, String grep) throws IOException { writer.write("
\n"); writer.write("
\n"); writer.write("

" + url + "

\n"); @@ -180,29 +155,19 @@ public class GrepHTMLResponseWriter implements QueryResponseWriter { if (grep == null || grep.length() == 0) writer.write("all lines in document"); else {writer.write("matches for grep phrase \"");writer.write(grep);writer.write("\"");} } writer.write(""); - writedd(writer, line, scores); - } - if (scores != null) { - Collection discoveries = scores.keyList(false); - writer.write("
Citations:
"); - for (String u: discoveries) { - writer.write("
"); - writer.write(Integer.toString(scores.get(u))); - writer.write(" citations
"); - writedd(writer, u, scores); - } + writedd(writer, line, grep); } writer.write("\n"); writer.write("
\n"); writer.write("
\n"); } - private static void writedd(Writer writer, String line, OrderedScoreMap scores) throws IOException { + private static void writedd(Writer writer, String line, String grep) throws IOException { writer.write("
"); + writer.write("%22&rows=100&grep=%22"); + XML.escapeAttributeValue(grep, writer); + writer.write("%22&wt=grephtml\">"); XML.escapeAttributeValue(line, writer); writer.write("
\n"); } diff --git a/source/net/yacy/cora/federate/solr/responsewriter/HTMLResponseWriter.java b/source/net/yacy/cora/federate/solr/responsewriter/HTMLResponseWriter.java index 7bcabcfa1..b3d83c7df 100644 --- a/source/net/yacy/cora/federate/solr/responsewriter/HTMLResponseWriter.java +++ b/source/net/yacy/cora/federate/solr/responsewriter/HTMLResponseWriter.java @@ -90,7 +90,7 @@ public class HTMLResponseWriter implements QueryResponseWriter { paramsList.remove("wt"); String xmlquery = dqp.matcher("/solr/select?" + SolrParams.toSolrParams(paramsList).toString()).replaceAll("%22"); writer.write("
\"API\"\n"); - writer.write("This search result can also be retrieved as XML. Click the API icon to see an example call to the search rss API.
\n"); + writer.write("This search result can also be retrieved as XML. Click the API icon to see this page as XML.
\n"); DocList response = ((ResultContext) values.get("response")).docs; final int sz = response.size();