diff --git a/defaults/yacy.init b/defaults/yacy.init index 51be0538a..a6a2a1024 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -769,6 +769,7 @@ search.result.show.date = true search.result.show.size = false search.result.show.metadata = false search.result.show.parser = false +search.result.show.citation = true search.result.show.pictures = false search.result.show.cache = true search.result.show.proxy = false diff --git a/htroot/ConfigPortal.html b/htroot/ConfigPortal.html index ad8487564..db03e3935 100644 --- a/htroot/ConfigPortal.html +++ b/htroot/ConfigPortal.html @@ -79,6 +79,7 @@ Size  Metadata  Parser  + Citations  Pictures  Cache Augmented Browsing diff --git a/htroot/ConfigPortal.java b/htroot/ConfigPortal.java index fab45a661..456e6d445 100644 --- a/htroot/ConfigPortal.java +++ b/htroot/ConfigPortal.java @@ -99,6 +99,7 @@ public class ConfigPortal { sb.setConfig("search.result.show.size", post.getBoolean("search.result.show.size")); sb.setConfig("search.result.show.metadata", post.getBoolean("search.result.show.metadata")); sb.setConfig("search.result.show.parser", post.getBoolean("search.result.show.parser")); + sb.setConfig("search.result.show.citation", post.getBoolean("search.result.show.citation")); sb.setConfig("search.result.show.pictures", post.getBoolean("search.result.show.pictures")); sb.setConfig("search.result.show.cache", post.getBoolean("search.result.show.cache")); sb.setConfig("search.result.show.proxy", post.getBoolean("search.result.show.proxy")); @@ -170,6 +171,7 @@ public class ConfigPortal { sb.setConfig("search.result.show.size", config.getProperty("search.result.show.size","false")); sb.setConfig("search.result.show.metadata", config.getProperty("search.result.show.metadata","false")); sb.setConfig("search.result.show.parser", config.getProperty("search.result.show.parser","false")); + sb.setConfig("search.result.show.citation", config.getProperty("search.result.show.citation","false")); sb.setConfig("search.result.show.pictures", config.getProperty("search.result.show.pictures","false")); sb.setConfig("search.result.show.cache", config.getProperty("search.result.show.cache","true")); sb.setConfig("search.result.show.proxy", config.getProperty("search.result.show.proxy","false")); @@ -205,6 +207,7 @@ public class ConfigPortal { prop.put("search.result.show.size", sb.getConfigBool("search.result.show.size", false) ? 1 : 0); prop.put("search.result.show.metadata", sb.getConfigBool("search.result.show.metadata", false) ? 1 : 0); prop.put("search.result.show.parser", sb.getConfigBool("search.result.show.parser", false) ? 1 : 0); + prop.put("search.result.show.citation", sb.getConfigBool("search.result.show.citation", false) ? 1 : 0); prop.put("search.result.show.pictures", sb.getConfigBool("search.result.show.pictures", false) ? 1 : 0); prop.put("search.result.show.cache", sb.getConfigBool("search.result.show.cache", false) ? 1 : 0); prop.put("search.result.show.proxy", sb.getConfigBool("search.result.show.proxy", false) ? 1 : 0); diff --git a/htroot/ConfigSearchPage_p.html b/htroot/ConfigSearchPage_p.html index d86c07b5b..e34a44627 100644 --- a/htroot/ConfigSearchPage_p.html +++ b/htroot/ConfigSearchPage_p.html @@ -179,6 +179,7 @@ $(function() {  | 42 kbyte  | Metadata  | Parser +  | Citation  | Pictures  | Cache  | Augmented Browsing @@ -190,6 +191,7 @@ $(function() { + diff --git a/htroot/ConfigSearchPage_p.java b/htroot/ConfigSearchPage_p.java index 7306b23ea..9f4ef6f7d 100644 --- a/htroot/ConfigSearchPage_p.java +++ b/htroot/ConfigSearchPage_p.java @@ -72,6 +72,7 @@ public class ConfigSearchPage_p { sb.setConfig("search.result.show.size", post.getBoolean("search.result.show.size")); sb.setConfig("search.result.show.metadata", post.getBoolean("search.result.show.metadata")); sb.setConfig("search.result.show.parser", post.getBoolean("search.result.show.parser")); + sb.setConfig("search.result.show.citation", post.getBoolean("search.result.show.citation")); sb.setConfig("search.result.show.pictures", post.getBoolean("search.result.show.pictures")); sb.setConfig("search.result.show.cache", post.getBoolean("search.result.show.cache")); sb.setConfig("search.result.show.proxy", post.getBoolean("search.result.show.proxy")); @@ -124,6 +125,7 @@ public class ConfigSearchPage_p { sb.setConfig("search.result.show.size", config.getProperty("search.result.show.size","false")); sb.setConfig("search.result.show.metadata", config.getProperty("search.result.show.metadata","false")); sb.setConfig("search.result.show.parser", config.getProperty("search.result.show.parser","false")); + sb.setConfig("search.result.show.citation", config.getProperty("search.result.show.citation","false")); sb.setConfig("search.result.show.pictures", config.getProperty("search.result.show.pictures","false")); sb.setConfig("search.result.show.cache", config.getProperty("search.result.show.cache","true")); sb.setConfig("search.result.show.proxy", config.getProperty("search.result.show.proxy","false")); @@ -150,6 +152,7 @@ public class ConfigSearchPage_p { prop.put("search.result.show.size", sb.getConfigBool("search.result.show.size", false) ? 1 : 0); prop.put("search.result.show.metadata", sb.getConfigBool("search.result.show.metadata", false) ? 1 : 0); prop.put("search.result.show.parser", sb.getConfigBool("search.result.show.parser", false) ? 1 : 0); + prop.put("search.result.show.citation", sb.getConfigBool("search.result.show.citation", false) ? 1 : 0); prop.put("search.result.show.pictures", sb.getConfigBool("search.result.show.pictures", false) ? 1 : 0); prop.put("search.result.show.cache", sb.getConfigBool("search.result.show.cache", false) ? 1 : 0); prop.put("search.result.show.proxy", sb.getConfigBool("search.result.show.proxy", false) ? 1 : 0); diff --git a/htroot/api/citation.html b/htroot/api/citation.html new file mode 100644 index 000000000..268ba99f8 --- /dev/null +++ b/htroot/api/citation.html @@ -0,0 +1,34 @@ + + + +YaCy '#[clientname]#': Document Citations for url #[url]# +#%env/templates/metas.template%# + + +#%env/templates/embeddedheader.template%# +

Document Citations for
#[url]#

+ #(similar)#:: +

Similar documents from different hosts:

+ + #(/similar)# +
+
+ +
#{sentences}# +
#[dt]#
#[dd]#
+ #{/sentences}#
+
+
+
+
+ +
#{citations}# +
#[dt]#
#[dd]#
+ #{/citations}#
+
+
+#%env/templates/embeddedfooter.template%# + + diff --git a/htroot/api/citation.java b/htroot/api/citation.java new file mode 100644 index 000000000..8cee1a124 --- /dev/null +++ b/htroot/api/citation.java @@ -0,0 +1,199 @@ +/** + * citation + * Copyright 2013 by Michael Peter Christen + * First released 12.6.2013 at http://yacy.net + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . + */ + + +import java.io.IOException; +import java.net.MalformedURLException; +import java.util.ArrayList; +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.Set; +import java.util.TreeSet; + +import org.apache.solr.common.SolrDocument; +import org.apache.solr.common.SolrDocumentList; + +import net.yacy.cora.document.ASCII; +import net.yacy.cora.federate.solr.connector.SolrConnector; +import net.yacy.cora.protocol.RequestHeader; +import net.yacy.cora.sorting.OrderedScoreMap; +import net.yacy.document.SentenceReader; +import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.search.Switchboard; +import net.yacy.search.index.Segment; +import net.yacy.search.schema.CollectionSchema; +import net.yacy.server.serverObjects; +import net.yacy.server.serverSwitch; + +public class citation { + + public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) { + // return variable that accumulates replacements + final Switchboard sb = (Switchboard) env; + final serverObjects prop = new serverObjects(); + final Segment segment = sb.index; + final SolrConnector connector = segment.fulltext().getDefaultConnector(); + + // avoid UNRESOLVED PATTERN + prop.put("url", ""); + prop.put("citations", 0); + prop.put("sentences", 0); + + DigestURI uri = null; + String url = ""; + String hash = ""; + int ch = 10; + if (post != null) { + if (post.containsKey("url")) { + url = post.get("url"); + if (!url.startsWith("http://") && + !url.startsWith("https://") && + !url.startsWith("ftp://") && + !url.startsWith("smb://") && + !url.startsWith("file://")) { + url = "http://" + url; + } + } + if (post.containsKey("hash")) { + hash = post.get("hash"); + } + if (post.containsKey("ch")) { + ch = post.getInt("ch", ch); + } + } + + if (url.length() > 0) { + try { + uri = new DigestURI(url, null); + hash = ASCII.String(uri.hash()); + } catch (MalformedURLException e) {} + } + if (uri == null && hash.length() > 0) { + uri = sb.getURL(ASCII.getBytes(hash)); + } + if (uri == null) return prop; // no proper url addressed + url = uri.toNormalform(true); + prop.put("url", url); + + // get the document from the index + SolrDocument doc; + try { + doc = segment.fulltext().getDefaultConnector().getDocumentById(hash, CollectionSchema.title.getSolrFieldName(), CollectionSchema.text_t.getSolrFieldName()); + } catch (IOException e1) { + return prop; + } + @SuppressWarnings("unchecked") + ArrayList title = (ArrayList) doc.getFieldValue(CollectionSchema.title.getSolrFieldName()); + String text = (String) doc.getFieldValue(CollectionSchema.text_t.getSolrFieldName()); + + ArrayList sentences = new ArrayList(); + if (title != null) for (String s: title) if (s.length() > 0) sentences.add(s); + SentenceReader sr = new SentenceReader(text); + StringBuilder line; + while (sr.hasNext()) { + line = sr.next(); + if (line.length() > 0) sentences.add(line.toString()); + } + + // for each line make a statistic about the number of occurrences somewhere else + OrderedScoreMap scores = new OrderedScoreMap(null); // accumulates scores for citating urls + LinkedHashMap> sentenceOcc = new LinkedHashMap>(); + for (String sentence: sentences) { + if (sentence == null || sentence.length() < 40) { + // do not count the very short sentences + sentenceOcc.put(sentence, null); + continue; + } + try { + sentence = sentence.replace('"', '\''); + SolrDocumentList doclist = connector.getDocumentListByQuery("text_t:\"" + sentence + "\"", 0, 100, CollectionSchema.sku.getSolrFieldName()); + int count = (int) doclist.getNumFound(); + if (count > 0) { + Set list = new TreeSet(); + for (SolrDocument d: doclist) { + String u = (String) d.getFieldValue(CollectionSchema.sku.getSolrFieldName()); + if (u == null || u.equals(url)) continue; + scores.inc(u); + try {list.add(new DigestURI(u, null));} catch (MalformedURLException e) {} + } + sentenceOcc.put(sentence, list); + } + } catch (Throwable ee) { + + } + } + sentences.clear(); // we do not need this again + + // iterate the sentences + int i = 0; + for (Map.Entry> se: sentenceOcc.entrySet()) { + prop.put("sentences_" + i + "_dt", i); + StringBuilder dd = new StringBuilder(se.getKey()); + Set app = se.getValue(); + if (app != null && app.size() > 0) { + dd.append("
appears in:"); + for (DigestURI u: app) { + if (u != null) { + dd.append(" ").append(u.getHost()).append(""); + } + } + } + prop.put("sentences_" + i + "_dd", dd.toString()); + i++; + } + prop.put("sentences", i); + + // iterate the citations in order of number of citations + i = 0; + for (String u: scores.keyList(false)) { + try { + DigestURI uu = new DigestURI(u, null); + prop.put("citations_" + i + "_dt", "" + u + ""); + StringBuilder dd = new StringBuilder(); + dd.append("makes ").append(Integer.toString(scores.get(u))).append(" citations: of ").append(url); + for (Map.Entry> se: sentenceOcc.entrySet()) { + Set occurls = se.getValue(); + if (occurls != null && occurls.contains(uu)) dd.append("
").append(se.getKey()).append(""); + } + prop.put("citations_" + i + "_dd", dd.toString()); + i++; + } catch (MalformedURLException e) {} + } + prop.put("citations", i); + + // find similar documents from different hosts + i = 0; + for (String u: scores.keyList(false)) { + if (scores.get(u) < ch) continue; + try { + DigestURI uu = new DigestURI(u, null); + if (uu.getOrganization().equals(uri.getOrganization())) continue; + prop.put("similar_links_" + i + "_url", u); + i++; + } catch (MalformedURLException e) {} + } + prop.put("similar_links", i); + prop.put("similar", i > 0 ? 1 : 0); + + // return rewrite properties + return prop; + } + +} diff --git a/htroot/yacysearchitem.html b/htroot/yacysearchitem.html index 9b30e35b5..58996c0e7 100644 --- a/htroot/yacysearchitem.html +++ b/htroot/yacysearchitem.html @@ -28,6 +28,7 @@ #(showSize)#:: | #[sizename]##(/showSize)# #(showMetadata)#:: | Metadata#(/showMetadata)# #(showParser)#:: | Parser#(/showParser)# + #(showCitation)#:: | Citations#(/showCitation)# #(showPictures)#:: | Pictures#(/showPictures)# #(showCache)#:: | Cache#(/showCache)# #(showProxy)#:: | Augmented Browsing#(/showProxy)# diff --git a/htroot/yacysearchitem.java b/htroot/yacysearchitem.java index 275d2e99e..80783193a 100644 --- a/htroot/yacysearchitem.java +++ b/htroot/yacysearchitem.java @@ -133,6 +133,7 @@ public class yacysearchitem { prop.put("content_showSize", sb.getConfigBool("search.result.show.size", true) ? 1 : 0); prop.put("content_showMetadata", sb.getConfigBool("search.result.show.metadata", true) ? 1 : 0); prop.put("content_showParser", sb.getConfigBool("search.result.show.parser", true) ? 1 : 0); + prop.put("content_showCitation", sb.getConfigBool("search.result.show.citation", true) ? 1 : 0); prop.put("content_showPictures", sb.getConfigBool("search.result.show.pictures", true) ? 1 : 0); prop.put("content_showCache", sb.getConfigBool("search.result.show.cache", true) && Cache.has(resultURL.hash()) ? 1 : 0); prop.put("content_showProxy", sb.getConfigBool("search.result.show.proxy", true) ? 1 : 0); @@ -198,6 +199,7 @@ public class yacysearchitem { prop.put("content_showProxy_link", resultUrlstring); prop.put("content_showHostBrowser_link", resultUrlstring); prop.put("content_showParser_urlhash", resulthashString); + prop.put("content_showCitation_urlhash", resulthashString); prop.put("content_showTags_urlhash", resulthashString); prop.put("content_urlhexhash", Seed.b64Hash2hexHash(resulthashString)); prop.putHTML("content_urlname", nxTools.shortenURLString(result.urlname(), MAX_URL_LENGTH)); diff --git a/source/net/yacy/cora/document/MultiProtocolURI.java b/source/net/yacy/cora/document/MultiProtocolURI.java index 979637254..d5e73af0b 100644 --- a/source/net/yacy/cora/document/MultiProtocolURI.java +++ b/source/net/yacy/cora/document/MultiProtocolURI.java @@ -755,6 +755,14 @@ public class MultiProtocolURI implements Serializable, Comparable\n"); + writer.write("\n\n\n"); writer.write("\n"); writer.write("\n"); SolrParams params = request.getOriginalParams(); - boolean discover = params.getBool("discover", false); String grep = params.get("grep"); String query = ""; String q = params.get("q"); if (q == null) q = ""; @@ -112,8 +106,6 @@ public class GrepHTMLResponseWriter implements QueryResponseWriter { NamedList paramsList = params.toNamedList(); paramsList.remove("wt"); String xmlquery = dqp.matcher("/solr/select?" + SolrParams.toSolrParams(paramsList).toString()).replaceAll("%22"); - writer.write("
\"API\"\n"); - writer.write("This search result can also be retrieved as XML. Click the API icon to see an example call to the search rss API.
\n"); DocList response = ((ResultContext) values.get("response")).docs; final int sz = response.size(); @@ -121,10 +113,10 @@ public class GrepHTMLResponseWriter implements QueryResponseWriter { SolrIndexSearcher searcher = request.getSearcher(); DocIterator iterator = response.iterator(); IndexSchema schema = request.getSchema(); - writer.write("Document Grep for query \"" + query + "\" and grep phrase \"" + grep + "\"\n\n"); - - LinkedHashMap> sentenceCache = new LinkedHashMap>(); - + String h1 = "Document Grep for query \"" + query + "\" and grep phrase \"" + grep + "\""; + writer.write("" + h1 + "\n\n

" + h1 + "

\n"); + writer.write("
\"API\"\n"); + writer.write("This search result can also be retrieved as XML. Click the API icon to see an example call to the search rss API.\n"); for (int i = 0; i < sz; i++) { int id = iterator.nextDoc(); Document doc = searcher.doc(id, DEFAULT_FIELD_LIST); @@ -141,24 +133,7 @@ public class GrepHTMLResponseWriter implements QueryResponseWriter { line = sr.next(); if (line.length() > 0) sentences.add(line.toString()); } - sentenceCache.put(sku, sentences); - } - - OrderedScoreMap scores = null; - if (discover) { - // for each line make a statistic about the number of occurrences somewhere else - SolrConnector connector = Switchboard.getSwitchboard().index.fulltext().getDefaultConnector(); - scores = new OrderedScoreMap(null); - for (Map.Entry> entry: sentenceCache.entrySet()) { - for (String line: entry.getValue()) { - long count = connector.getCountByQuery("text_t:\"" + line + "\""); - if (count > 0) scores.inc(entry.getKey()); - } - } - } - - for (Map.Entry> entry: sentenceCache.entrySet()) { - writeDoc(writer, entry.getKey(), entry.getValue(), grep, scores); + writeDoc(writer, sku, sentences, grep); } } else { writer.write("No Document Found\n\n"); @@ -167,7 +142,7 @@ public class GrepHTMLResponseWriter implements QueryResponseWriter { writer.write("\n"); } - private static final void writeDoc(Writer writer, String url, ArrayList sentences, String grep, OrderedScoreMap scores) throws IOException { + private static final void writeDoc(Writer writer, String url, ArrayList sentences, String grep) throws IOException { writer.write("
\n"); writer.write("
\n"); writer.write("

" + url + "

\n"); @@ -180,29 +155,19 @@ public class GrepHTMLResponseWriter implements QueryResponseWriter { if (grep == null || grep.length() == 0) writer.write("all lines in document"); else {writer.write("matches for grep phrase \"");writer.write(grep);writer.write("\"");} } writer.write(""); - writedd(writer, line, scores); - } - if (scores != null) { - Collection discoveries = scores.keyList(false); - writer.write("
Citations:
"); - for (String u: discoveries) { - writer.write("
"); - writer.write(Integer.toString(scores.get(u))); - writer.write(" citations
"); - writedd(writer, u, scores); - } + writedd(writer, line, grep); } writer.write("\n"); writer.write("
\n"); writer.write("
\n"); } - private static void writedd(Writer writer, String line, OrderedScoreMap scores) throws IOException { + private static void writedd(Writer writer, String line, String grep) throws IOException { writer.write("
"); + writer.write("%22&rows=100&grep=%22"); + XML.escapeAttributeValue(grep, writer); + writer.write("%22&wt=grephtml\">"); XML.escapeAttributeValue(line, writer); writer.write("
\n"); } diff --git a/source/net/yacy/cora/federate/solr/responsewriter/HTMLResponseWriter.java b/source/net/yacy/cora/federate/solr/responsewriter/HTMLResponseWriter.java index 7bcabcfa1..b3d83c7df 100644 --- a/source/net/yacy/cora/federate/solr/responsewriter/HTMLResponseWriter.java +++ b/source/net/yacy/cora/federate/solr/responsewriter/HTMLResponseWriter.java @@ -90,7 +90,7 @@ public class HTMLResponseWriter implements QueryResponseWriter { paramsList.remove("wt"); String xmlquery = dqp.matcher("/solr/select?" + SolrParams.toSolrParams(paramsList).toString()).replaceAll("%22"); writer.write("
\"API\"\n"); - writer.write("This search result can also be retrieved as XML. Click the API icon to see an example call to the search rss API.
\n"); + writer.write("This search result can also be retrieved as XML. Click the API icon to see this page as XML.
\n"); DocList response = ((ResultContext) values.get("response")).docs; final int sz = response.size();