mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
Adding heuristic to get search results from configured systems which support opensearch specification
- any system supporting opensearch specification can be configured - search query is only forwarded to remote system if not enough results available on local peer - discover function provided, checking the local Solr index for links to opensearchdescription files, to add to the config - sample config file with some general search engines with opensearch support
This commit is contained in:
parent
eb90d38cd7
commit
168b1d130d
23
defaults/heuristicopensearch.conf
Normal file
23
defaults/heuristicopensearch.conf
Normal file
|
@ -0,0 +1,23 @@
|
|||
## List of search engines used by YaCy heuristic search option
|
||||
## Format example
|
||||
## SystemName = http://www.thesystem.org/search?q={searchTerms}
|
||||
## all opensearch parameters can be used in search url
|
||||
## {searchTerms} is replaced by search query
|
||||
## {startIndex?} is replaced by result start
|
||||
## {count} is replaced by expected number of results
|
||||
##
|
||||
## the syntax of this file:
|
||||
## - all lines beginning with '##' are comments
|
||||
## - all non-empty lines not beginning with '#' are keyword lines
|
||||
## - all lines beginning with '#' and where the second character is not '#' are commented-out keyword lines
|
||||
##
|
||||
|
||||
#Nutch = http://www.search2.net/opensearch?query={searchTerms} # get 20 results from Nutch
|
||||
#Blekko = http://blekko.com/ws/{searchTerms}+/rss # get 20 results from blekko
|
||||
#Faroo-Web = http://www.faroo.com/instant.rss?q={searchTerms}&start={startIndex}&length={count}&l=en&src=web # get results from Faroo web-search
|
||||
#Faroo-News = http://www.faroo.com/instant.rss?q={searchTerms}&start={startIndex}&length=20&l=en&src=news # get results from Faroo news-search
|
||||
#openBDB = http://www.openbdb.com/b/{searchTerms}.xml # Open Book Database
|
||||
#Twitter = http://search.twitter.com/search.rss?rpp=20&q={searchTerms}
|
||||
#WordPress.com = http://en.search.wordpress.com/?q={searchTerms}&f=feed&page={startPage?} #Search WordPress.com Blogs
|
||||
#Sueddeutsche.de = http://suche.sueddeutsche.de/query/{searchTerms}?output=rss # Sueddeutsche Zeitung Artikel Archiv
|
||||
#Los Angeles Times = http://framework.latimes.com/?s={searchTerms}&feed=rss2
|
|
@ -1020,6 +1020,7 @@ heuristic.blekko = false
|
|||
heuristic.twitter = false
|
||||
heuristic.searchresults = false
|
||||
heuristic.searchresults.crawlglobal = false
|
||||
heuristic.opensearch = false
|
||||
|
||||
# colours for generic design
|
||||
color_background = #FFFFFF
|
||||
|
|
|
@ -97,6 +97,66 @@
|
|||
</fieldset>
|
||||
</form>
|
||||
|
||||
<fieldset>
|
||||
<form id="HeuristicFormOpenSearch" method="post" action="ConfigHeuristics_p.html" enctype="multipart/form-data" accept-charset="UTF-8">
|
||||
<legend>
|
||||
<input type="checkbox" name="opensearch_check" id="opensearch" onclick="window.location.href='ConfigHeuristics_p.html?#(opensearch.checked)#opensearch_on=::opensearch_off=#(/opensearch.checked)#'" value="opensearch"#(opensearch.checked)#:: checked="checked"#(/opensearch.checked)# />
|
||||
<label for="opensearch">opensearch load external search result list from active systems below</label>
|
||||
</legend>
|
||||
<p>
|
||||
When using this heuristic, then every search request line is used for a call to listed opensearch systems until enough results to fill the current search page are available.
|
||||
20 results are taken from the remote system and loaded simultaneously, parsed and indexed immediately.
|
||||
To find out more about OpenSearch see <a href="http://www.opensearch.org" target="_blank">OpenSearch.org</a>
|
||||
</p>
|
||||
</form>
|
||||
|
||||
<form action="ConfigHeuristics_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
|
||||
<div>
|
||||
<b>Available/Active Opensearch System</b>
|
||||
<table class="sortable" border="0" cellpadding="2" cellspacing="1">
|
||||
<tr class="TableHeader" valign="bottom">
|
||||
<td>Active</td>
|
||||
<td>Title</td>
|
||||
<td>Comment</td>
|
||||
<td>Url <small>(format opensearch <a href="http://www.opensearch.org/Specifications/OpenSearch/1.1#OpenSearch_URL_template_syntax" target="_blank">Url template syntax</a>)</small></td>
|
||||
<td>delete</td>
|
||||
</tr>
|
||||
#{osdcfg}#
|
||||
<tr class="TableCell#(dark)#Light::Dark#(/dark)#">
|
||||
<td align="center"><input type="checkbox" name="ossys_#[title]#" value="checked" #(checked)#::checked="checked"#(/checked)#/></td>
|
||||
<td align="left"><b><a href="#[urlhostlink]#" target="_blank">#[title]#</a></b> </td>
|
||||
<td align="left">#[comment]#</td>
|
||||
<td align="left"><input type="text" name="ossys_url_#[title]#" value="#[url]#" size="70"/></td>
|
||||
<td align="center"><input type="checkbox" name="ossys_del_#[title]#" value="checked" #(delchecked)#::checked="checked"#(/delchecked)#/></td>
|
||||
</tr>
|
||||
#{/osdcfg}#
|
||||
<tr>
|
||||
<td><small>new</small></td>
|
||||
<td><input type="text" name="ossys_newtitle"/></td>
|
||||
<td><input type="text" name="ossys_newcomment"/></td>
|
||||
<td><input type="text" name="ossys_newurl" size="70"/></td>
|
||||
<td><input type="submit" name="addnewosd" value="add"/></td>
|
||||
</tr>
|
||||
</table>
|
||||
</div>
|
||||
<div>
|
||||
<input type="submit" name="setopensearch" value="Save" class="submitready"/>
|
||||
<span style="color:red">#[osderrmsg]#</span>
|
||||
</div>
|
||||
<br>
|
||||
<div>
|
||||
<div style="float:right">
|
||||
<input type="submit" name="discoverosd" id="discoverosd" value="discover from index" class="submitready" onclick="return confirm('start background task, depending on index size this may run a long time')"/>
|
||||
</div>
|
||||
With the button "discover from index" you can search within the metadata of your local index to find systems which support the Opensearch specification.
|
||||
The task is started in the background. It may take some minutes before new entries appear (after refreshing the page).
|
||||
Alternatively you may <a href="?copydefaultosdconfig=">copy &amp; paste an example config file</a> located in <i>defaults/heuristicopensearch.conf</i> to the DATA/SETTINGS directory.
|
||||
For the discover function the field <i>outboundlinks_tag_txt</i> (and <i>inboundlinks_tag_txt</i>) has to be switched on in the <a href="IndexFederated_p.html">Solr Schema</a>.
|
||||
#{osdsolrfieldswitch}#<input type="submit" name="switchsolrfieldson" value="switch Solr fields on" class="submitready" onclick="return confirm('modify Solr Schema')"/>#{/osdsolrfieldswitch}#
|
||||
</div>
|
||||
</form>
|
||||
</fieldset>
|
||||
|
||||
#%env/templates/footer.template%#
|
||||
</body>
|
||||
</html>
|
||||
|
|
|
@ -5,9 +5,9 @@
|
|||
//
|
||||
// This is a part of YaCy, a peer-to-peer based web search engine
|
||||
//
|
||||
// $LastChangedDate: 2010-02-09 18:14:16 +0100 (Di, 09 Feb 2010) $
|
||||
// $LastChangedRevision: 6658 $
|
||||
// $LastChangedBy: lotus $
|
||||
// $LastChangedDate: 2012-12-19 $
|
||||
// $LastChangedRevision: $
|
||||
// $LastChangedBy: reger $
|
||||
//
|
||||
// LICENSE
|
||||
//
|
||||
|
@ -25,9 +25,16 @@
|
|||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
import com.google.common.io.Files;
|
||||
import java.io.File;
|
||||
import net.yacy.cora.protocol.RequestHeader;
|
||||
import net.yacy.data.WorkTables;
|
||||
import net.yacy.search.Switchboard;
|
||||
import java.io.IOException;
|
||||
import java.util.Iterator;
|
||||
import net.yacy.cora.federate.yacy.ConfigurationSet;
|
||||
import net.yacy.cora.federate.opensearch.OpenSearchConnector;
|
||||
import net.yacy.cora.federate.solr.YaCySchema;
|
||||
import net.yacy.server.serverObjects;
|
||||
import net.yacy.server.serverSwitch;
|
||||
|
||||
|
@ -38,6 +45,7 @@ public class ConfigHeuristics_p {
|
|||
final Switchboard sb = (Switchboard) env;
|
||||
final serverObjects prop = new serverObjects();
|
||||
|
||||
String osderrmsg = "";
|
||||
if (post != null) {
|
||||
|
||||
// store this call as api call
|
||||
|
@ -53,14 +61,160 @@ public class ConfigHeuristics_p {
|
|||
if (post.containsKey("blekko_off")) sb.setConfig("heuristic.blekko", false);
|
||||
if (post.containsKey("twitter_on")) sb.setConfig("heuristic.twitter", true);
|
||||
if (post.containsKey("twitter_off")) sb.setConfig("heuristic.twitter", false);
|
||||
if (post.containsKey("opensearch_on")) {
|
||||
sb.setConfig("heuristic.opensearch", true);
|
||||
// re-read config (and create work table)
|
||||
OpenSearchConnector os = new OpenSearchConnector(sb, true);
|
||||
if (os.getSize() == 0) {
|
||||
osderrmsg = "no active search targets are configured";
|
||||
}
|
||||
|
||||
}
|
||||
if (post.containsKey("opensearch_off")) sb.setConfig("heuristic.opensearch", false);
|
||||
if (post.containsKey("discoverosd")) {
|
||||
final boolean metafieldNOTavailable = sb.index.fulltext().getSolrScheme().containsDisabled(YaCySchema.outboundlinks_tag_txt.name());
|
||||
if (!metafieldNOTavailable) {
|
||||
OpenSearchConnector osc = new OpenSearchConnector(sb, false);
|
||||
if (osc.discoverFromSolrIndex(sb)) {
|
||||
osderrmsg = "started background search for target systems, refresh page after some minutes";
|
||||
} else {
|
||||
osderrmsg = "Solr index needs to be available and field outboundlinks_tag_txt on";
|
||||
}
|
||||
} else {
|
||||
osderrmsg = "Error: field outboundlinks_tag_txt needs to be activated in Solr index";
|
||||
}
|
||||
}
|
||||
|
||||
final String tmpurl = post.get("ossys_newurl");
|
||||
// if user entered new opensearch url but hit the wrong button, simulate "add" button
|
||||
if (tmpurl != null && !tmpurl.isEmpty()) post.put("addnewosd", 1);
|
||||
|
||||
if (post.containsKey("addnewosd")) {
|
||||
// add new entry to config file
|
||||
final String tmpname = post.get("ossys_newtitle");
|
||||
if (tmpname != null && tmpurl !=null) {
|
||||
if (!tmpname.isEmpty() && !tmpurl.isEmpty() && tmpurl.toLowerCase().contains("{searchterms}")) {
|
||||
final String tmpcomment = post.get("ossys_newcomment");
|
||||
OpenSearchConnector osc = new OpenSearchConnector(sb,false);
|
||||
osc.add (tmpname,tmpurl,false,tmpcomment);
|
||||
} else osderrmsg = "Url template must contain '{searchTerms}'";
|
||||
}
|
||||
}
|
||||
|
||||
if (post.containsKey("setopensearch")) {
|
||||
// read index scheme table flags
|
||||
writeopensearchcfg (sb,post);
|
||||
}
|
||||
|
||||
if (post.containsKey("switchsolrfieldson")) {
|
||||
final boolean metafieldNOTavailable = sb.index.fulltext().getSolrScheme().containsDisabled(YaCySchema.outboundlinks_tag_txt.name());
|
||||
if (metafieldNOTavailable) {
|
||||
ConfigurationSet.Entry entry;
|
||||
entry = sb.index.fulltext().getSolrScheme().get(YaCySchema.outboundlinks_tag_txt.name());
|
||||
if (entry != null && !entry.enabled()) {
|
||||
entry.setEnable(true);
|
||||
}
|
||||
entry = sb.index.fulltext().getSolrScheme().get(YaCySchema.inboundlinks_tag_txt.name());
|
||||
if (entry != null && !entry.enabled()) {
|
||||
entry.setEnable(true);
|
||||
}
|
||||
try {
|
||||
sb.index.fulltext().getSolrScheme().commit();
|
||||
} catch (IOException ex) {}
|
||||
}
|
||||
}
|
||||
|
||||
// copy default opensearch heuristic config with sample entries
|
||||
if (post.containsKey("copydefaultosdconfig")) {
|
||||
// prepare a solr index profile switch list
|
||||
final File osdDefaultConfig = new File(sb.getDataPath(), "defaults/heuristicopensearch.conf");
|
||||
final File osdConfig = new File(sb.getDataPath(), "DATA/SETTINGS/heuristicopensearch.conf");
|
||||
if (!osdConfig.exists() && osdDefaultConfig.exists()) {
|
||||
try {
|
||||
Files.copy(osdDefaultConfig, osdConfig);
|
||||
} catch (IOException ex) {
|
||||
osderrmsg = "file I/O error during copy";
|
||||
}
|
||||
} else {osderrmsg = "config file exists or default doesn't exist";}
|
||||
}
|
||||
}
|
||||
|
||||
final boolean showmetafieldbutton = sb.index.fulltext().getSolrScheme().containsDisabled(YaCySchema.outboundlinks_tag_txt.name());
|
||||
if (showmetafieldbutton) prop.put("osdsolrfieldswitch",1);
|
||||
prop.put("site.checked", sb.getConfigBool("heuristic.site", false) ? 1 : 0);
|
||||
prop.put("searchresult.checked", sb.getConfigBool("heuristic.searchresults", false) ? 1 : 0);
|
||||
prop.put("searchresultglobal.checked", sb.getConfigBool("heuristic.searchresults.crawlglobal", false) ? 1 : 0);
|
||||
prop.put("blekko.checked", sb.getConfigBool("heuristic.blekko", false) ? 1 : 0);
|
||||
prop.put("twitter.checked", sb.getConfigBool("heuristic.twitter", false) ? 1 : 0);
|
||||
prop.put("opensearch.checked", sb.getConfigBool("heuristic.opensearch", false) ? 1 : 0);
|
||||
|
||||
// display config file content
|
||||
final File f = new File (sb.getDataPath(),"DATA/SETTINGS/heuristicopensearch.conf");
|
||||
ConfigurationSet p = new ConfigurationSet(f);
|
||||
int c = 0;
|
||||
boolean dark = false;
|
||||
Iterator<ConfigurationSet.Entry> i = p.entryIterator();
|
||||
while (i.hasNext()) {
|
||||
ConfigurationSet.Entry e = i.next();
|
||||
prop.put("osdcfg_" + c + "_dark", dark ? 1 : 0);
|
||||
dark = !dark;
|
||||
prop.put("osdcfg_" + c + "_checked", e.enabled() ? 1 : 0);
|
||||
prop.putHTML("osdcfg_" + c + "_title", e.key());
|
||||
prop.putHTML("osdcfg_" + c + "_comment", e.getComment() != null ? e.getComment() : "");
|
||||
|
||||
String tmps = e.getValue();
|
||||
prop.putHTML("osdcfg_" + c + "_url", tmps);
|
||||
tmps = tmps.substring(0,tmps.lastIndexOf("/"));
|
||||
prop.putHTML("osdcfg_" + c + "_urlhostlink", tmps);
|
||||
|
||||
c++;
|
||||
}
|
||||
prop.put("osdcfg", c);
|
||||
prop.putHTML("osderrmsg",osderrmsg);
|
||||
return prop;
|
||||
}
|
||||
|
||||
private static void writeopensearchcfg(final Switchboard sb, final serverObjects post) {
|
||||
// read index scheme table flags
|
||||
|
||||
final File f = new File(sb.getDataPath(), "DATA/SETTINGS/heuristicopensearch.conf");
|
||||
ConfigurationSet cfg = new ConfigurationSet(f);
|
||||
final Iterator<ConfigurationSet.Entry> cfgentries = cfg.entryIterator();
|
||||
ConfigurationSet.Entry entry;
|
||||
boolean modified = false; // flag to remember changes
|
||||
while (cfgentries.hasNext()) {
|
||||
entry = cfgentries.next();
|
||||
final String sfn = post.get("ossys_url_" + entry.key());
|
||||
if (sfn != null) {
|
||||
if (!sfn.equals(entry.getValue())) {
|
||||
entry.setValue(sfn);
|
||||
modified = true;
|
||||
}
|
||||
}
|
||||
// set enable flag
|
||||
String v = post.get("ossys_" + entry.key());
|
||||
boolean c = v != null && v.equals("checked");
|
||||
if (entry.enabled() != c) {
|
||||
entry.setEnable(c);
|
||||
modified = true;
|
||||
}
|
||||
// delete entry from config
|
||||
v = post.get("ossys_del_" + entry.key());
|
||||
c = v != null && v.equals("checked");
|
||||
if (c) {
|
||||
cfgentries.remove();
|
||||
modified = true;
|
||||
}
|
||||
}
|
||||
if (modified) { // save settings to config file if modified
|
||||
try {
|
||||
cfg.commit();
|
||||
} catch (IOException ex) {
|
||||
}
|
||||
}
|
||||
// re-read config (and create/update work table)
|
||||
if (sb.getConfigBool("heuristic.opensearch", true)) {
|
||||
OpenSearchConnector os = new OpenSearchConnector(sb, true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,275 @@
|
|||
/**
|
||||
* OpenSearchConnector
|
||||
* Copyright 2012 by Michael Peter Christen
|
||||
* First released 03.11.2012 at http://yacy.net
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with this program in the file lgpl21.txt
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package net.yacy.cora.federate.opensearch;
|
||||
|
||||
import java.io.*;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.util.*;
|
||||
import net.yacy.cora.federate.solr.YaCySchema;
|
||||
import net.yacy.cora.federate.solr.connector.EmbeddedSolrConnector;
|
||||
import net.yacy.cora.federate.yacy.ConfigurationSet;
|
||||
import net.yacy.cora.federate.yacy.ConfigurationSet.Entry;
|
||||
import net.yacy.cora.util.SpaceExceededException;
|
||||
import net.yacy.document.parser.xml.opensearchdescriptionReader;
|
||||
import net.yacy.kelondro.blob.Tables;
|
||||
import net.yacy.kelondro.logging.Log;
|
||||
import net.yacy.search.Switchboard;
|
||||
import net.yacy.search.query.SearchEvent;
|
||||
import org.apache.solr.common.SolrDocument;
|
||||
import org.apache.solr.common.SolrDocumentList;
|
||||
|
||||
/**
|
||||
* Handling of queries to remote OpenSearch systems. Iterates to a list of
|
||||
* configured systems until number of needed results are available. Uses a
|
||||
* temporary work table to store search template urls for the iteration during
|
||||
* search.
|
||||
*/
|
||||
public class OpenSearchConnector {
|
||||
|
||||
private File confFile = null; // later initialized to DATA/SETTINGS/heuristicopensearch.conf
|
||||
private int size = 0; // remember the size of active opensearch targets
|
||||
|
||||
public OpenSearchConnector(Switchboard sb, boolean createworktable) {
|
||||
super();
|
||||
if (sb == null) {
|
||||
return;
|
||||
}
|
||||
|
||||
confFile = new File(sb.getDataPath(), "DATA/SETTINGS/heuristicopensearch.conf");
|
||||
|
||||
if (createworktable) { // read from config file and create worktable
|
||||
sb.tables.clear("opensearchsys");
|
||||
try {
|
||||
ConfigurationSet cfg = new ConfigurationSet(confFile);
|
||||
|
||||
// copy active opensearch systems to a work table (opensearchsys)
|
||||
Iterator<Entry> cfgentries = cfg.entryIterator();
|
||||
while (cfgentries.hasNext()) {
|
||||
Entry e = cfgentries.next();
|
||||
if (e.enabled()) {
|
||||
String title = e.key(); // get the title
|
||||
String urlstr = e.getValue(); // get the search template url
|
||||
|
||||
Tables.Data row = new Tables.Data();
|
||||
row.put("title", title);
|
||||
row.put("url", urlstr);
|
||||
try {
|
||||
sb.tables.insert("opensearchsys", row);
|
||||
} catch (SpaceExceededException ex) {
|
||||
Log.logException(ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
size = sb.tables.size("opensearchsys");
|
||||
} catch (IOException ex) {
|
||||
Log.logException(ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Sends a search request to remote systems listed in worktable until the
|
||||
* searchevent contains less than needed results. Depending on already
|
||||
* collected search results none to all configured systems are queried to
|
||||
* complete available search results.
|
||||
* if query search domain is LOCAL procedure does nothing.
|
||||
*/
|
||||
static public void query(Switchboard sb, SearchEvent theSearch) {
|
||||
if (theSearch != null && sb != null) {
|
||||
if (!theSearch.query.isLocal()) {
|
||||
try {
|
||||
Iterator<Tables.Row> ossysworktable = sb.tables.iterator("opensearchsys");
|
||||
int needres = theSearch.query.neededResults(); // get number of needed results
|
||||
while (ossysworktable.hasNext() && theSearch.query.getResultCount() < needres) {
|
||||
Tables.Row row = ossysworktable.next();
|
||||
String osurl = row.get("url", "");
|
||||
String name = row.get("title", "");
|
||||
// to reuse existing heuristicRSS procedure replace querystring with "$"
|
||||
// querystring is inserted/replaced inside heuristicRSS
|
||||
sb.heuristicRSS(parseSearchTemplate(osurl, "$", 0, theSearch.query.itemsPerPage), theSearch, "opensearch:" + name);
|
||||
}
|
||||
} catch (IOException ex) {
|
||||
Log.logWarning("OpenSearchConnector.query", "failed reading table opensearchsys");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* replace Opensearchdescription search template parameter with actual values
|
||||
*/
|
||||
private static String parseSearchTemplate(String searchurltemplate, String query, int start, int rows) {
|
||||
String tmps = searchurltemplate.replaceAll("\\?}=", "}="); // some optional parameters may include question mark '{param?}='
|
||||
tmps = tmps.replace("{startIndex}", Integer.toString(start));
|
||||
tmps = tmps.replace("{startPage}", "");
|
||||
tmps = tmps.replace("{count}", Integer.toString(rows));
|
||||
tmps = tmps.replace("{language}", "");
|
||||
tmps = tmps.replace("{inputEncoding}", "UTF-8");
|
||||
tmps = tmps.replace("{outputEncoding}", "UTF-8");
|
||||
return tmps.replace("{searchTerms}", query);
|
||||
}
|
||||
|
||||
/**
|
||||
* add a opensearch target system to the config file
|
||||
*/
|
||||
public boolean add(String name, String url, boolean active, String comment) {
|
||||
if (confFile == null) {
|
||||
return false;
|
||||
}
|
||||
|
||||
ConfigurationSet conf = new ConfigurationSet(confFile);
|
||||
if (name != null && !name.isEmpty()) {
|
||||
conf.add(name, null, active);
|
||||
Entry e = conf.get(name);
|
||||
e.setValue(url);
|
||||
e.setEnable(active);
|
||||
e.setComment(comment);
|
||||
conf.put(name, e);
|
||||
try {
|
||||
conf.commit();
|
||||
} catch (IOException ex) {
|
||||
Log.logWarning("OpenSearchConnector.add", "config file write error");
|
||||
}
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the number of active remote opensearch target systems
|
||||
*/
|
||||
public int getSize() {
|
||||
return size;
|
||||
}
|
||||
|
||||
/**
|
||||
* Discover opensearch description links from local (embedded) Solr index using
|
||||
* meta data field 'outboundlinks_tag_txt' and add found systems to the
|
||||
* config file
|
||||
*/
|
||||
public boolean discoverFromSolrIndex(final Switchboard sb) {
|
||||
if (sb == null) {
|
||||
return false;
|
||||
}
|
||||
final EmbeddedSolrConnector connector = (EmbeddedSolrConnector) sb.index.fulltext().getLocalSolr();
|
||||
// check if needed Solr fields are available (selected)
|
||||
if (connector == null) {
|
||||
Log.logSevere("OpenSearchConnector.Discover", "Error on connecting to embedded Solr index");
|
||||
return false;
|
||||
}
|
||||
final boolean metafieldNOTavailable = sb.index.fulltext().getSolrScheme().containsDisabled(YaCySchema.outboundlinks_tag_txt.name());
|
||||
if (metafieldNOTavailable) {
|
||||
Log.logWarning("OpenSearchConnector.Discover", "Solr Schema field outboundlinks_tag_txt must be switched on");
|
||||
return false;
|
||||
}
|
||||
// the solr query
|
||||
final String solrquerystr = YaCySchema.outboundlinks_tag_txt.getSolrFieldName() + ":\"rel=\\\"search\\\"\" OR "
|
||||
+ YaCySchema.inboundlinks_tag_txt.getSolrFieldName() + ":\"rel=\\\"search\\\"\"&fl="
|
||||
+ YaCySchema.sku.getSolrFieldName() + "," + YaCySchema.outboundlinks_tag_txt.getSolrFieldName() +"," + YaCySchema.inboundlinks_tag_txt.getSolrFieldName();
|
||||
final long numfound;
|
||||
try {
|
||||
SolrDocumentList docList = connector.query(solrquerystr, 0, 1);
|
||||
numfound = docList.getNumFound();
|
||||
if (numfound == 0) {
|
||||
Log.logInfo("OpenSearchConnector.Discover", "no results found, abort discover job");
|
||||
return false;
|
||||
} else {
|
||||
Log.logInfo("OpenSearchConnector.Discover", "start checking " + Long.toString(numfound) + " found index results");
|
||||
}
|
||||
} catch (IOException ex) {
|
||||
Log.logException(ex);
|
||||
return false;
|
||||
}
|
||||
|
||||
final long stoptime = System.currentTimeMillis() + 1000 * 3600; // make sure job doesn't run forever
|
||||
|
||||
// job to iterate through Solr index to find links to opensearchdescriptions
|
||||
// started as background job as connect timeouts may cause it run a long time
|
||||
final Thread job = new Thread() {
|
||||
@Override
|
||||
public void run() {
|
||||
try {
|
||||
boolean doloop = true;
|
||||
int loopnr = 0;
|
||||
Set<String> dblmem = new HashSet<String>(); // temp memory for already checked url
|
||||
while (doloop) {
|
||||
Log.logInfo("OpenSearchConnector.Discover", "start Solr query loop at " + Integer.toString(loopnr * 20) + " of " + Long.toString(numfound));
|
||||
SolrDocumentList docList = connector.query(solrquerystr, loopnr * 20, 20); // check chunk of 20 result documents
|
||||
loopnr++;
|
||||
if (stoptime < System.currentTimeMillis()) {// stop after max 1h
|
||||
doloop = false;
|
||||
Log.logInfo("OpenSearchConnector.Discover", "long running discover task aborted");
|
||||
}
|
||||
if (docList != null && docList.size() > 0) {
|
||||
Iterator<SolrDocument> docidx = docList.iterator();
|
||||
while (docidx.hasNext()) {
|
||||
SolrDocument sdoc = docidx.next();
|
||||
Collection<Object> tagtxtlist = sdoc.getFieldValues(YaCySchema.outboundlinks_tag_txt.getSolrFieldName());
|
||||
if (tagtxtlist == null) {
|
||||
tagtxtlist = sdoc.getFieldValues(YaCySchema.inboundlinks_tag_txt.getSolrFieldName());
|
||||
} else {
|
||||
tagtxtlist.addAll(sdoc.getFieldValues(YaCySchema.inboundlinks_tag_txt.getSolrFieldName()));
|
||||
}
|
||||
Iterator<Object> tagtxtidx = tagtxtlist.iterator();
|
||||
while (tagtxtidx.hasNext()) {
|
||||
// check and extract links to opensearchdescription
|
||||
// example: <a href="http://url/osd.xml" rel="search" name="xyz.com"></a>
|
||||
String tagtxt = (String) tagtxtidx.next();
|
||||
if (tagtxt.contains("search")) {
|
||||
int hrefstartpos = tagtxt.indexOf("href=");
|
||||
if (hrefstartpos > 0) {
|
||||
String hrefendpos = tagtxt.substring(hrefstartpos + 6);
|
||||
hrefstartpos = hrefendpos.indexOf('"');
|
||||
String hrefurltxt = hrefendpos.substring(0, hrefstartpos); // hrefurltxt contains now url to opensearchdescription
|
||||
try {
|
||||
URL url = new URL(hrefurltxt);
|
||||
//TODO: check Blacklist
|
||||
if (dblmem.add(url.getAuthority())) { // use only main path to detect double entries
|
||||
opensearchdescriptionReader os = new opensearchdescriptionReader(hrefurltxt);
|
||||
if (os.getRSSorAtomUrl() != null) {
|
||||
// add found system to config file
|
||||
add(os.getShortName(), os.getRSSorAtomUrl(), false, os.getItem("LongName"));
|
||||
Log.logInfo("OpenSearchConnector.Discover", "added " + os.getShortName() + " " + hrefurltxt);
|
||||
} else {
|
||||
Log.logInfo("OpenSearchConnector.Discover", "osd.xml check failed (no RSS or Atom support) for " + hrefurltxt);
|
||||
}
|
||||
}
|
||||
} catch (MalformedURLException ex) {
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
doloop = false;
|
||||
}
|
||||
}
|
||||
Log.logInfo("OpenSearchConnector.Discover", "finisched Solr query (checked " + Integer.toString(dblmem.size()) + " unique opensearchdescription links found in " + Long.toString(numfound) + " results)");
|
||||
} catch (IOException ex) {
|
||||
Log.logException(ex);
|
||||
}
|
||||
}
|
||||
};
|
||||
job.start();
|
||||
return true;
|
||||
}
|
||||
}
|
|
@ -23,38 +23,34 @@
|
|||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
|
||||
package net.yacy.document.parser.xml;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
|
||||
import javax.xml.parsers.ParserConfigurationException;
|
||||
import javax.xml.parsers.SAXParser;
|
||||
import javax.xml.parsers.SAXParserFactory;
|
||||
|
||||
import net.yacy.cora.document.UTF8;
|
||||
import net.yacy.cora.protocol.ClientIdentification;
|
||||
import net.yacy.cora.protocol.http.HTTPClient;
|
||||
import net.yacy.kelondro.logging.Log;
|
||||
import net.yacy.kelondro.util.ByteBuffer;
|
||||
|
||||
import org.xml.sax.Attributes;
|
||||
import org.xml.sax.SAXException;
|
||||
import org.xml.sax.helpers.DefaultHandler;
|
||||
|
||||
|
||||
/*
|
||||
* reads opensearchdescription xml document and provides the parsed search url
|
||||
* templates via getter methods as well as all other tags by getItem(tagname)
|
||||
*/
|
||||
public class opensearchdescriptionReader extends DefaultHandler {
|
||||
|
||||
// statics for item generation and automatic categorization
|
||||
static int guidcount = 0;
|
||||
//private static final String recordTag = "OpenSearchDescription";
|
||||
private static final String[] tagsDef = new String[]{
|
||||
"ShortName",
|
||||
"LongName",
|
||||
"Image",
|
||||
// "Image",
|
||||
"Language",
|
||||
"OutputEncoding",
|
||||
"InputEncoding",
|
||||
|
@ -97,35 +93,32 @@ public class opensearchdescriptionReader extends DefaultHandler {
|
|||
}
|
||||
|
||||
// class variables
|
||||
private Item channel;
|
||||
private final StringBuilder buffer;
|
||||
private boolean parsingChannel;
|
||||
private final String imageURL;
|
||||
private final ArrayList<String> itemsGUID; // a list of GUIDs, so the items can be retrieved by a specific order
|
||||
private final HashMap<String, Item> items; // a guid:Item map
|
||||
|
||||
private boolean parsingDescription, parsingTextValue;
|
||||
private final HashMap<String, String> items; // Opensearchdescription Item map
|
||||
private String rssurl, atomurl; // search url templates
|
||||
|
||||
public opensearchdescriptionReader() {
|
||||
this.itemsGUID = new ArrayList<String>();
|
||||
this.items = new HashMap<String, Item>();
|
||||
this.items = new HashMap<String, String>();
|
||||
this.buffer = new StringBuilder();
|
||||
this.channel = null;
|
||||
this.parsingChannel = false;
|
||||
this.imageURL = null;
|
||||
this.parsingDescription = false;
|
||||
this.parsingTextValue = false;
|
||||
this.rssurl = null;
|
||||
this.atomurl = null;
|
||||
}
|
||||
|
||||
private static final ThreadLocal<SAXParser> tlSax = new ThreadLocal<SAXParser>();
|
||||
private static SAXParser getParser() throws SAXException {
|
||||
SAXParser parser = tlSax.get();
|
||||
if (parser == null) {
|
||||
try {
|
||||
parser = SAXParserFactory.newInstance().newSAXParser();
|
||||
} catch (ParserConfigurationException e) {
|
||||
throw new SAXException(e.getMessage(), e);
|
||||
}
|
||||
tlSax.set(parser);
|
||||
}
|
||||
return parser;
|
||||
SAXParser parser = tlSax.get();
|
||||
if (parser == null) {
|
||||
try {
|
||||
parser = SAXParserFactory.newInstance().newSAXParser();
|
||||
} catch (ParserConfigurationException e) {
|
||||
throw new SAXException(e.getMessage(), e);
|
||||
}
|
||||
tlSax.set(parser);
|
||||
}
|
||||
return parser;
|
||||
}
|
||||
|
||||
public opensearchdescriptionReader(final String path) {
|
||||
|
@ -148,102 +141,105 @@ public class opensearchdescriptionReader extends DefaultHandler {
|
|||
}
|
||||
}
|
||||
|
||||
public static opensearchdescriptionReader parse(final byte[] a) {
|
||||
|
||||
// check integrity of array
|
||||
if ((a == null) || (a.length == 0)) {
|
||||
Log.logWarning("opensearchdescriptionReader", "response=null");
|
||||
return null;
|
||||
}
|
||||
if (a.length < 100) {
|
||||
Log.logWarning("opensearchdescriptionReader", "response=" + UTF8.String(a));
|
||||
return null;
|
||||
}
|
||||
if (!ByteBuffer.equals(a, UTF8.getBytes("<?xml"))) {
|
||||
Log.logWarning("opensearchdescriptionReader", "response does not contain valid xml");
|
||||
return null;
|
||||
}
|
||||
final String end = UTF8.String(a, a.length - 10, 10);
|
||||
if (end.indexOf("rss",0) < 0) {
|
||||
Log.logWarning("opensearchdescriptionReader", "response incomplete");
|
||||
return null;
|
||||
}
|
||||
|
||||
// make input stream
|
||||
final ByteArrayInputStream bais = new ByteArrayInputStream(a);
|
||||
|
||||
// parse stream
|
||||
opensearchdescriptionReader reader = null;
|
||||
/**
 * Creates a reader and fills it from the document found at the given address.
 * Calls the no-argument constructor, requests path through an HTTPClient built with the
 * given timeout, and feeds the response stream to the SAX parser with this object as handler.
 * Any exception is passed to Log.logException and not rethrown.
 *
 * @param path    address handed to HTTPClient.GET
 * @param timeout value handed to the HTTPClient constructor (unit not visible here - TODO confirm)
 */
public opensearchdescriptionReader(final String path, int timeout) {
|
||||
this();
|
||||
try {
|
||||
// NOTE(review): neither 'reader' nor 'bais' is declared in this constructor; this line looks
// like residue of the parse(byte[]) code shown earlier in the listing - confirm.
reader = new opensearchdescriptionReader(bais);
|
||||
HTTPClient www = new HTTPClient(ClientIdentification.getUserAgent(), timeout);
|
||||
www.GET(path);
|
||||
final SAXParser saxParser = getParser();
|
||||
saxParser.parse(www.getContentstream(), this);
|
||||
// NOTE(review): finish() is not in a finally block, so it is skipped when GET or parse throws.
www.finish();
|
||||
} catch (final Exception e) {
|
||||
Log.logException(e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Resets the collected state and parses the document found at the given address.
 * Clears items and buffer, resets both parsing flags and both template urls, then requests
 * path through an HTTPClient (constructed with 1000 as second argument) and runs the SAX
 * parser with this object as handler.
 *
 * @param path address handed to HTTPClient.GET
 * @return true when the parse call completed; false when a SAXException or IOException was
 *         caught around the parse call
 */
public boolean read(String path) {
|
||||
this.items.clear();
|
||||
this.buffer.setLength(0);
|
||||
this.parsingDescription = false;
|
||||
this.parsingTextValue = false;
|
||||
this.rssurl = null;
|
||||
this.atomurl = null;
|
||||
try {
|
||||
HTTPClient www = new HTTPClient(ClientIdentification.getUserAgent(), 1000);
|
||||
www.GET(path);
|
||||
final SAXParser saxParser = getParser();
|
||||
try {
|
||||
saxParser.parse(www.getContentstream(), this);
|
||||
} catch (final SAXException se) {
|
||||
// parse failures are not logged here; the client is finished and false is returned
www.finish();
|
||||
return false;
|
||||
} catch (final IOException ioe) {
|
||||
www.finish();
|
||||
return false;
|
||||
}
|
||||
www.finish();
|
||||
return true;
|
||||
} catch (final Exception e) {
|
||||
// NOTE(review): reached when GET or getParser() throws; finish() is not called on this path.
Log.logWarning("opensearchdescriptionReader", "parse exception: " + e);
|
||||
// NOTE(review): two returns in a row, and 'null' does not fit a boolean return type; this
// looks like the old and new side of a diff shown together - confirm which one is current.
return null;
|
||||
return false;
|
||||
}
|
||||
// NOTE(review): 'bais' and 'reader' are not declared in this method and every path above
// already returns; presumably leftovers of the parse(byte[]) code - confirm.
try { bais.close(); } catch (final IOException e) {}
|
||||
return reader;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void startElement(final String uri, final String name, final String tag, final Attributes atts) throws SAXException {
|
||||
if ("channel".equals(tag)) {
|
||||
this.channel = new Item();
|
||||
this.parsingChannel = true;
|
||||
if ("OpenSearchDescription".equals(tag)) {
|
||||
this.parsingDescription = true;
|
||||
} else if (this.parsingDescription) {
|
||||
if ("Url".equals(tag)) {
|
||||
this.parsingTextValue = false;
|
||||
String type = atts.getValue("type");
|
||||
if ("application/rss+xml".equals(type)) {
|
||||
rssurl = atts.getValue("template");
|
||||
} else if ("application/atom+xml".equals(type)) {
|
||||
atomurl = atts.getValue("template");
|
||||
}
|
||||
} else {
|
||||
this.parsingTextValue = tags.contains(tag);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void endElement(final String uri, final String name, final String tag) {
|
||||
if (tag == null) return;
|
||||
if ("channel".equals(tag)) {
|
||||
this.parsingChannel = false;
|
||||
} else if (this.parsingChannel) {
|
||||
if (tag == null) return;
|
||||
if (parsingDescription && "OpenSearchDescription".equals(tag)) {
|
||||
this.parsingDescription = false;
|
||||
} else if (this.parsingTextValue) {
|
||||
final String value = this.buffer.toString().trim();
|
||||
this.buffer.setLength(0);
|
||||
if (tags.contains(tag)) this.channel.setValue(tag, value);
|
||||
if (tags.contains(tag)) {
|
||||
this.items.put(tag, value);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void characters(final char ch[], final int start, final int length) {
|
||||
if (this.parsingChannel) {
|
||||
if (this.parsingTextValue) {
|
||||
this.buffer.append(ch, start, length);
|
||||
}
|
||||
}
|
||||
|
||||
public Item getChannel() {
|
||||
return this.channel;
|
||||
/**
 * Returns the rssurl field. The visible startElement code assigns it from the "template"
 * attribute of a Url element whose type is "application/rss+xml"; read() resets it to null.
 *
 * @return the stored RSS template, possibly null
 */
public String getRSSTemplate() {
|
||||
return this.rssurl;
|
||||
}
|
||||
|
||||
public Item getItem(final int i) {
|
||||
// retrieve item by order number
|
||||
return getItem(this.itemsGUID.get(i));
|
||||
/**
 * Returns rssurl when it is set, otherwise atomurl.
 *
 * @return rssurl if non-null, else atomurl (which may itself be null)
 */
public String getRSSorAtomUrl() {
|
||||
return this.rssurl == null ? this.atomurl : this.rssurl;
|
||||
}
|
||||
|
||||
public Item getItem(final String guid) {
|
||||
// retrieve item by guid
|
||||
return this.items.get(guid);
|
||||
/**
 * Looks up the entry stored in items under the key "ShortName".
 *
 * @return the value items holds for "ShortName"
 */
public String getShortName() {
|
||||
return items.get("ShortName");
|
||||
}
|
||||
|
||||
/**
 * Looks up a collected value in items by its key.
 *
 * @param name key to look up; the visible endElement code stores values under the tag name
 * @return the value items holds for name
 */
public String getItem(final String name) {
|
||||
// retrieve item by name
|
||||
return this.items.get(name);
|
||||
}
|
||||
|
||||
/**
 * Reports how many entries items currently holds.
 *
 * @return the size of items
 */
public int items() {
|
||||
return this.items.size();
|
||||
}
|
||||
|
||||
/**
 * Returns the imageURL field.
 * NOTE(review): the only visible write to imageURL sets it to null - confirm whether it is
 * ever populated elsewhere.
 *
 * @return the stored image url, possibly null
 */
public String getImage() {
|
||||
return this.imageURL;
|
||||
}
|
||||
|
||||
/**
 * Simple name/value container backed by a HashMap. Each new instance starts with a "guid"
 * entry built from the current time in hex and a running counter.
 */
public static class Item {
|
||||
|
||||
// storage for all name/value pairs of this item, including the generated "guid"
private final HashMap<String, String> map;
|
||||
|
||||
/**
 * Creates the backing map and stores a generated "guid" entry in it.
 */
public Item() {
|
||||
this.map = new HashMap<String, String>();
|
||||
// guidcount is declared outside this class and is incremented on every construction
this.map.put("guid", Long.toHexString(System.currentTimeMillis()) + ":" + guidcount++);
|
||||
}
|
||||
|
||||
/**
 * Stores value under name, replacing any value stored under that name before.
 *
 * @param name  key of the entry
 * @param value value to store
 */
public void setValue(final String name, final String value) {
|
||||
this.map.put(name, value);
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user