/** * AutoSearch.java * Copyright 2015 by Burkhard Buelte * First released 09.01.2015 at http://yacy.net * * This is a part of YaCy, a peer-to-peer based web search engine * * LICENSE * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program in the file lgpl21.txt * If not, see . */ package net.yacy.search; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; import java.util.ArrayList; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Properties; import java.util.Set; import net.yacy.cora.document.feed.RSSFeed; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.MultiProtocolURL; import static net.yacy.cora.federate.opensearch.SRURSSConnector.loadSRURSS; import net.yacy.cora.federate.solr.connector.RemoteSolrConnector; import net.yacy.cora.federate.solr.connector.SolrConnector; import net.yacy.cora.federate.solr.instance.RemoteInstance; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.util.ConcurrentLog; import net.yacy.data.BookmarksDB.Bookmark; import net.yacy.kelondro.workflow.AbstractBusyThread; import net.yacy.peers.Seed; import net.yacy.search.schema.CollectionSchema; import org.apache.solr.client.solrj.SolrQuery; import org.apache.solr.client.solrj.response.QueryResponse; import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrDocumentList; import org.apache.solr.common.params.CommonParams; /** * AutoSearch retrieves queries from Bookmarks or a property file (if existing) * and loops to a list of connected peers and asks each for results which are * added to the local index. */ public class AutoSearch extends AbstractBusyThread { private Set querystack; // serach query public String currentQuery = null; // current query private Set currentTargets = null; // peer hashes final Switchboard sb; public int gotresults; private long lastInitTime; // to recognize new data (Bookmarks) to import public AutoSearch(Switchboard xsb) { super(3000, 1000); // set lower limits of cycle delay this.setIdleSleep(60000); // set actual cycle delays this.setBusySleep(10000); this.sb = xsb; gotresults = 0; querystack = new HashSet(); this.lastInitTime = System.currentTimeMillis() - 600000; // init to now - 10 min if (!checkBookmarkDB()) { try { // check for old queries in temp property file File pfile = new File(xsb.dataPath, "DATA/SETTINGS/autosearch.conf"); if (pfile.exists()) { ConcurrentLog.info(AutoSearch.class.getName(), "read queries from file " + pfile.getAbsolutePath()); Properties prop = new Properties(); FileInputStream fileIn = new FileInputStream(pfile); prop.load(fileIn); if (prop.size() > 0) { Set all = prop.keySet(); for (Object s : all) { String query = prop.getProperty((String) s); if (query != null && !query.isEmpty()) { querystack.add(query); } } } fileIn.close(); } } catch (final IOException e) { ConcurrentLog.warn(AutoSearch.class.getName(), "Error reading config file"); } } } /** * Save current queries to a (temporary) property file to allow continue * after a restart. Existing file will be overwritten or deleted. */ private void saveasPropFile() { File pfile = new File(sb.dataPath, "DATA/SETTINGS/autosearch.conf"); if (querystack.size() == 0) { if (pfile.exists()) { pfile.delete(); } } else { try { Properties prop = new Properties(); for (String s : querystack) { prop.put("query" + s.hashCode(), s); } OutputStream fileOut = new FileOutputStream(pfile); prop.store(fileOut, "AutoSearch query list"); fileOut.close(); } catch (FileNotFoundException ex) { ConcurrentLog.warn(AutoSearch.class.getName(), "can not create file " + pfile.getAbsolutePath()); } catch (IOException ex) { ConcurrentLog.warn(AutoSearch.class.getName(), "IO error writing to file " + pfile.getAbsolutePath()); } } } /** * Get peers to query (peers connected) * * @return Set of peer hashes to contact */ private void initPeerList() { if (currentTargets == null) { currentTargets = new HashSet(); } // TODO: DHT peers could be excluded Iterator it = Switchboard.getSwitchboard().peers.seedsConnected(true, false, null, 0); while (it.hasNext()) { Seed s = it.next(); currentTargets.add(s.hash); } } /** * Check BookmarkDB for existing queries return true if new entry added to * query queue. Store queries in (temporary) property file * * @return true if new query from bookmark was added */ private boolean checkBookmarkDB() { int added = 0; Iterator it = Switchboard.getSwitchboard().bookmarksDB.getBookmarksIterator(); if (it != null) { while (it.hasNext()) { Bookmark bmk = it.next(); // get search bookmarks only if (bmk.getFoldersString().startsWith("/search")) { // take only new created or edited bookmarks if (bmk.getTimeStamp() >= this.lastInitTime) { final String query = bmk.getDescription(); if (!query.isEmpty() && query.startsWith("query=")) { { querystack.add(query.substring(6)); added++; ConcurrentLog.info(AutoSearch.class.getName(), "add query from Bookmarks " + query); } } } } } } if (added > 0) { this.lastInitTime = System.currentTimeMillis(); saveasPropFile(); return true; } else { return false; } } /** * Process query queue, select one query and peer to ask next * * @return true if something processed */ @Override public boolean job() { if (currentQuery == null && querystack != null && querystack.size() > 0) { currentQuery = querystack.iterator().next(); querystack.remove(currentQuery); // imediate remove to asure no repeat initPeerList(); // late initialization of peerlist to get currently connected } // ask next peer for search term if (currentQuery != null && !currentQuery.isEmpty()) { if (currentTargets != null && !currentTargets.isEmpty()) { while (currentTargets.size() > 0) { // loop only to skip disconnected peers String peerhash = currentTargets.iterator().next(); currentTargets.remove(peerhash); Seed seed = Switchboard.getSwitchboard().peers.getConnected(peerhash); if (seed != null) { processSingleTarget(seed); return true; // just one query per busycycle is intended } } } currentQuery = null; } // no search targets checkBookmarkDB(); // TODO: do idle processing // analyse content of local index // extend search with learned new search terms // follow most promising links ConcurrentLog.fine(AutoSearch.class.getName(), "nothing to do"); return this.querystack.size() > 0; } /** * Calls one peer for search results of the current query and adds it to the * local index. Depending on peers SolrAvailable flag the a solr query or * opensearch/rss query is used. * * @param seed the peer to ask */ private void processSingleTarget(Seed seed) { ConcurrentLog.fine(AutoSearch.class.getName(), "ask " + seed.getIP() + " " + seed.getName() + " for query=" + currentQuery); if (seed.getFlagSolrAvailable()) { // do a solr query SolrDocumentList docList = null; SolrQuery solrQuery = new SolrQuery(); // use remote defaults and ranking (to query their index right) solrQuery.set(CommonParams.Q, currentQuery + " AND (" + CollectionSchema.httpstatus_i.name() + ":200)"); // except this yacy special solrQuery.set("q.op", "AND"); // except ... no one word matches please solrQuery.set(CommonParams.ROWS, "20"); this.setName("Protocol.solrQuery(" + solrQuery.getQuery() + " to " + seed.hash + ")"); try { RemoteInstance instance = new RemoteInstance("http://" + seed.getPublicAddress(seed.getIP()) + "/solr/", null, null, 10000); // this is a 'patch configuration' which considers 'solr' as default collection try { SolrConnector solrConnector = new RemoteSolrConnector(instance, true, null); if (!solrConnector.isClosed()) { try { QueryResponse rsp = solrConnector.getResponseByParams(solrQuery); docList = rsp.getResults(); } catch (Throwable e) { } finally { solrConnector.close(); } } } catch (Throwable ee) { } finally { instance.close(); } if (docList != null) { for (SolrDocument d : docList) { sb.index.fulltext().putDocument(sb.index.fulltext().getDefaultConfiguration().toSolrInputDocument(d)); this.gotresults++; } ConcurrentLog.info(AutoSearch.class.getName(), "added " + docList.size() + " results from " + seed.getName() + " to index for solrquery=" + currentQuery); } } catch (Throwable eee) { } } else { // do a yacysearch.rss query final String rssSearchServiceURL = "http://" + seed.getPublicAddress(seed.getIP()) + "/yacysearch.rss"; try { RSSFeed feed = loadSRURSS( rssSearchServiceURL, currentQuery, 0, 20, CacheStrategy.IFFRESH, false, // just local, as we ask others too ClientIdentification.yacyInternetCrawlerAgent); final List urls = new ArrayList(); for (final MultiProtocolURL entry : feed.getLinks()) { urls.add(new DigestURL(entry, (byte[]) null)); this.gotresults++; } sb.addToIndex(urls, null, "AutoSearch", null, true); ConcurrentLog.info(AutoSearch.class.getName(), "added " + urls.size() + " results from " + seed.getName() + " to index for query=" + currentQuery); } catch (IOException ex) { ConcurrentLog.info(AutoSearch.class.getName(), "no answer from " + seed.getName()); } } } /** * Estimate of queries to perform */ @Override public int getJobCount() { if (currentTargets != null) { int cnt = currentTargets.size(); cnt += querystack.size() * sb.peers.sizeConnected(); return cnt; } else { return 0; } } @Override public void freemem() { } @Override public void close() { this.saveasPropFile(); // saves or deletes property file with queries } }