From 54e77e6255be45ff32349bf1c9e2a42140352e82 Mon Sep 17 00:00:00 2001 From: orbiter Date: Mon, 10 Jan 2011 08:40:41 +0000 Subject: [PATCH] refactoring git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7426 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/yacysearch_location.java | 4 +- source/de/anomic/yacy/yacyClient.java | 4 +- source/net/yacy/cora/document/RSSFeed.java | 18 + source/net/yacy/cora/document/RSSMessage.java | 56 +++- source/net/yacy/cora/services/Search.java | 313 ------------------ .../yacy/cora/services/SearchAccumulator.java | 40 +++ source/net/yacy/cora/services/SearchHub.java | 164 +++++++++ .../net/yacy/cora/services/SearchSRURSS.java | 201 +++++++++++ .../net/yacy/document/SnippetExtractor.java | 3 +- 9 files changed, 466 insertions(+), 337 deletions(-) delete mode 100644 source/net/yacy/cora/services/Search.java create mode 100644 source/net/yacy/cora/services/SearchAccumulator.java create mode 100644 source/net/yacy/cora/services/SearchHub.java create mode 100644 source/net/yacy/cora/services/SearchSRURSS.java diff --git a/htroot/yacysearch_location.java b/htroot/yacysearch_location.java index 5820f6243..74d9bbe92 100644 --- a/htroot/yacysearch_location.java +++ b/htroot/yacysearch_location.java @@ -26,7 +26,7 @@ import java.util.concurrent.TimeUnit; import net.yacy.cora.document.RSSMessage; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; -import net.yacy.cora.services.Search; +import net.yacy.cora.services.SearchSRURSS; import net.yacy.document.geolocalization.Location; import de.anomic.data.LibraryProvider; import de.anomic.search.Switchboard; @@ -93,7 +93,7 @@ public class yacysearch_location { // get a queue of search results String rssSearchServiceURL = "http://localhost:" + sb.getConfig("port", "8080") + "/yacysearch.rss"; BlockingQueue results = new LinkedBlockingQueue(); - Search.searchSRURSS(rssSearchServiceURL, query, maximumTime, Integer.MAX_VALUE, false, false, results); + 
SearchSRURSS.searchSRURSS(results, rssSearchServiceURL, query, maximumTime, Integer.MAX_VALUE, false, false); // take the results and compute some locations RSSMessage message; diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index 30ad547d7..2c9ba9002 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -62,7 +62,7 @@ import net.yacy.cora.document.RSSFeed; import net.yacy.cora.document.RSSMessage; import net.yacy.cora.document.RSSReader; import net.yacy.cora.protocol.http.HTTPConnector; -import net.yacy.cora.services.Search; +import net.yacy.cora.services.SearchSRURSS; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.WordReference; @@ -372,7 +372,7 @@ public final class yacyClient { public static RSSFeed search(final yacySeed targetSeed, String query, boolean verify, boolean global, long timeout, int startRecord, int maximumRecords) throws IOException { String address = (targetSeed == null || targetSeed == Switchboard.getSwitchboard().peers.mySeed()) ? 
"localhost:" + Switchboard.getSwitchboard().getConfig("port", "8080") : targetSeed.getClusterAddress(); String urlBase = "http://" + address + "/yacysearch.rss"; - return Search.loadSRURSS(urlBase, query, timeout, startRecord, maximumRecords, verify, global); + return SearchSRURSS.loadSRURSS(urlBase, query, timeout, startRecord, maximumRecords, verify, global); } @SuppressWarnings("unchecked") diff --git a/source/net/yacy/cora/document/RSSFeed.java b/source/net/yacy/cora/document/RSSFeed.java index 9a9017969..9325ac385 100644 --- a/source/net/yacy/cora/document/RSSFeed.java +++ b/source/net/yacy/cora/document/RSSFeed.java @@ -45,6 +45,24 @@ public class RSSFeed implements Iterable { this.maxsize = maxsize; } + /** + * make a RSS feed using a set of urls + * the source string is assigned to all messages as author to mark the messages' origin + * @param links + * @param source + */ + public RSSFeed(Set links, String source) { + this(Integer.MAX_VALUE); + String u; + RSSMessage message; + for (MultiProtocolURI uri: links) { + u = uri.toNormalform(true, false); + message = new RSSMessage(u, "", u); + message.setAuthor(source); + this.addMessage(message); + } + } + public void setChannel(final RSSMessage channelItem) { this.channel = channelItem; } diff --git a/source/net/yacy/cora/document/RSSMessage.java b/source/net/yacy/cora/document/RSSMessage.java index df751ea1b..1182f3498 100644 --- a/source/net/yacy/cora/document/RSSMessage.java +++ b/source/net/yacy/cora/document/RSSMessage.java @@ -21,6 +21,7 @@ package net.yacy.cora.document; import java.text.ParseException; +import java.util.Comparator; import java.util.Date; import java.util.HashSet; import java.util.Map; @@ -31,7 +32,7 @@ import net.yacy.cora.date.GenericFormatter; import net.yacy.cora.date.ISO8601Formatter; import net.yacy.cora.protocol.HeaderFramework; -public class RSSMessage implements Hit { +public class RSSMessage implements Hit, Comparable, Comparator { public static enum Token { @@ -58,13 +59,13 
@@ public class RSSMessage implements Hit { for (String s: k) this.keys.add(s); } - public String valueFrom(Map map) { + public String valueFrom(Map map, String dflt) { String value; for (String key: this.keys) { value = map.get(key); if (value != null) return value; } - return ""; + return dflt; } public Set keys() { @@ -107,31 +108,50 @@ public class RSSMessage implements Hit { } public String getTitle() { - return Token.title.valueFrom(this.map); + return Token.title.valueFrom(this.map, ""); } public String getLink() { - return Token.link.valueFrom(this.map); + return Token.link.valueFrom(this.map, ""); + } + + public boolean equals(Object o) { + return (o instanceof RSSMessage) && ((RSSMessage) o).getLink().equals(this.getLink()); + } + + public int hashCode() { + return getLink().hashCode(); + } + + @Override + public int compareTo(RSSMessage o) { + if (!(o instanceof RSSMessage)) return 1; + return this.getLink().compareTo(o.getLink()); + } + + @Override + public int compare(RSSMessage o1, RSSMessage o2) { + return o1.compareTo(o2); } public String getDescription() { - return Token.description.valueFrom(this.map); + return Token.description.valueFrom(this.map, ""); } public String getAuthor() { - return Token.author.valueFrom(this.map); + return Token.author.valueFrom(this.map, ""); } public String getCopyright() { - return Token.copyright.valueFrom(this.map); + return Token.copyright.valueFrom(this.map, ""); } public String getCategory() { - return Token.category.valueFrom(this.map); + return Token.category.valueFrom(this.map, ""); } public String[] getSubject() { - String subject = Token.subject.valueFrom(this.map); + String subject = Token.subject.valueFrom(this.map, ""); if (subject.indexOf(',') >= 0) return subject.split(","); if (subject.indexOf(';') >= 0) return subject.split(";"); if (subject.indexOf('|') >= 0) return subject.split("|"); @@ -139,15 +159,15 @@ public class RSSMessage implements Hit { } public String getReferrer() { - return 
Token.referrer.valueFrom(this.map); + return Token.referrer.valueFrom(this.map, ""); } public String getLanguage() { - return Token.language.valueFrom(this.map); + return Token.language.valueFrom(this.map, ""); } public Date getPubDate() { - String dateString = Token.pubDate.valueFrom(this.map); + String dateString = Token.pubDate.valueFrom(this.map, ""); Date date; try { date = ISO8601Formatter.FORMATTER.parse(dateString); @@ -162,20 +182,20 @@ public class RSSMessage implements Hit { } public String getGuid() { - return Token.guid.valueFrom(this.map); + return Token.guid.valueFrom(this.map, ""); } public String getTTL() { - return Token.ttl.valueFrom(this.map); + return Token.ttl.valueFrom(this.map, ""); } public String getDocs() { - return Token.docs.valueFrom(this.map); + return Token.docs.valueFrom(this.map, ""); } public long getSize() { - String size = Token.size.valueFrom(this.map); - return (size == null || size.length() == 0) ? 0 : Long.parseLong(size); + String size = Token.size.valueFrom(this.map, "-1"); + return (size == null || size.length() == 0) ? -1 : Long.parseLong(size); } public String getFulltext() { diff --git a/source/net/yacy/cora/services/Search.java b/source/net/yacy/cora/services/Search.java deleted file mode 100644 index 83291d343..000000000 --- a/source/net/yacy/cora/services/Search.java +++ /dev/null @@ -1,313 +0,0 @@ -/** - * Search - * Copyright 2010 by Michael Peter Christen - * First released 25.05.2010 at http://yacy.net - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this program in the file lgpl21.txt - * If not, see . - */ - -package net.yacy.cora.services; - -import java.io.IOException; -import java.net.MalformedURLException; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.LinkedHashMap; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.concurrent.BlockingQueue; -import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.LinkedBlockingQueue; -import java.util.concurrent.TimeUnit; -import java.util.regex.Pattern; - -import net.yacy.cora.document.MultiProtocolURI; -import net.yacy.cora.document.RSSFeed; -import net.yacy.cora.document.RSSMessage; -import net.yacy.cora.document.RSSReader; -import net.yacy.cora.protocol.HeaderFramework; -import net.yacy.cora.protocol.RequestHeader; -import net.yacy.cora.protocol.http.HTTPClient; -import net.yacy.cora.protocol.http.HTTPConnector; -import net.yacy.cora.protocol.http.LinkExtractor; -import net.yacy.cora.storage.ScoreMap; - -import org.apache.http.entity.mime.content.ContentBody; -import org.apache.http.entity.mime.content.StringBody; - -public class Search extends Thread { - - private final static int recordsPerSession = 10; - - public static final String[] SRURSSServicesList = { - "http://yacy.dyndns.org:8000/yacysearch.rss", - "http://yacy.caloulinux.net:8085/yacysearch.rss", - "http://algire.dyndns.org:8085/yacysearch.rss", - "http://breyvogel.dyndns.org:8002/yacysearch.rss" - }; - - public static final String[] genericServicesList = { - "http://www.scroogle.org/cgi-bin/nbbw.cgi?Gw=$&n=2", - "http://blekko.com/ws/$+/rss", - "http://www.bing.com/search?q=$&format=rss", - "http://search.twitter.com/search.atom?q=$" - }; - - public static Thread accumulateSRURSS( - final String urlBase, - final String query, - final long timeoutInit, - final int 
maximumRecordsInit, - final boolean verify, - final boolean global, - final Map> result) { - Thread t = new Thread() { - BlockingQueue results = new LinkedBlockingQueue(); - public void run() { - searchSRURSS(urlBase, query, timeoutInit, maximumRecordsInit, verify, global, results); - int p = 1; - RSSMessage message; - try { - while ((message = results.poll(timeoutInit, TimeUnit.MILLISECONDS)) != RSSMessage.POISON) { - MultiProtocolURI uri; - if (message == null) break; - try { - uri = new MultiProtocolURI(message.getLink()); - List m = result.get(uri); - if (m == null) m = new ArrayList(); - m.add(new Integer(p++)); - result.put(uri, m); - } catch (MalformedURLException e) { - e.printStackTrace(); - } - } - } catch (InterruptedException e) { - e.printStackTrace(); - } - } - }; - t.start(); - return t; - } - - public static Thread searchSRURSS( - final String urlBase, - final String query, - final long timeoutInit, - final int maximumRecordsInit, - final boolean verify, - final boolean global, - final BlockingQueue queue) { - Thread job = new Thread() { - public void run() { - int startRecord = 0; - RSSMessage message; - int maximumRecords = maximumRecordsInit; - long timeout = timeoutInit; - mainloop: while (timeout > 0 && maximumRecords > 0) { - long st = System.currentTimeMillis(); - RSSFeed feed; - try { - feed = loadSRURSS(urlBase, query, timeout, startRecord, recordsPerSession, verify, global); - } catch (IOException e1) { - break mainloop; - } - if (feed == null || feed.isEmpty()) break mainloop; - maximumRecords -= feed.size(); - innerloop: while (!feed.isEmpty()) { - message = feed.pollMessage(); - if (message == null) break innerloop; - try { - queue.put(message); - } catch (InterruptedException e) { - break innerloop; - } - } - startRecord += recordsPerSession; - timeout -= System.currentTimeMillis() - st; - } - try { queue.put(RSSMessage.POISON); } catch (InterruptedException e) {} - } - }; - job.start(); - return job; - } - - /** - * send a query to a 
yacy public search interface - * @param rssSearchServiceURL the target url base (everything before the ? that follows the SRU request syntax properties). can null, then the local peer is used - * @param query the query as string - * @param startRecord number of first record - * @param maximumRecords maximum number of records - * @param verify if true, result entries are verified using the snippet fetch (slow); if false simply the result is returned - * @param global if true also search results from other peers are included - * @param timeout milliseconds that are waited at maximum for a search result - * @return - */ - public static RSSFeed loadSRURSS( - String rssSearchServiceURL, - String query, - long timeout, - int startRecord, - int maximumRecords, - boolean verify, - boolean global) throws IOException { - MultiProtocolURI uri = null; - try { - uri = new MultiProtocolURI(rssSearchServiceURL); - } catch (MalformedURLException e) { - throw new IOException("cora.Search failed asking peer '" + rssSearchServiceURL + "': bad url, " + e.getMessage()); - } - - // send request - try { - final LinkedHashMap parts = new LinkedHashMap(); - parts.put("query", new StringBody(query)); - parts.put("startRecord", new StringBody(Integer.toString(startRecord))); - parts.put("maximumRecords", new StringBody(Long.toString(maximumRecords))); - parts.put("verify", new StringBody(verify ? "true" : "false")); - parts.put("resource", new StringBody(global ? 
"global" : "local")); - final byte[] result = HTTPConnector.getConnector(MultiProtocolURI.yacybotUserAgent).post(new MultiProtocolURI(rssSearchServiceURL), (int) timeout, uri.getHost(), parts); - //String debug = new String(result); System.out.println("*** DEBUG: " + debug); - final RSSReader reader = RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, result); - if (reader == null) { - throw new IOException("cora.Search failed asking peer '" + uri.getHost() + "': probably bad response from remote peer (1), reader == null"); - } - final RSSFeed feed = reader.getFeed(); - if (feed == null) { - // case where the rss reader does not understand the content - throw new IOException("cora.Search failed asking peer '" + uri.getHost() + "': probably bad response from remote peer (2)"); - } - return feed; - } catch (final IOException e) { - throw new IOException("cora.Search error asking peer '" + uri.getHost() + "':" + e.toString()); - } - } - - public static Thread accumulateGeneric( - String query, - String service, - final Map> result, - final int timeout) { - query = query.replace(' ', '+'); - final String servicePatched = service.replaceAll("\\$", query); - Thread t = new Thread() { - public void run() { - try { - MultiProtocolURI[] sr = loadGeneric(new MultiProtocolURI(servicePatched), timeout); - int p = 1; - for (MultiProtocolURI u: sr) { - List m = result.get(u); - if (m == null) m = new ArrayList(); - m.add(new Integer(p++)); - result.put(u, m); - } - } catch (MalformedURLException e) { - e.printStackTrace(); - } catch (IOException e) { - e.printStackTrace(); - } - } - }; - t.start(); - return t; - } - - private static MultiProtocolURI[] loadGeneric(MultiProtocolURI uri, long timeout) throws IOException { - final RequestHeader requestHeader = new RequestHeader(); - requestHeader.put(HeaderFramework.USER_AGENT, MultiProtocolURI.yacybotUserAgent); - final HTTPClient client = new HTTPClient(); - client.setTimout((int) timeout); - client.setHeader(requestHeader.entrySet()); - 
byte[] result = client.GETbytes(uri.toString()); - client.finish(); - if (client.getStatusCode() != 200) { - throw new IOException("Server returned status: " + client.getHttpResponse().getStatusLine()); - } - if (result == null) throw new IOException("cora.Search error asking peer '" + uri.getHost() + "': null"); - LinkExtractor le = new LinkExtractor(Pattern.compile(".*" + uri.getHost() + ".*")); - le.scrape(new String(result)); - MultiProtocolURI[] links = le.getLinks(); - return links; - } - - public static RSSFeed links2feed(Set links, String source) { - RSSFeed feed = new RSSFeed(Integer.MAX_VALUE); - String u; - RSSMessage message; - for (MultiProtocolURI uri: links) { - u = uri.toNormalform(true, false); - message = new RSSMessage(u, "", u); - message.setAuthor(source); - feed.addMessage(message); - } - return feed; - } - - private Map> result; - private String query; - private int count; - private String[] yacyServices, rssServices, genericServices; - private List threads; - - public Search(String query, int count, String[] rssServices, String[] genericServices) { - this.result = new ConcurrentHashMap>(); - this.query = query; - this.count = count; - this.yacyServices = yacyServices; - this.rssServices = rssServices; - this.genericServices = genericServices; - this.threads = new ArrayList(); - } - - public void run() { - for (String service: this.rssServices) threads.add(accumulateSRURSS(service, this.query, 10000, this.count, false, true, this.result)); - for (String service: this.genericServices) threads.add(accumulateGeneric(this.query, service, this.result, 10000)); - } - - public ScoreMap getResults() { - ScoreMap scores = new ScoreMap(); - int m = this.rssServices.length + this.genericServices.length; - for (Map.Entry> entry: this.result.entrySet()) { - int a = 0; - for (Integer i : entry.getValue()) a += i.intValue(); - scores.inc(entry.getKey(), a * m / entry.getValue().size()); - } - return scores; - } - - public void waitTermination() { - for 
(Thread t: threads) try {t.join();} catch (InterruptedException e) {} - } - - public static void main(String[] args) { - StringBuilder sb = new StringBuilder(); - for (String s: args) sb.append(s).append(' '); - String query = sb.toString().trim(); - Search search = new Search(query, 100, SRURSSServicesList, genericServicesList); - search.start(); - try {Thread.sleep(100);} catch (InterruptedException e1) {} - search.waitTermination(); - ScoreMap result = search.getResults(); - Iterator i = result.keys(true); - MultiProtocolURI u; - while (i.hasNext()) { - u = i.next(); - System.out.println("[" + result.get(u) + "] " + u.toNormalform(true, false)); - } - try {HTTPClient.closeConnectionManager();} catch (InterruptedException e) {} - } -} diff --git a/source/net/yacy/cora/services/SearchAccumulator.java b/source/net/yacy/cora/services/SearchAccumulator.java new file mode 100644 index 000000000..ddf2fb114 --- /dev/null +++ b/source/net/yacy/cora/services/SearchAccumulator.java @@ -0,0 +1,40 @@ +/** + * Accumulator + * Copyright 2010 by Michael Peter Christen + * First released 07.01.2011 at http://yacy.net + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . 
+ */ + +package net.yacy.cora.services; + +/** + * place-holder class to provide an object declaration for threads in Search object + */ +public interface SearchAccumulator extends Runnable { + + /** + * join this accumulator: wait until it terminates + * @throws InterruptedException + */ + public void join() throws InterruptedException; + + /** + * test if the accumulator is still running + * @return + */ + public boolean isAlive(); + +} diff --git a/source/net/yacy/cora/services/SearchHub.java b/source/net/yacy/cora/services/SearchHub.java new file mode 100644 index 000000000..fd3e27511 --- /dev/null +++ b/source/net/yacy/cora/services/SearchHub.java @@ -0,0 +1,164 @@ +/** + * Search + * Copyright 2010 by Michael Peter Christen + * First released 25.05.2010 at http://yacy.net + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see <http://www.gnu.org/licenses/>. 
+ */ + +package net.yacy.cora.services; + +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; + +import net.yacy.cora.document.RSSMessage; +import net.yacy.cora.protocol.http.HTTPClient; +import net.yacy.cora.storage.ScoreMap; + +public class SearchHub { + + private static final String[] SRURSSServicesList = { + "http://yacy.dyndns.org:8000/yacysearch.rss", + "http://yacy.caloulinux.net:8085/yacysearch.rss", + "http://algire.dyndns.org:8085/yacysearch.rss", + "http://breyvogel.dyndns.org:8002/yacysearch.rss" + }; + + public final static SearchHub EMPTY = new SearchHub("", 0); + + private String query; + private int timeout; + private List threads; + private Map> result; + + public SearchHub(final String query, final int timeout) { + this.query = query; + this.timeout = timeout; + this.threads = new ArrayList(); + this.result = new ConcurrentHashMap>(); + } + + /** + * get the result of the accumulation + * @return + */ + public Map> getAccumulation() { + return this.result; + } + + /** + * add an accumulator to the list of accumulation threads. + * this is mainly used for waitTermination() and isTerminated() + * @param a + */ + public void addAccumulator(SearchAccumulator a) { + this.threads.add(a); + } + + /** + * get the original query string + * @return + */ + public String getQuery() { + return this.query; + } + + /** + * get the given time-out of the search request + * @return + */ + public int getTimeout() { + return this.timeout; + } + + /** + * get the list of search results as scored map. + * The results are combined using their appearance positions. 
+ * Every time this method is called the list is re-computed to reflect the latest results + * @return a score map of urls + */ + public ScoreMap getResults() { + ScoreMap scores = new ScoreMap(); + int m = threads.size(); + for (Map.Entry> entry: this.result.entrySet()) { + int a = 0; + for (Integer i : entry.getValue()) a += i.intValue(); + scores.inc(entry.getKey().getLink(), a * m / entry.getValue().size()); + } + return scores; + } + + /** + * wait until all accumulation threads have terminated + */ + public void waitTermination() { + for (SearchAccumulator t: threads) try {t.join();} catch (InterruptedException e) {} + } + + /** + * return true if all accumulation threads have terminated + * @return + */ + public boolean isTerminated() { + for (SearchAccumulator t: threads) if (t.isAlive()) return false; + return true; + } + + /** + * return a hash code of the search hub. + * This is computed using only the query string because that identifies the object + */ + public int hashCode() { + return query.hashCode(); + } + + /** + * test method to add a list of SRU RSS services. 
+ * such services are provided by YaCy peers + * @param search + * @param rssServices + * @param count + * @param verify + * @param global + */ + public static void addSRURSSServices(SearchHub search, String[] rssServices, int count, boolean verify, boolean global) { + for (String service: rssServices) { + SearchSRURSS accumulator = new SearchSRURSS(search, service, count, verify, global); + accumulator.start(); + search.addAccumulator(accumulator); + } + } + + public static void main(String[] args) { + StringBuilder sb = new StringBuilder(); + for (String s: args) sb.append(s).append(' '); + String query = sb.toString().trim(); + SearchHub search = new SearchHub(query, 10000); + addSRURSSServices(search, SRURSSServicesList, 100, false, false); + try {Thread.sleep(100);} catch (InterruptedException e1) {} + search.waitTermination(); + ScoreMap result = search.getResults(); + Iterator i = result.keys(true); + String u; + while (i.hasNext()) { + u = i.next(); + System.out.println("[" + result.get(u) + "] " + u); + } + try {HTTPClient.closeConnectionManager();} catch (InterruptedException e) {} + } +} diff --git a/source/net/yacy/cora/services/SearchSRURSS.java b/source/net/yacy/cora/services/SearchSRURSS.java new file mode 100644 index 000000000..a208468e1 --- /dev/null +++ b/source/net/yacy/cora/services/SearchSRURSS.java @@ -0,0 +1,201 @@ +/** + * AccumulateSRURSS + * Copyright 2010 by Michael Peter Christen + * First released 06.01.2011 at http://yacy.net + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . + */ + +package net.yacy.cora.services; + +import java.io.IOException; +import java.net.MalformedURLException; +import java.util.ArrayList; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.BlockingQueue; +import java.util.concurrent.LinkedBlockingQueue; +import java.util.concurrent.TimeUnit; + +import org.apache.http.entity.mime.content.ContentBody; +import org.apache.http.entity.mime.content.StringBody; + +import net.yacy.cora.document.MultiProtocolURI; +import net.yacy.cora.document.RSSFeed; +import net.yacy.cora.document.RSSMessage; +import net.yacy.cora.document.RSSReader; +import net.yacy.cora.protocol.http.HTTPConnector; + +public class SearchSRURSS extends Thread implements SearchAccumulator { + + private final static int recordsPerSession = 10; + + final String urlBase; + final String query; + final long timeoutInit; + final int maximumRecordsInit; + final boolean verify; + final boolean global; + final Map> result; + + private final BlockingQueue results; + + public SearchSRURSS( + final Map> result, + final String query, + final long timeoutInit, + final String urlBase, + final int maximumRecordsInit, + final boolean verify, + final boolean global) { + this.results = new LinkedBlockingQueue(); + this.result = result; + this.query = query; + this.timeoutInit = timeoutInit; + this.urlBase = urlBase; + this.maximumRecordsInit = maximumRecordsInit; + this.verify = verify; + this.global = global; + } + + public SearchSRURSS( + final SearchHub search, + final String urlBase, + final int maximumRecordsInit, + final boolean verify, + final boolean global) { + this.results = new LinkedBlockingQueue(); + this.result = search.getAccumulation(); + this.query = search.getQuery(); + this.timeoutInit 
= search.getTimeout(); + this.urlBase = urlBase; + this.maximumRecordsInit = maximumRecordsInit; + this.verify = verify; + this.global = global; + } + + public void run() { + searchSRURSS(results, urlBase, query, timeoutInit, maximumRecordsInit, verify, global); + int p = 1; + RSSMessage message; + try { + while ((message = results.poll(timeoutInit, TimeUnit.MILLISECONDS)) != RSSMessage.POISON) { + if (message == null) break; + List m = result.get(message.getLink()); + if (m == null) m = new ArrayList(); + m.add(new Integer(p++)); + result.put(message, m); + } + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + + public static Thread searchSRURSS( + final BlockingQueue queue, + final String urlBase, + final String query, + final long timeoutInit, + final int maximumRecordsInit, + final boolean verify, + final boolean global) { + Thread job = new Thread() { + public void run() { + int startRecord = 0; + RSSMessage message; + int maximumRecords = maximumRecordsInit; + long timeout = timeoutInit; + mainloop: while (timeout > 0 && maximumRecords > 0) { + long st = System.currentTimeMillis(); + RSSFeed feed; + try { + feed = loadSRURSS(urlBase, query, timeout, startRecord, recordsPerSession, verify, global); + } catch (IOException e1) { + break mainloop; + } + if (feed == null || feed.isEmpty()) break mainloop; + maximumRecords -= feed.size(); + innerloop: while (!feed.isEmpty()) { + message = feed.pollMessage(); + if (message == null) break innerloop; + try { + queue.put(message); + } catch (InterruptedException e) { + break innerloop; + } + } + startRecord += recordsPerSession; + timeout -= System.currentTimeMillis() - st; + } + try { queue.put(RSSMessage.POISON); } catch (InterruptedException e) {} + } + }; + job.start(); + return job; + } + + /** + * send a query to a yacy public search interface + * @param rssSearchServiceURL the target url base (everything before the ? that follows the SRU request syntax properties). 
can null, then the local peer is used + * @param query the query as string + * @param startRecord number of first record + * @param maximumRecords maximum number of records + * @param verify if true, result entries are verified using the snippet fetch (slow); if false simply the result is returned + * @param global if true also search results from other peers are included + * @param timeout milliseconds that are waited at maximum for a search result + * @return + */ + public static RSSFeed loadSRURSS( + String rssSearchServiceURL, + String query, + long timeout, + int startRecord, + int maximumRecords, + boolean verify, + boolean global) throws IOException { + MultiProtocolURI uri = null; + try { + uri = new MultiProtocolURI(rssSearchServiceURL); + } catch (MalformedURLException e) { + throw new IOException("cora.Search failed asking peer '" + rssSearchServiceURL + "': bad url, " + e.getMessage()); + } + + // send request + try { + final LinkedHashMap parts = new LinkedHashMap(); + parts.put("query", new StringBody(query)); + parts.put("startRecord", new StringBody(Integer.toString(startRecord))); + parts.put("maximumRecords", new StringBody(Long.toString(maximumRecords))); + parts.put("verify", new StringBody(verify ? "true" : "false")); + parts.put("resource", new StringBody(global ? 
"global" : "local")); + final byte[] result = HTTPConnector.getConnector(MultiProtocolURI.yacybotUserAgent).post(new MultiProtocolURI(rssSearchServiceURL), (int) timeout, uri.getHost(), parts); + //String debug = new String(result); System.out.println("*** DEBUG: " + debug); + final RSSReader reader = RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, result); + if (reader == null) { + throw new IOException("cora.Search failed asking peer '" + uri.getHost() + "': probably bad response from remote peer (1), reader == null"); + } + final RSSFeed feed = reader.getFeed(); + if (feed == null) { + // case where the rss reader does not understand the content + throw new IOException("cora.Search failed asking peer '" + uri.getHost() + "': probably bad response from remote peer (2)"); + } + return feed; + } catch (final IOException e) { + throw new IOException("cora.Search error asking peer '" + uri.getHost() + "':" + e.toString()); + } + } + +} diff --git a/source/net/yacy/document/SnippetExtractor.java b/source/net/yacy/document/SnippetExtractor.java index 385cb340a..2a45e0c57 100644 --- a/source/net/yacy/document/SnippetExtractor.java +++ b/source/net/yacy/document/SnippetExtractor.java @@ -166,8 +166,7 @@ public class SnippetExtractor { assert maxpos >= minpos; final int newlen = Math.max(10, maxpos - minpos + 10); final int around = (maxLength - newlen) / 2; - assert minpos - around < sentence.length() : "maxpos = " + maxpos + ", minpos = " + minpos + ", around = " + around + ", sentence.length() = " + sentence.length(); - //assert ((maxpos + around) <= sentence.length()) && ((maxpos + around) <= sentence.length()) : "maxpos = " + maxpos + ", minpos = " + minpos + ", around = " + around + ", sentence.length() = " + sentence.length(); + assert minpos - around < sentence.length() : "maxpos = " + maxpos + ", minpos = " + minpos + ", around = " + around + ", sentence.length() = " + sentence.length(); //maxpos = 435, minpos = 17, around = -124, sentence.length() = 44 sentence = "[..] 
" + sentence.substring(minpos - around, ((maxpos + around) > sentence.length()) ? sentence.length() : (maxpos + around)).trim() + " [..]"; minpos = around; maxpos = sentence.length() - around - 5;