refactoring

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7426 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
orbiter 2011-01-10 08:40:41 +00:00
parent f5baf53391
commit 54e77e6255
9 changed files with 466 additions and 337 deletions

View File

@ -26,7 +26,7 @@ import java.util.concurrent.TimeUnit;
import net.yacy.cora.document.RSSMessage; import net.yacy.cora.document.RSSMessage;
import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.services.Search; import net.yacy.cora.services.SearchSRURSS;
import net.yacy.document.geolocalization.Location; import net.yacy.document.geolocalization.Location;
import de.anomic.data.LibraryProvider; import de.anomic.data.LibraryProvider;
import de.anomic.search.Switchboard; import de.anomic.search.Switchboard;
@ -93,7 +93,7 @@ public class yacysearch_location {
// get a queue of search results // get a queue of search results
String rssSearchServiceURL = "http://localhost:" + sb.getConfig("port", "8080") + "/yacysearch.rss"; String rssSearchServiceURL = "http://localhost:" + sb.getConfig("port", "8080") + "/yacysearch.rss";
BlockingQueue<RSSMessage> results = new LinkedBlockingQueue<RSSMessage>(); BlockingQueue<RSSMessage> results = new LinkedBlockingQueue<RSSMessage>();
Search.searchSRURSS(rssSearchServiceURL, query, maximumTime, Integer.MAX_VALUE, false, false, results); SearchSRURSS.searchSRURSS(results, rssSearchServiceURL, query, maximumTime, Integer.MAX_VALUE, false, false);
// take the results and compute some locations // take the results and compute some locations
RSSMessage message; RSSMessage message;

View File

@ -62,7 +62,7 @@ import net.yacy.cora.document.RSSFeed;
import net.yacy.cora.document.RSSMessage; import net.yacy.cora.document.RSSMessage;
import net.yacy.cora.document.RSSReader; import net.yacy.cora.document.RSSReader;
import net.yacy.cora.protocol.http.HTTPConnector; import net.yacy.cora.protocol.http.HTTPConnector;
import net.yacy.cora.services.Search; import net.yacy.cora.services.SearchSRURSS;
import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReference; import net.yacy.kelondro.data.word.WordReference;
@ -372,7 +372,7 @@ public final class yacyClient {
public static RSSFeed search(final yacySeed targetSeed, String query, boolean verify, boolean global, long timeout, int startRecord, int maximumRecords) throws IOException { public static RSSFeed search(final yacySeed targetSeed, String query, boolean verify, boolean global, long timeout, int startRecord, int maximumRecords) throws IOException {
String address = (targetSeed == null || targetSeed == Switchboard.getSwitchboard().peers.mySeed()) ? "localhost:" + Switchboard.getSwitchboard().getConfig("port", "8080") : targetSeed.getClusterAddress(); String address = (targetSeed == null || targetSeed == Switchboard.getSwitchboard().peers.mySeed()) ? "localhost:" + Switchboard.getSwitchboard().getConfig("port", "8080") : targetSeed.getClusterAddress();
String urlBase = "http://" + address + "/yacysearch.rss"; String urlBase = "http://" + address + "/yacysearch.rss";
return Search.loadSRURSS(urlBase, query, timeout, startRecord, maximumRecords, verify, global); return SearchSRURSS.loadSRURSS(urlBase, query, timeout, startRecord, maximumRecords, verify, global);
} }
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")

View File

@ -45,6 +45,24 @@ public class RSSFeed implements Iterable<RSSMessage> {
this.maxsize = maxsize; this.maxsize = maxsize;
} }
/**
 * Build a feed that contains one message per given URL.
 * The feed is created with a capacity of Integer.MAX_VALUE. Every message is
 * constructed from the normal form of its URL (passed as first and third
 * constructor argument, with an empty second argument) and carries the given
 * source string as author, so the origin of the message stays recognizable.
 * @param links the URLs to wrap into messages; must not be null
 * @param source the author value assigned to every generated message
 */
public RSSFeed(Set<MultiProtocolURI> links, String source) {
    this(Integer.MAX_VALUE);
    for (final MultiProtocolURI link : links) {
        final String normalized = link.toNormalform(true, false);
        final RSSMessage entry = new RSSMessage(normalized, "", normalized);
        entry.setAuthor(source);
        this.addMessage(entry);
    }
}
public void setChannel(final RSSMessage channelItem) { public void setChannel(final RSSMessage channelItem) {
this.channel = channelItem; this.channel = channelItem;
} }

View File

@ -21,6 +21,7 @@
package net.yacy.cora.document; package net.yacy.cora.document;
import java.text.ParseException; import java.text.ParseException;
import java.util.Comparator;
import java.util.Date; import java.util.Date;
import java.util.HashSet; import java.util.HashSet;
import java.util.Map; import java.util.Map;
@ -31,7 +32,7 @@ import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.date.ISO8601Formatter; import net.yacy.cora.date.ISO8601Formatter;
import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.HeaderFramework;
public class RSSMessage implements Hit { public class RSSMessage implements Hit, Comparable<RSSMessage>, Comparator<RSSMessage> {
public static enum Token { public static enum Token {
@ -58,13 +59,13 @@ public class RSSMessage implements Hit {
for (String s: k) this.keys.add(s); for (String s: k) this.keys.add(s);
} }
public String valueFrom(Map<String, String> map) { public String valueFrom(Map<String, String> map, String dflt) {
String value; String value;
for (String key: this.keys) { for (String key: this.keys) {
value = map.get(key); value = map.get(key);
if (value != null) return value; if (value != null) return value;
} }
return ""; return dflt;
} }
public Set<String> keys() { public Set<String> keys() {
@ -107,31 +108,50 @@ public class RSSMessage implements Hit {
} }
public String getTitle() { public String getTitle() {
return Token.title.valueFrom(this.map); return Token.title.valueFrom(this.map, "");
} }
public String getLink() { public String getLink() {
return Token.link.valueFrom(this.map); return Token.link.valueFrom(this.map, "");
}
/**
 * Two messages are considered equal when their links are equal.
 * @param o the object to compare with
 * @return true if o is an RSSMessage with the same link as this message
 */
@Override
public boolean equals(Object o) {
    if (!(o instanceof RSSMessage)) return false;
    return ((RSSMessage) o).getLink().equals(this.getLink());
}
/**
 * The hash code is derived from the link only, which keeps it consistent
 * with equals().
 * @return the hash code of the link string
 */
@Override
public int hashCode() {
    return getLink().hashCode();
}
/**
 * Order messages by their link string.
 * @param o the message to compare with
 * @return 1 if o is null, otherwise the result of comparing both link strings
 */
@Override
public int compareTo(RSSMessage o) {
    // for a parameter that is already typed as RSSMessage an instanceof test can only fail on null
    if (o == null) return 1;
    return this.getLink().compareTo(o.getLink());
}
@Override
public int compare(RSSMessage o1, RSSMessage o2) {
return o1.compareTo(o2);
} }
public String getDescription() { public String getDescription() {
return Token.description.valueFrom(this.map); return Token.description.valueFrom(this.map, "");
} }
public String getAuthor() { public String getAuthor() {
return Token.author.valueFrom(this.map); return Token.author.valueFrom(this.map, "");
} }
public String getCopyright() { public String getCopyright() {
return Token.copyright.valueFrom(this.map); return Token.copyright.valueFrom(this.map, "");
} }
public String getCategory() { public String getCategory() {
return Token.category.valueFrom(this.map); return Token.category.valueFrom(this.map, "");
} }
public String[] getSubject() { public String[] getSubject() {
String subject = Token.subject.valueFrom(this.map); String subject = Token.subject.valueFrom(this.map, "");
if (subject.indexOf(',') >= 0) return subject.split(","); if (subject.indexOf(',') >= 0) return subject.split(",");
if (subject.indexOf(';') >= 0) return subject.split(";"); if (subject.indexOf(';') >= 0) return subject.split(";");
if (subject.indexOf('|') >= 0) return subject.split("|"); if (subject.indexOf('|') >= 0) return subject.split("|");
@ -139,15 +159,15 @@ public class RSSMessage implements Hit {
} }
public String getReferrer() { public String getReferrer() {
return Token.referrer.valueFrom(this.map); return Token.referrer.valueFrom(this.map, "");
} }
public String getLanguage() { public String getLanguage() {
return Token.language.valueFrom(this.map); return Token.language.valueFrom(this.map, "");
} }
public Date getPubDate() { public Date getPubDate() {
String dateString = Token.pubDate.valueFrom(this.map); String dateString = Token.pubDate.valueFrom(this.map, "");
Date date; Date date;
try { try {
date = ISO8601Formatter.FORMATTER.parse(dateString); date = ISO8601Formatter.FORMATTER.parse(dateString);
@ -162,20 +182,20 @@ public class RSSMessage implements Hit {
} }
public String getGuid() { public String getGuid() {
return Token.guid.valueFrom(this.map); return Token.guid.valueFrom(this.map, "");
} }
public String getTTL() { public String getTTL() {
return Token.ttl.valueFrom(this.map); return Token.ttl.valueFrom(this.map, "");
} }
public String getDocs() { public String getDocs() {
return Token.docs.valueFrom(this.map); return Token.docs.valueFrom(this.map, "");
} }
public long getSize() { public long getSize() {
String size = Token.size.valueFrom(this.map); String size = Token.size.valueFrom(this.map, "-1");
return (size == null || size.length() == 0) ? 0 : Long.parseLong(size); return (size == null || size.length() == 0) ? -1 : Long.parseLong(size);
} }
public String getFulltext() { public String getFulltext() {

View File

@ -1,313 +0,0 @@
/**
* Search
* Copyright 2010 by Michael Peter Christen
* First released 25.05.2010 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.services;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;
import java.util.regex.Pattern;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.RSSFeed;
import net.yacy.cora.document.RSSMessage;
import net.yacy.cora.document.RSSReader;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.cora.protocol.http.HTTPConnector;
import net.yacy.cora.protocol.http.LinkExtractor;
import net.yacy.cora.storage.ScoreMap;
import org.apache.http.entity.mime.content.ContentBody;
import org.apache.http.entity.mime.content.StringBody;
/**
 * Meta search: sends one query to a list of SRU/RSS search services and to a
 * list of generic (HTML scraping) services, records for every found URL the
 * positions at which it appeared, and combines these positions into a score.
 * Starting this thread launches one accumulation thread per service;
 * use waitTermination() to wait for all of them and getResults() for the score map.
 */
public class Search extends Thread {

    /** number of records that are requested with each single SRU call */
    private final static int recordsPerSession = 10;

    public static final String[] SRURSSServicesList = {
        "http://yacy.dyndns.org:8000/yacysearch.rss",
        "http://yacy.caloulinux.net:8085/yacysearch.rss",
        "http://algire.dyndns.org:8085/yacysearch.rss",
        "http://breyvogel.dyndns.org:8002/yacysearch.rss"
    };

    /** url templates of generic services; the '$' is replaced by the query */
    public static final String[] genericServicesList = {
        "http://www.scroogle.org/cgi-bin/nbbw.cgi?Gw=$&n=2",
        "http://blekko.com/ws/$+/rss",
        "http://www.bing.com/search?q=$&format=rss",
        "http://search.twitter.com/search.atom?q=$"
    };

    /**
     * record that a url appeared at the given position.
     * The read-modify-write on the map is done while holding the monitor of the map
     * because several accumulation threads write into the same map.
     */
    private static void addPosition(final Map<MultiProtocolURI, List<Integer>> result, final MultiProtocolURI uri, final int position) {
        synchronized (result) {
            List<Integer> m = result.get(uri);
            if (m == null) {
                m = new ArrayList<Integer>();
                result.put(uri, m);
            }
            m.add(Integer.valueOf(position));
        }
    }

    /**
     * start a thread that queries a SRU/RSS service and writes the position of
     * every result link into the given map. Links that cannot be parsed as url
     * are reported on stderr and do not consume a position.
     * @param urlBase the url of the search service
     * @param query the query as string
     * @param timeoutInit time-out in milliseconds, used for the search and for every wait on the next result
     * @param maximumRecordsInit number of records after which no further request is sent
     * @param verify passed to the search service as 'verify' property
     * @param global if true the 'global' resource is requested, otherwise 'local'
     * @param result the map that collects the positions for each url
     * @return the already started accumulation thread
     */
    public static Thread accumulateSRURSS(
            final String urlBase,
            final String query,
            final long timeoutInit,
            final int maximumRecordsInit,
            final boolean verify,
            final boolean global,
            final Map<MultiProtocolURI, List<Integer>> result) {
        Thread t = new Thread() {
            BlockingQueue<RSSMessage> results = new LinkedBlockingQueue<RSSMessage>();
            @Override
            public void run() {
                searchSRURSS(urlBase, query, timeoutInit, maximumRecordsInit, verify, global, results);
                int p = 1;
                RSSMessage message;
                try {
                    while ((message = results.poll(timeoutInit, TimeUnit.MILLISECONDS)) != RSSMessage.POISON) {
                        // null means that no message arrived within the time-out
                        if (message == null) break;
                        try {
                            MultiProtocolURI uri = new MultiProtocolURI(message.getLink());
                            addPosition(result, uri, p);
                            p++;
                        } catch (MalformedURLException e) {
                            e.printStackTrace();
                        }
                    }
                } catch (InterruptedException e) {
                    // stop accumulating and keep the interrupt visible
                    Thread.currentThread().interrupt();
                }
            }
        };
        t.start();
        return t;
    }

    /**
     * start a thread that loads search results page by page and puts every
     * message into the given queue. The loading stops when the time-out is used up,
     * the maximum number of records is reached, a request fails or a page is empty.
     * At the end RSSMessage.POISON is put into the queue.
     * @param urlBase the url of the search service
     * @param query the query as string
     * @param timeoutInit the time budget in milliseconds for all requests together
     * @param maximumRecordsInit number of records after which no further request is sent
     * @param verify passed to the search service as 'verify' property
     * @param global if true the 'global' resource is requested, otherwise 'local'
     * @param queue the queue that receives the messages and finally the poison message
     * @return the already started loader thread
     */
    public static Thread searchSRURSS(
            final String urlBase,
            final String query,
            final long timeoutInit,
            final int maximumRecordsInit,
            final boolean verify,
            final boolean global,
            final BlockingQueue<RSSMessage> queue) {
        Thread job = new Thread() {
            @Override
            public void run() {
                int startRecord = 0;
                RSSMessage message;
                int maximumRecords = maximumRecordsInit;
                long timeout = timeoutInit;
                mainloop: while (timeout > 0 && maximumRecords > 0) {
                    long st = System.currentTimeMillis();
                    RSSFeed feed;
                    try {
                        feed = loadSRURSS(urlBase, query, timeout, startRecord, recordsPerSession, verify, global);
                    } catch (IOException e1) {
                        break mainloop;
                    }
                    if (feed == null || feed.isEmpty()) break mainloop;
                    maximumRecords -= feed.size();
                    innerloop: while (!feed.isEmpty()) {
                        message = feed.pollMessage();
                        if (message == null) break innerloop;
                        try {
                            queue.put(message);
                        } catch (InterruptedException e) {
                            break innerloop;
                        }
                    }
                    startRecord += recordsPerSession;
                    // the time that this round has consumed reduces the remaining budget
                    timeout -= System.currentTimeMillis() - st;
                }
                try { queue.put(RSSMessage.POISON); } catch (InterruptedException e) {}
            }
        };
        job.start();
        return job;
    }

    /**
     * send a query to a yacy public search interface
     * @param rssSearchServiceURL the target url base (everything before the ? that follows the SRU request syntax properties)
     * @param query the query as string
     * @param timeout milliseconds that are waited at maximum for a search result
     * @param startRecord number of first record
     * @param maximumRecords maximum number of records
     * @param verify if true, result entries are verified using the snippet fetch (slow); if false simply the result is returned
     * @param global if true also search results from other peers are included
     * @return the feed that was parsed from the response, never null
     * @throws IOException if the url is malformed, the request fails or the response cannot be parsed as feed
     */
    public static RSSFeed loadSRURSS(
            String rssSearchServiceURL,
            String query,
            long timeout,
            int startRecord,
            int maximumRecords,
            boolean verify,
            boolean global) throws IOException {
        MultiProtocolURI uri = null;
        try {
            uri = new MultiProtocolURI(rssSearchServiceURL);
        } catch (MalformedURLException e) {
            throw new IOException("cora.Search failed asking peer '" + rssSearchServiceURL + "': bad url, " + e.getMessage(), e);
        }

        // send request
        try {
            final LinkedHashMap<String,ContentBody> parts = new LinkedHashMap<String,ContentBody>();
            parts.put("query", new StringBody(query));
            parts.put("startRecord", new StringBody(Integer.toString(startRecord)));
            parts.put("maximumRecords", new StringBody(Long.toString(maximumRecords)));
            parts.put("verify", new StringBody(verify ? "true" : "false"));
            parts.put("resource", new StringBody(global ? "global" : "local"));
            // the url was already parsed above, no need to parse it a second time
            final byte[] result = HTTPConnector.getConnector(MultiProtocolURI.yacybotUserAgent).post(uri, (int) timeout, uri.getHost(), parts);
            final RSSReader reader = RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, result);
            if (reader == null) {
                throw new IOException("cora.Search failed asking peer '" + uri.getHost() + "': probably bad response from remote peer (1), reader == null");
            }
            final RSSFeed feed = reader.getFeed();
            if (feed == null) {
                // case where the rss reader does not understand the content
                throw new IOException("cora.Search failed asking peer '" + uri.getHost() + "': probably bad response from remote peer (2)");
            }
            return feed;
        } catch (final IOException e) {
            // keep the original exception as cause
            throw new IOException("cora.Search error asking peer '" + uri.getHost() + "':" + e.toString(), e);
        }
    }

    /**
     * start a thread that loads the result page of a generic search service,
     * extracts the links from it and writes the position of every link into the given map.
     * @param query the query as string; spaces are replaced by '+'
     * @param service the url template of the service where '$' marks the place of the query
     * @param result the map that collects the positions for each url
     * @param timeout time-out of the http request in milliseconds
     * @return the already started accumulation thread
     */
    public static Thread accumulateGeneric(
            String query,
            String service,
            final Map<MultiProtocolURI, List<Integer>> result,
            final int timeout) {
        query = query.replace(' ', '+');
        // literal replacement: with replaceAll a '$' or '\' inside the query would be read as regex replacement syntax
        final String servicePatched = service.replace("$", query);
        Thread t = new Thread() {
            @Override
            public void run() {
                try {
                    MultiProtocolURI[] sr = loadGeneric(new MultiProtocolURI(servicePatched), timeout);
                    int p = 1;
                    for (MultiProtocolURI u: sr) {
                        addPosition(result, u, p++);
                    }
                } catch (MalformedURLException e) {
                    e.printStackTrace();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        };
        t.start();
        return t;
    }

    /**
     * load a page with a http GET and scrape the links from it.
     * The link extractor gets a pattern that matches every string containing the host of the service.
     * @throws IOException if the status code is not 200 or no content was returned
     */
    private static MultiProtocolURI[] loadGeneric(MultiProtocolURI uri, long timeout) throws IOException {
        final RequestHeader requestHeader = new RequestHeader();
        requestHeader.put(HeaderFramework.USER_AGENT, MultiProtocolURI.yacybotUserAgent);
        final HTTPClient client = new HTTPClient();
        client.setTimout((int) timeout);
        client.setHeader(requestHeader.entrySet());
        byte[] result = client.GETbytes(uri.toString());
        client.finish();
        if (client.getStatusCode() != 200) {
            throw new IOException("Server returned status: " + client.getHttpResponse().getStatusLine());
        }
        if (result == null) throw new IOException("cora.Search error asking peer '" + uri.getHost() + "': null");
        // the host is quoted so that the dots in it are not taken as regex wildcards
        LinkExtractor le = new LinkExtractor(Pattern.compile(".*" + Pattern.quote(uri.getHost()) + ".*"));
        le.scrape(new String(result));
        MultiProtocolURI[] links = le.getLinks();
        return links;
    }

    /**
     * make a RSS feed using a set of urls
     * the source string is assigned to all messages as author to mark the messages' origin
     * @param links the urls that shall appear in the feed
     * @param source the author value for every message
     * @return a feed with one message for each url
     */
    public static RSSFeed links2feed(Set<MultiProtocolURI> links, String source) {
        RSSFeed feed = new RSSFeed(Integer.MAX_VALUE);
        String u;
        RSSMessage message;
        for (MultiProtocolURI uri: links) {
            u = uri.toNormalform(true, false);
            message = new RSSMessage(u, "", u);
            message.setAuthor(source);
            feed.addMessage(message);
        }
        return feed;
    }

    private final Map<MultiProtocolURI, List<Integer>> result;
    private final String query;
    private final int count;
    private final String[] rssServices, genericServices;
    private final List<Thread> threads;

    /**
     * @param query the query as string
     * @param count number of records after which a SRU/RSS service is not asked for more
     * @param rssServices urls of SRU/RSS search services
     * @param genericServices url templates of generic services
     */
    public Search(String query, int count, String[] rssServices, String[] genericServices) {
        this.result = new ConcurrentHashMap<MultiProtocolURI, List<Integer>>();
        this.query = query;
        this.count = count;
        this.rssServices = rssServices;
        this.genericServices = genericServices;
        this.threads = new ArrayList<Thread>();
    }

    /**
     * launch one accumulation thread for every service; this method does not wait for them
     */
    @Override
    public void run() {
        for (String service: this.rssServices) threads.add(accumulateSRURSS(service, this.query, 10000, this.count, false, true, this.result));
        for (String service: this.genericServices) threads.add(accumulateGeneric(this.query, service, this.result, 10000));
    }

    /**
     * compute a score for every url that was found so far.
     * The score is the sum of the positions of the url, multiplied with the
     * number of services and divided by the number of appearances.
     * @return a score map of urls
     */
    public ScoreMap<MultiProtocolURI> getResults() {
        ScoreMap<MultiProtocolURI> scores = new ScoreMap<MultiProtocolURI>();
        int m = this.rssServices.length + this.genericServices.length;
        for (Map.Entry<MultiProtocolURI, List<Integer>> entry: this.result.entrySet()) {
            int appearances = entry.getValue().size();
            if (appearances == 0) continue; // nothing to average
            int a = 0;
            for (Integer i : entry.getValue()) a += i.intValue();
            scores.inc(entry.getKey(), a * m / appearances);
        }
        return scores;
    }

    /**
     * wait until all accumulation threads have terminated.
     * If this search thread was started, the method first waits until run() has
     * launched all accumulation threads; otherwise the list of threads could
     * still be incomplete. An interrupt during the wait is not lost: the
     * interrupted status of the calling thread is set again before returning.
     */
    public void waitTermination() {
        boolean interrupted = false;
        if (Thread.currentThread() != this) {
            try { this.join(); } catch (InterruptedException e) { interrupted = true; }
        }
        for (Thread t: threads) {
            try { t.join(); } catch (InterruptedException e) { interrupted = true; }
        }
        if (interrupted) Thread.currentThread().interrupt();
    }

    public static void main(String[] args) {
        StringBuilder sb = new StringBuilder();
        for (String s: args) sb.append(s).append(' ');
        String query = sb.toString().trim();
        Search search = new Search(query, 100, SRURSSServicesList, genericServicesList);
        search.start();
        // no sleep needed here: waitTermination waits for the launch of the accumulation threads
        search.waitTermination();
        ScoreMap<MultiProtocolURI> result = search.getResults();
        Iterator<MultiProtocolURI> i = result.keys(true);
        MultiProtocolURI u;
        while (i.hasNext()) {
            u = i.next();
            System.out.println("[" + result.get(u) + "] " + u.toNormalform(true, false));
        }
        try {HTTPClient.closeConnectionManager();} catch (InterruptedException e) {}
    }
}

View File

@ -0,0 +1,40 @@
/**
* SearchAccumulator
* Copyright 2010 by Michael Peter Christen
* First released 07.01.2011 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.services;
/**
 * Common type for the worker threads that are registered at a search object.
 * It declares the two thread operations that are needed to watch a worker:
 * waiting for its end and testing whether it is still running.
 */
public interface SearchAccumulator extends Runnable {

    /**
     * Wait until this accumulator has terminated.
     * @throws InterruptedException if the waiting thread is interrupted
     */
    void join() throws InterruptedException;

    /**
     * Test if this accumulator is still running.
     * @return true while the accumulator has not terminated
     */
    boolean isAlive();
}

View File

@ -0,0 +1,164 @@
/**
* SearchHub
* Copyright 2010 by Michael Peter Christen
* First released 25.05.2010 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.services;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import net.yacy.cora.document.RSSMessage;
import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.cora.storage.ScoreMap;
/**
 * A search hub holds the state of one search request: the query, the time-out,
 * the accumulator threads that work on the request and the map in which the
 * accumulators record at which positions each result message appeared.
 */
public class SearchHub {

    private static final String[] SRURSSServicesList = {
        "http://yacy.dyndns.org:8000/yacysearch.rss",
        "http://yacy.caloulinux.net:8085/yacysearch.rss",
        "http://algire.dyndns.org:8085/yacysearch.rss",
        "http://breyvogel.dyndns.org:8002/yacysearch.rss"
    };

    /** a hub with an empty query string and a time-out of zero */
    public final static SearchHub EMPTY = new SearchHub("", 0);

    private final String query;
    private final int timeout;
    private final List<SearchAccumulator> threads;
    private final Map<RSSMessage, List<Integer>> result;

    /**
     * @param query the query string of the search request
     * @param timeout the time-out of the search request
     */
    public SearchHub(final String query, final int timeout) {
        this.query = query;
        this.timeout = timeout;
        this.threads = new ArrayList<SearchAccumulator>();
        this.result = new ConcurrentHashMap<RSSMessage, List<Integer>>();
    }

    /**
     * get the result of the accumulation
     * @return the map (not a copy) that assigns to every message the list of positions where it appeared
     */
    public Map<RSSMessage, List<Integer>> getAccumulation() {
        return this.result;
    }

    /**
     * add an accumulator to the list of accumulation threads.
     * this is mainly used for waitTermination() and isTerminated()
     * @param a the accumulator to register
     */
    public void addAccumulator(SearchAccumulator a) {
        this.threads.add(a);
    }

    /**
     * get the original query string
     * @return the query string as given to the constructor
     */
    public String getQuery() {
        return this.query;
    }

    /**
     * get the given time-out of the search request
     * @return the time-out as given to the constructor
     */
    public int getTimeout() {
        return this.timeout;
    }

    /**
     * get the list of search results as scored map.
     * The results are combined using their appearance positions: the score of a
     * link is the sum of its positions, multiplied with the number of registered
     * accumulators and divided by the number of appearances.
     * Every time this method is called the list is re-computed to reflect the latest results
     * @return a score map of urls
     */
    public ScoreMap<String> getResults() {
        ScoreMap<String> scores = new ScoreMap<String>();
        int m = threads.size();
        for (Map.Entry<RSSMessage, List<Integer>> entry: this.result.entrySet()) {
            int appearances = entry.getValue().size();
            if (appearances == 0) continue; // nothing to average
            int a = 0;
            for (Integer i : entry.getValue()) a += i.intValue();
            scores.inc(entry.getKey().getLink(), a * m / appearances);
        }
        return scores;
    }

    /**
     * wait until all accumulation threads have terminated.
     * An interrupt during the wait does not abort the waiting for the remaining
     * accumulators, but it is not lost: the interrupted status of the calling
     * thread is set again before this method returns.
     */
    public void waitTermination() {
        boolean interrupted = false;
        for (SearchAccumulator t: threads) {
            try {
                t.join();
            } catch (InterruptedException e) {
                interrupted = true;
            }
        }
        if (interrupted) Thread.currentThread().interrupt();
    }

    /**
     * return true if all accumulation threads have terminated
     * @return false if at least one registered accumulator is still alive
     */
    public boolean isTerminated() {
        for (SearchAccumulator t: threads) if (t.isAlive()) return false;
        return true;
    }

    /**
     * return a hash code of the search hub.
     * This is computed using only the query string because that identifies the object
     */
    @Override
    public int hashCode() {
        return query.hashCode();
    }

    /**
     * two search hubs are equal if their query strings are equal.
     * This keeps equals() consistent with hashCode(), which uses the query string only.
     */
    @Override
    public boolean equals(final Object o) {
        if (this == o) return true;
        if (!(o instanceof SearchHub)) return false;
        return this.query.equals(((SearchHub) o).query);
    }

    /**
     * test method to add a list of SRU RSS services.
     * such services are provided by YaCy peers.
     * For every service an accumulator is created, started and registered at the search hub.
     * @param search the hub that receives the accumulators
     * @param rssServices the urls of the search services
     * @param count passed to each accumulator as maximum number of records
     * @param verify passed to each accumulator
     * @param global passed to each accumulator
     */
    public static void addSRURSSServices(SearchHub search, String[] rssServices, int count, boolean verify, boolean global) {
        for (String service: rssServices) {
            SearchSRURSS accumulator = new SearchSRURSS(search, service, count, verify, global);
            accumulator.start();
            search.addAccumulator(accumulator);
        }
    }

    public static void main(String[] args) {
        StringBuilder sb = new StringBuilder();
        for (String s: args) sb.append(s).append(' ');
        String query = sb.toString().trim();
        SearchHub search = new SearchHub(query, 10000);
        // all accumulators are started and registered when this call returns, no extra delay is needed
        addSRURSSServices(search, SRURSSServicesList, 100, false, false);
        search.waitTermination();
        ScoreMap<String> result = search.getResults();
        Iterator<String> i = result.keys(true);
        String u;
        while (i.hasNext()) {
            u = i.next();
            System.out.println("[" + result.get(u) + "] " + u);
        }
        try {HTTPClient.closeConnectionManager();} catch (InterruptedException e) {}
    }
}

View File

@ -0,0 +1,201 @@
/**
* SearchSRURSS
* Copyright 2010 by Michael Peter Christen
* First released 06.01.2011 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.services;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;
import org.apache.http.entity.mime.content.ContentBody;
import org.apache.http.entity.mime.content.StringBody;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.RSSFeed;
import net.yacy.cora.document.RSSMessage;
import net.yacy.cora.document.RSSReader;
import net.yacy.cora.protocol.http.HTTPConnector;
/**
 * Accumulator thread for one SRU/RSS search service.
 * When started, it loads the search results of the service page by page and
 * records for every received message its position (1 for the first message,
 * 2 for the second and so on) in a result map that can be shared with other accumulators.
 */
public class SearchSRURSS extends Thread implements SearchAccumulator {

    /** number of records that are requested with each single SRU call */
    private final static int recordsPerSession = 10;

    final String urlBase;
    final String query;
    final long timeoutInit;
    final int maximumRecordsInit;
    final boolean verify;
    final boolean global;
    final Map<RSSMessage, List<Integer>> result;

    /** hand-over queue between the loader thread and this accumulator */
    private final BlockingQueue<RSSMessage> results;

    /**
     * @param result the map that collects the positions for each message
     * @param query the query as string
     * @param timeoutInit time-out in milliseconds, used for the search and for every wait on the next result
     * @param urlBase the url of the search service
     * @param maximumRecordsInit number of records after which no further request is sent
     * @param verify passed to the search service as 'verify' property
     * @param global if true the 'global' resource is requested, otherwise 'local'
     */
    public SearchSRURSS(
            final Map<RSSMessage, List<Integer>> result,
            final String query,
            final long timeoutInit,
            final String urlBase,
            final int maximumRecordsInit,
            final boolean verify,
            final boolean global) {
        this.results = new LinkedBlockingQueue<RSSMessage>();
        this.result = result;
        this.query = query;
        this.timeoutInit = timeoutInit;
        this.urlBase = urlBase;
        this.maximumRecordsInit = maximumRecordsInit;
        this.verify = verify;
        this.global = global;
    }

    /**
     * create an accumulator that takes result map, query and time-out from a search hub
     * @param search the hub that provides the accumulation map, the query and the time-out
     * @param urlBase the url of the search service
     * @param maximumRecordsInit number of records after which no further request is sent
     * @param verify passed to the search service as 'verify' property
     * @param global if true the 'global' resource is requested, otherwise 'local'
     */
    public SearchSRURSS(
            final SearchHub search,
            final String urlBase,
            final int maximumRecordsInit,
            final boolean verify,
            final boolean global) {
        this.results = new LinkedBlockingQueue<RSSMessage>();
        this.result = search.getAccumulation();
        this.query = search.getQuery();
        this.timeoutInit = search.getTimeout();
        this.urlBase = urlBase;
        this.maximumRecordsInit = maximumRecordsInit;
        this.verify = verify;
        this.global = global;
    }

    /**
     * start the loader thread and record the position of every message it delivers.
     * The accumulation ends when the poison message arrives or when no message
     * arrives within the time-out.
     */
    @Override
    public void run() {
        searchSRURSS(results, urlBase, query, timeoutInit, maximumRecordsInit, verify, global);
        int p = 1;
        RSSMessage message;
        try {
            while ((message = results.poll(timeoutInit, TimeUnit.MILLISECONDS)) != RSSMessage.POISON) {
                // null means that no message arrived within the time-out
                if (message == null) break;
                // the map may be shared with other accumulators: do the read-modify-write atomically
                synchronized (result) {
                    // the map is keyed by message, therefore the lookup must use the message
                    // itself; a lookup with the link string would never find an entry
                    List<Integer> m = result.get(message);
                    if (m == null) {
                        m = new ArrayList<Integer>();
                        result.put(message, m);
                    }
                    m.add(Integer.valueOf(p++));
                }
            }
        } catch (InterruptedException e) {
            // stop accumulating and keep the interrupt visible
            Thread.currentThread().interrupt();
        }
    }

    /**
     * start a thread that loads search results page by page and puts every
     * message into the given queue. The loading stops when the time-out is used up,
     * the maximum number of records is reached, a request fails or a page is empty.
     * At the end RSSMessage.POISON is put into the queue.
     * @param queue the queue that receives the messages and finally the poison message
     * @param urlBase the url of the search service
     * @param query the query as string
     * @param timeoutInit the time budget in milliseconds for all requests together
     * @param maximumRecordsInit number of records after which no further request is sent
     * @param verify passed to the search service as 'verify' property
     * @param global if true the 'global' resource is requested, otherwise 'local'
     * @return the already started loader thread
     */
    public static Thread searchSRURSS(
            final BlockingQueue<RSSMessage> queue,
            final String urlBase,
            final String query,
            final long timeoutInit,
            final int maximumRecordsInit,
            final boolean verify,
            final boolean global) {
        Thread job = new Thread() {
            @Override
            public void run() {
                int startRecord = 0;
                RSSMessage message;
                int maximumRecords = maximumRecordsInit;
                long timeout = timeoutInit;
                mainloop: while (timeout > 0 && maximumRecords > 0) {
                    long st = System.currentTimeMillis();
                    RSSFeed feed;
                    try {
                        feed = loadSRURSS(urlBase, query, timeout, startRecord, recordsPerSession, verify, global);
                    } catch (IOException e1) {
                        break mainloop;
                    }
                    if (feed == null || feed.isEmpty()) break mainloop;
                    maximumRecords -= feed.size();
                    innerloop: while (!feed.isEmpty()) {
                        message = feed.pollMessage();
                        if (message == null) break innerloop;
                        try {
                            queue.put(message);
                        } catch (InterruptedException e) {
                            break innerloop;
                        }
                    }
                    startRecord += recordsPerSession;
                    // the time that this round has consumed reduces the remaining budget
                    timeout -= System.currentTimeMillis() - st;
                }
                try { queue.put(RSSMessage.POISON); } catch (InterruptedException e) {}
            }
        };
        job.start();
        return job;
    }

    /**
     * send a query to a yacy public search interface
     * @param rssSearchServiceURL the target url base (everything before the ? that follows the SRU request syntax properties)
     * @param query the query as string
     * @param timeout milliseconds that are waited at maximum for a search result
     * @param startRecord number of first record
     * @param maximumRecords maximum number of records
     * @param verify if true, result entries are verified using the snippet fetch (slow); if false simply the result is returned
     * @param global if true also search results from other peers are included
     * @return the feed that was parsed from the response, never null
     * @throws IOException if the url is malformed, the request fails or the response cannot be parsed as feed
     */
    public static RSSFeed loadSRURSS(
            String rssSearchServiceURL,
            String query,
            long timeout,
            int startRecord,
            int maximumRecords,
            boolean verify,
            boolean global) throws IOException {
        MultiProtocolURI uri = null;
        try {
            uri = new MultiProtocolURI(rssSearchServiceURL);
        } catch (MalformedURLException e) {
            throw new IOException("cora.Search failed asking peer '" + rssSearchServiceURL + "': bad url, " + e.getMessage(), e);
        }

        // send request
        try {
            final LinkedHashMap<String,ContentBody> parts = new LinkedHashMap<String,ContentBody>();
            parts.put("query", new StringBody(query));
            parts.put("startRecord", new StringBody(Integer.toString(startRecord)));
            parts.put("maximumRecords", new StringBody(Long.toString(maximumRecords)));
            parts.put("verify", new StringBody(verify ? "true" : "false"));
            parts.put("resource", new StringBody(global ? "global" : "local"));
            // the url was already parsed above, no need to parse it a second time
            final byte[] result = HTTPConnector.getConnector(MultiProtocolURI.yacybotUserAgent).post(uri, (int) timeout, uri.getHost(), parts);
            final RSSReader reader = RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, result);
            if (reader == null) {
                throw new IOException("cora.Search failed asking peer '" + uri.getHost() + "': probably bad response from remote peer (1), reader == null");
            }
            final RSSFeed feed = reader.getFeed();
            if (feed == null) {
                // case where the rss reader does not understand the content
                throw new IOException("cora.Search failed asking peer '" + uri.getHost() + "': probably bad response from remote peer (2)");
            }
            return feed;
        } catch (final IOException e) {
            // keep the original exception as cause
            throw new IOException("cora.Search error asking peer '" + uri.getHost() + "':" + e.toString(), e);
        }
    }
}

View File

@ -166,8 +166,7 @@ public class SnippetExtractor {
assert maxpos >= minpos; assert maxpos >= minpos;
final int newlen = Math.max(10, maxpos - minpos + 10); final int newlen = Math.max(10, maxpos - minpos + 10);
final int around = (maxLength - newlen) / 2; final int around = (maxLength - newlen) / 2;
assert minpos - around < sentence.length() : "maxpos = " + maxpos + ", minpos = " + minpos + ", around = " + around + ", sentence.length() = " + sentence.length(); assert minpos - around < sentence.length() : "maxpos = " + maxpos + ", minpos = " + minpos + ", around = " + around + ", sentence.length() = " + sentence.length(); //maxpos = 435, minpos = 17, around = -124, sentence.length() = 44
//assert ((maxpos + around) <= sentence.length()) && ((maxpos + around) <= sentence.length()) : "maxpos = " + maxpos + ", minpos = " + minpos + ", around = " + around + ", sentence.length() = " + sentence.length();
sentence = "[..] " + sentence.substring(minpos - around, ((maxpos + around) > sentence.length()) ? sentence.length() : (maxpos + around)).trim() + " [..]"; sentence = "[..] " + sentence.substring(minpos - around, ((maxpos + around) > sentence.length()) ? sentence.length() : (maxpos + around)).trim() + " [..]";
minpos = around; minpos = around;
maxpos = sentence.length() - around - 5; maxpos = sentence.length() - around - 5;