mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
refactoring
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7426 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
parent
f5baf53391
commit
54e77e6255
|
@ -26,7 +26,7 @@ import java.util.concurrent.TimeUnit;
|
|||
import net.yacy.cora.document.RSSMessage;
|
||||
import net.yacy.cora.protocol.HeaderFramework;
|
||||
import net.yacy.cora.protocol.RequestHeader;
|
||||
import net.yacy.cora.services.Search;
|
||||
import net.yacy.cora.services.SearchSRURSS;
|
||||
import net.yacy.document.geolocalization.Location;
|
||||
import de.anomic.data.LibraryProvider;
|
||||
import de.anomic.search.Switchboard;
|
||||
|
@ -93,7 +93,7 @@ public class yacysearch_location {
|
|||
// get a queue of search results
|
||||
String rssSearchServiceURL = "http://localhost:" + sb.getConfig("port", "8080") + "/yacysearch.rss";
|
||||
BlockingQueue<RSSMessage> results = new LinkedBlockingQueue<RSSMessage>();
|
||||
Search.searchSRURSS(rssSearchServiceURL, query, maximumTime, Integer.MAX_VALUE, false, false, results);
|
||||
SearchSRURSS.searchSRURSS(results, rssSearchServiceURL, query, maximumTime, Integer.MAX_VALUE, false, false);
|
||||
|
||||
// take the results and compute some locations
|
||||
RSSMessage message;
|
||||
|
|
|
@ -62,7 +62,7 @@ import net.yacy.cora.document.RSSFeed;
|
|||
import net.yacy.cora.document.RSSMessage;
|
||||
import net.yacy.cora.document.RSSReader;
|
||||
import net.yacy.cora.protocol.http.HTTPConnector;
|
||||
import net.yacy.cora.services.Search;
|
||||
import net.yacy.cora.services.SearchSRURSS;
|
||||
import net.yacy.kelondro.data.meta.URIMetadataRow;
|
||||
import net.yacy.kelondro.data.word.Word;
|
||||
import net.yacy.kelondro.data.word.WordReference;
|
||||
|
@ -372,7 +372,7 @@ public final class yacyClient {
|
|||
public static RSSFeed search(final yacySeed targetSeed, String query, boolean verify, boolean global, long timeout, int startRecord, int maximumRecords) throws IOException {
|
||||
String address = (targetSeed == null || targetSeed == Switchboard.getSwitchboard().peers.mySeed()) ? "localhost:" + Switchboard.getSwitchboard().getConfig("port", "8080") : targetSeed.getClusterAddress();
|
||||
String urlBase = "http://" + address + "/yacysearch.rss";
|
||||
return Search.loadSRURSS(urlBase, query, timeout, startRecord, maximumRecords, verify, global);
|
||||
return SearchSRURSS.loadSRURSS(urlBase, query, timeout, startRecord, maximumRecords, verify, global);
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
|
|
|
@ -45,6 +45,24 @@ public class RSSFeed implements Iterable<RSSMessage> {
|
|||
this.maxsize = maxsize;
|
||||
}
|
||||
|
||||
/**
|
||||
* make a RSS feed using a set of urls
|
||||
* the source string is assigned to all messages as author to mark the messages' origin
|
||||
* @param links
|
||||
* @param source
|
||||
*/
|
||||
public RSSFeed(Set<MultiProtocolURI> links, String source) {
|
||||
this(Integer.MAX_VALUE);
|
||||
String u;
|
||||
RSSMessage message;
|
||||
for (MultiProtocolURI uri: links) {
|
||||
u = uri.toNormalform(true, false);
|
||||
message = new RSSMessage(u, "", u);
|
||||
message.setAuthor(source);
|
||||
this.addMessage(message);
|
||||
}
|
||||
}
|
||||
|
||||
public void setChannel(final RSSMessage channelItem) {
|
||||
this.channel = channelItem;
|
||||
}
|
||||
|
|
|
@ -21,6 +21,7 @@
|
|||
package net.yacy.cora.document;
|
||||
|
||||
import java.text.ParseException;
|
||||
import java.util.Comparator;
|
||||
import java.util.Date;
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
|
@ -31,7 +32,7 @@ import net.yacy.cora.date.GenericFormatter;
|
|||
import net.yacy.cora.date.ISO8601Formatter;
|
||||
import net.yacy.cora.protocol.HeaderFramework;
|
||||
|
||||
public class RSSMessage implements Hit {
|
||||
public class RSSMessage implements Hit, Comparable<RSSMessage>, Comparator<RSSMessage> {
|
||||
|
||||
public static enum Token {
|
||||
|
||||
|
@ -58,13 +59,13 @@ public class RSSMessage implements Hit {
|
|||
for (String s: k) this.keys.add(s);
|
||||
}
|
||||
|
||||
public String valueFrom(Map<String, String> map) {
|
||||
public String valueFrom(Map<String, String> map, String dflt) {
|
||||
String value;
|
||||
for (String key: this.keys) {
|
||||
value = map.get(key);
|
||||
if (value != null) return value;
|
||||
}
|
||||
return "";
|
||||
return dflt;
|
||||
}
|
||||
|
||||
public Set<String> keys() {
|
||||
|
@ -107,31 +108,50 @@ public class RSSMessage implements Hit {
|
|||
}
|
||||
|
||||
public String getTitle() {
|
||||
return Token.title.valueFrom(this.map);
|
||||
return Token.title.valueFrom(this.map, "");
|
||||
}
|
||||
|
||||
public String getLink() {
|
||||
return Token.link.valueFrom(this.map);
|
||||
return Token.link.valueFrom(this.map, "");
|
||||
}
|
||||
|
||||
public boolean equals(Object o) {
|
||||
return (o instanceof RSSMessage) && ((RSSMessage) o).getLink().equals(this.getLink());
|
||||
}
|
||||
|
||||
public int hashCode() {
|
||||
return getLink().hashCode();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compareTo(RSSMessage o) {
|
||||
if (!(o instanceof RSSMessage)) return 1;
|
||||
return this.getLink().compareTo(o.getLink());
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compare(RSSMessage o1, RSSMessage o2) {
|
||||
return o1.compareTo(o2);
|
||||
}
|
||||
|
||||
public String getDescription() {
|
||||
return Token.description.valueFrom(this.map);
|
||||
return Token.description.valueFrom(this.map, "");
|
||||
}
|
||||
|
||||
public String getAuthor() {
|
||||
return Token.author.valueFrom(this.map);
|
||||
return Token.author.valueFrom(this.map, "");
|
||||
}
|
||||
|
||||
public String getCopyright() {
|
||||
return Token.copyright.valueFrom(this.map);
|
||||
return Token.copyright.valueFrom(this.map, "");
|
||||
}
|
||||
|
||||
public String getCategory() {
|
||||
return Token.category.valueFrom(this.map);
|
||||
return Token.category.valueFrom(this.map, "");
|
||||
}
|
||||
|
||||
public String[] getSubject() {
|
||||
String subject = Token.subject.valueFrom(this.map);
|
||||
String subject = Token.subject.valueFrom(this.map, "");
|
||||
if (subject.indexOf(',') >= 0) return subject.split(",");
|
||||
if (subject.indexOf(';') >= 0) return subject.split(";");
|
||||
if (subject.indexOf('|') >= 0) return subject.split("|");
|
||||
|
@ -139,15 +159,15 @@ public class RSSMessage implements Hit {
|
|||
}
|
||||
|
||||
public String getReferrer() {
|
||||
return Token.referrer.valueFrom(this.map);
|
||||
return Token.referrer.valueFrom(this.map, "");
|
||||
}
|
||||
|
||||
public String getLanguage() {
|
||||
return Token.language.valueFrom(this.map);
|
||||
return Token.language.valueFrom(this.map, "");
|
||||
}
|
||||
|
||||
public Date getPubDate() {
|
||||
String dateString = Token.pubDate.valueFrom(this.map);
|
||||
String dateString = Token.pubDate.valueFrom(this.map, "");
|
||||
Date date;
|
||||
try {
|
||||
date = ISO8601Formatter.FORMATTER.parse(dateString);
|
||||
|
@ -162,20 +182,20 @@ public class RSSMessage implements Hit {
|
|||
}
|
||||
|
||||
public String getGuid() {
|
||||
return Token.guid.valueFrom(this.map);
|
||||
return Token.guid.valueFrom(this.map, "");
|
||||
}
|
||||
|
||||
public String getTTL() {
|
||||
return Token.ttl.valueFrom(this.map);
|
||||
return Token.ttl.valueFrom(this.map, "");
|
||||
}
|
||||
|
||||
public String getDocs() {
|
||||
return Token.docs.valueFrom(this.map);
|
||||
return Token.docs.valueFrom(this.map, "");
|
||||
}
|
||||
|
||||
public long getSize() {
|
||||
String size = Token.size.valueFrom(this.map);
|
||||
return (size == null || size.length() == 0) ? 0 : Long.parseLong(size);
|
||||
String size = Token.size.valueFrom(this.map, "-1");
|
||||
return (size == null || size.length() == 0) ? -1 : Long.parseLong(size);
|
||||
}
|
||||
|
||||
public String getFulltext() {
|
||||
|
|
|
@ -1,313 +0,0 @@
|
|||
/**
|
||||
* Search
|
||||
* Copyright 2010 by Michael Peter Christen
|
||||
* First released 25.05.2010 at http://yacy.net
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with this program in the file lgpl21.txt
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
package net.yacy.cora.services;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.MalformedURLException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.BlockingQueue;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.LinkedBlockingQueue;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.cora.document.RSSFeed;
|
||||
import net.yacy.cora.document.RSSMessage;
|
||||
import net.yacy.cora.document.RSSReader;
|
||||
import net.yacy.cora.protocol.HeaderFramework;
|
||||
import net.yacy.cora.protocol.RequestHeader;
|
||||
import net.yacy.cora.protocol.http.HTTPClient;
|
||||
import net.yacy.cora.protocol.http.HTTPConnector;
|
||||
import net.yacy.cora.protocol.http.LinkExtractor;
|
||||
import net.yacy.cora.storage.ScoreMap;
|
||||
|
||||
import org.apache.http.entity.mime.content.ContentBody;
|
||||
import org.apache.http.entity.mime.content.StringBody;
|
||||
|
||||
public class Search extends Thread {
|
||||
|
||||
private final static int recordsPerSession = 10;
|
||||
|
||||
public static final String[] SRURSSServicesList = {
|
||||
"http://yacy.dyndns.org:8000/yacysearch.rss",
|
||||
"http://yacy.caloulinux.net:8085/yacysearch.rss",
|
||||
"http://algire.dyndns.org:8085/yacysearch.rss",
|
||||
"http://breyvogel.dyndns.org:8002/yacysearch.rss"
|
||||
};
|
||||
|
||||
public static final String[] genericServicesList = {
|
||||
"http://www.scroogle.org/cgi-bin/nbbw.cgi?Gw=$&n=2",
|
||||
"http://blekko.com/ws/$+/rss",
|
||||
"http://www.bing.com/search?q=$&format=rss",
|
||||
"http://search.twitter.com/search.atom?q=$"
|
||||
};
|
||||
|
||||
public static Thread accumulateSRURSS(
|
||||
final String urlBase,
|
||||
final String query,
|
||||
final long timeoutInit,
|
||||
final int maximumRecordsInit,
|
||||
final boolean verify,
|
||||
final boolean global,
|
||||
final Map<MultiProtocolURI, List<Integer>> result) {
|
||||
Thread t = new Thread() {
|
||||
BlockingQueue<RSSMessage> results = new LinkedBlockingQueue<RSSMessage>();
|
||||
public void run() {
|
||||
searchSRURSS(urlBase, query, timeoutInit, maximumRecordsInit, verify, global, results);
|
||||
int p = 1;
|
||||
RSSMessage message;
|
||||
try {
|
||||
while ((message = results.poll(timeoutInit, TimeUnit.MILLISECONDS)) != RSSMessage.POISON) {
|
||||
MultiProtocolURI uri;
|
||||
if (message == null) break;
|
||||
try {
|
||||
uri = new MultiProtocolURI(message.getLink());
|
||||
List<Integer> m = result.get(uri);
|
||||
if (m == null) m = new ArrayList<Integer>();
|
||||
m.add(new Integer(p++));
|
||||
result.put(uri, m);
|
||||
} catch (MalformedURLException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
} catch (InterruptedException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
};
|
||||
t.start();
|
||||
return t;
|
||||
}
|
||||
|
||||
public static Thread searchSRURSS(
|
||||
final String urlBase,
|
||||
final String query,
|
||||
final long timeoutInit,
|
||||
final int maximumRecordsInit,
|
||||
final boolean verify,
|
||||
final boolean global,
|
||||
final BlockingQueue<RSSMessage> queue) {
|
||||
Thread job = new Thread() {
|
||||
public void run() {
|
||||
int startRecord = 0;
|
||||
RSSMessage message;
|
||||
int maximumRecords = maximumRecordsInit;
|
||||
long timeout = timeoutInit;
|
||||
mainloop: while (timeout > 0 && maximumRecords > 0) {
|
||||
long st = System.currentTimeMillis();
|
||||
RSSFeed feed;
|
||||
try {
|
||||
feed = loadSRURSS(urlBase, query, timeout, startRecord, recordsPerSession, verify, global);
|
||||
} catch (IOException e1) {
|
||||
break mainloop;
|
||||
}
|
||||
if (feed == null || feed.isEmpty()) break mainloop;
|
||||
maximumRecords -= feed.size();
|
||||
innerloop: while (!feed.isEmpty()) {
|
||||
message = feed.pollMessage();
|
||||
if (message == null) break innerloop;
|
||||
try {
|
||||
queue.put(message);
|
||||
} catch (InterruptedException e) {
|
||||
break innerloop;
|
||||
}
|
||||
}
|
||||
startRecord += recordsPerSession;
|
||||
timeout -= System.currentTimeMillis() - st;
|
||||
}
|
||||
try { queue.put(RSSMessage.POISON); } catch (InterruptedException e) {}
|
||||
}
|
||||
};
|
||||
job.start();
|
||||
return job;
|
||||
}
|
||||
|
||||
/**
|
||||
* send a query to a yacy public search interface
|
||||
* @param rssSearchServiceURL the target url base (everything before the ? that follows the SRU request syntax properties). can be null, then the local peer is used
|
||||
* @param query the query as string
|
||||
* @param startRecord number of first record
|
||||
* @param maximumRecords maximum number of records
|
||||
* @param verify if true, result entries are verified using the snippet fetch (slow); if false simply the result is returned
|
||||
* @param global if true also search results from other peers are included
|
||||
* @param timeout milliseconds that are waited at maximum for a search result
|
||||
* @return
|
||||
*/
|
||||
public static RSSFeed loadSRURSS(
|
||||
String rssSearchServiceURL,
|
||||
String query,
|
||||
long timeout,
|
||||
int startRecord,
|
||||
int maximumRecords,
|
||||
boolean verify,
|
||||
boolean global) throws IOException {
|
||||
MultiProtocolURI uri = null;
|
||||
try {
|
||||
uri = new MultiProtocolURI(rssSearchServiceURL);
|
||||
} catch (MalformedURLException e) {
|
||||
throw new IOException("cora.Search failed asking peer '" + rssSearchServiceURL + "': bad url, " + e.getMessage());
|
||||
}
|
||||
|
||||
// send request
|
||||
try {
|
||||
final LinkedHashMap<String,ContentBody> parts = new LinkedHashMap<String,ContentBody>();
|
||||
parts.put("query", new StringBody(query));
|
||||
parts.put("startRecord", new StringBody(Integer.toString(startRecord)));
|
||||
parts.put("maximumRecords", new StringBody(Long.toString(maximumRecords)));
|
||||
parts.put("verify", new StringBody(verify ? "true" : "false"));
|
||||
parts.put("resource", new StringBody(global ? "global" : "local"));
|
||||
final byte[] result = HTTPConnector.getConnector(MultiProtocolURI.yacybotUserAgent).post(new MultiProtocolURI(rssSearchServiceURL), (int) timeout, uri.getHost(), parts);
|
||||
//String debug = new String(result); System.out.println("*** DEBUG: " + debug);
|
||||
final RSSReader reader = RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, result);
|
||||
if (reader == null) {
|
||||
throw new IOException("cora.Search failed asking peer '" + uri.getHost() + "': probably bad response from remote peer (1), reader == null");
|
||||
}
|
||||
final RSSFeed feed = reader.getFeed();
|
||||
if (feed == null) {
|
||||
// case where the rss reader does not understand the content
|
||||
throw new IOException("cora.Search failed asking peer '" + uri.getHost() + "': probably bad response from remote peer (2)");
|
||||
}
|
||||
return feed;
|
||||
} catch (final IOException e) {
|
||||
throw new IOException("cora.Search error asking peer '" + uri.getHost() + "':" + e.toString());
|
||||
}
|
||||
}
|
||||
|
||||
public static Thread accumulateGeneric(
|
||||
String query,
|
||||
String service,
|
||||
final Map<MultiProtocolURI, List<Integer>> result,
|
||||
final int timeout) {
|
||||
query = query.replace(' ', '+');
|
||||
final String servicePatched = service.replaceAll("\\$", query);
|
||||
Thread t = new Thread() {
|
||||
public void run() {
|
||||
try {
|
||||
MultiProtocolURI[] sr = loadGeneric(new MultiProtocolURI(servicePatched), timeout);
|
||||
int p = 1;
|
||||
for (MultiProtocolURI u: sr) {
|
||||
List<Integer> m = result.get(u);
|
||||
if (m == null) m = new ArrayList<Integer>();
|
||||
m.add(new Integer(p++));
|
||||
result.put(u, m);
|
||||
}
|
||||
} catch (MalformedURLException e) {
|
||||
e.printStackTrace();
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
};
|
||||
t.start();
|
||||
return t;
|
||||
}
|
||||
|
||||
private static MultiProtocolURI[] loadGeneric(MultiProtocolURI uri, long timeout) throws IOException {
|
||||
final RequestHeader requestHeader = new RequestHeader();
|
||||
requestHeader.put(HeaderFramework.USER_AGENT, MultiProtocolURI.yacybotUserAgent);
|
||||
final HTTPClient client = new HTTPClient();
|
||||
client.setTimout((int) timeout);
|
||||
client.setHeader(requestHeader.entrySet());
|
||||
byte[] result = client.GETbytes(uri.toString());
|
||||
client.finish();
|
||||
if (client.getStatusCode() != 200) {
|
||||
throw new IOException("Server returned status: " + client.getHttpResponse().getStatusLine());
|
||||
}
|
||||
if (result == null) throw new IOException("cora.Search error asking peer '" + uri.getHost() + "': null");
|
||||
LinkExtractor le = new LinkExtractor(Pattern.compile(".*" + uri.getHost() + ".*"));
|
||||
le.scrape(new String(result));
|
||||
MultiProtocolURI[] links = le.getLinks();
|
||||
return links;
|
||||
}
|
||||
|
||||
public static RSSFeed links2feed(Set<MultiProtocolURI> links, String source) {
|
||||
RSSFeed feed = new RSSFeed(Integer.MAX_VALUE);
|
||||
String u;
|
||||
RSSMessage message;
|
||||
for (MultiProtocolURI uri: links) {
|
||||
u = uri.toNormalform(true, false);
|
||||
message = new RSSMessage(u, "", u);
|
||||
message.setAuthor(source);
|
||||
feed.addMessage(message);
|
||||
}
|
||||
return feed;
|
||||
}
|
||||
|
||||
private Map<MultiProtocolURI, List<Integer>> result;
|
||||
private String query;
|
||||
private int count;
|
||||
private String[] yacyServices, rssServices, genericServices;
|
||||
private List<Thread> threads;
|
||||
|
||||
/**
 * Create a search thread that accumulates results from several services.
 * Nothing is requested here; the requests are started in run().
 * @param query the query string that is sent to all services
 * @param count handed over to accumulateSRURSS as maximum number of records
 * @param rssServices url bases of the SRU RSS services that are queried with accumulateSRURSS
 * @param genericServices url patterns of the services that are queried with accumulateGeneric
 */
public Search(String query, int count, String[] rssServices, String[] genericServices) {
    this.result = new ConcurrentHashMap<MultiProtocolURI, List<Integer>>();
    this.query = query;
    this.count = count;
    // there is no constructor argument for the yacyServices field; the former statement
    // "this.yacyServices = yacyServices" assigned the field to itself and had no effect
    this.rssServices = rssServices;
    this.genericServices = genericServices;
    this.threads = new ArrayList<Thread>();
}
|
||||
|
||||
public void run() {
|
||||
for (String service: this.rssServices) threads.add(accumulateSRURSS(service, this.query, 10000, this.count, false, true, this.result));
|
||||
for (String service: this.genericServices) threads.add(accumulateGeneric(this.query, service, this.result, 10000));
|
||||
}
|
||||
|
||||
public ScoreMap<MultiProtocolURI> getResults() {
|
||||
ScoreMap<MultiProtocolURI> scores = new ScoreMap<MultiProtocolURI>();
|
||||
int m = this.rssServices.length + this.genericServices.length;
|
||||
for (Map.Entry<MultiProtocolURI, List<Integer>> entry: this.result.entrySet()) {
|
||||
int a = 0;
|
||||
for (Integer i : entry.getValue()) a += i.intValue();
|
||||
scores.inc(entry.getKey(), a * m / entry.getValue().size());
|
||||
}
|
||||
return scores;
|
||||
}
|
||||
|
||||
public void waitTermination() {
|
||||
for (Thread t: threads) try {t.join();} catch (InterruptedException e) {}
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (String s: args) sb.append(s).append(' ');
|
||||
String query = sb.toString().trim();
|
||||
Search search = new Search(query, 100, SRURSSServicesList, genericServicesList);
|
||||
search.start();
|
||||
try {Thread.sleep(100);} catch (InterruptedException e1) {}
|
||||
search.waitTermination();
|
||||
ScoreMap<MultiProtocolURI> result = search.getResults();
|
||||
Iterator<MultiProtocolURI> i = result.keys(true);
|
||||
MultiProtocolURI u;
|
||||
while (i.hasNext()) {
|
||||
u = i.next();
|
||||
System.out.println("[" + result.get(u) + "] " + u.toNormalform(true, false));
|
||||
}
|
||||
try {HTTPClient.closeConnectionManager();} catch (InterruptedException e) {}
|
||||
}
|
||||
}
|
40
source/net/yacy/cora/services/SearchAccumulator.java
Normal file
40
source/net/yacy/cora/services/SearchAccumulator.java
Normal file
|
@ -0,0 +1,40 @@
|
|||
/**
|
||||
* SearchAccumulator
|
||||
* Copyright 2010 by Michael Peter Christen
|
||||
* First released 07.01.2011 at http://yacy.net
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with this program in the file lgpl21.txt
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
package net.yacy.cora.services;
|
||||
|
||||
/**
 * Placeholder interface that provides a common declared type for the accumulation
 * threads of a search. It combines Runnable with the two life-cycle methods
 * join() and isAlive().
 */
public interface SearchAccumulator extends Runnable {

    /**
     * Join this accumulator, that means: wait until it terminates.
     * @throws InterruptedException
     */
    void join() throws InterruptedException;

    /**
     * Test if the accumulator is still running.
     * @return true if the accumulator is still running
     */
    boolean isAlive();

}
|
164
source/net/yacy/cora/services/SearchHub.java
Normal file
164
source/net/yacy/cora/services/SearchHub.java
Normal file
|
@ -0,0 +1,164 @@
|
|||
/**
|
||||
* SearchHub
|
||||
* Copyright 2010 by Michael Peter Christen
|
||||
* First released 25.05.2010 at http://yacy.net
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with this program in the file lgpl21.txt
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
package net.yacy.cora.services;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
|
||||
import net.yacy.cora.document.RSSMessage;
|
||||
import net.yacy.cora.protocol.http.HTTPClient;
|
||||
import net.yacy.cora.storage.ScoreMap;
|
||||
|
||||
public class SearchHub {
|
||||
|
||||
private static final String[] SRURSSServicesList = {
|
||||
"http://yacy.dyndns.org:8000/yacysearch.rss",
|
||||
"http://yacy.caloulinux.net:8085/yacysearch.rss",
|
||||
"http://algire.dyndns.org:8085/yacysearch.rss",
|
||||
"http://breyvogel.dyndns.org:8002/yacysearch.rss"
|
||||
};
|
||||
|
||||
public final static SearchHub EMPTY = new SearchHub("", 0);
|
||||
|
||||
private String query;
|
||||
private int timeout;
|
||||
private List<SearchAccumulator> threads;
|
||||
private Map<RSSMessage, List<Integer>> result;
|
||||
|
||||
public SearchHub(final String query, final int timeout) {
|
||||
this.query = query;
|
||||
this.timeout = timeout;
|
||||
this.threads = new ArrayList<SearchAccumulator>();
|
||||
this.result = new ConcurrentHashMap<RSSMessage, List<Integer>>();
|
||||
}
|
||||
|
||||
/**
|
||||
* get the result of the accumulation
|
||||
* @return
|
||||
*/
|
||||
public Map<RSSMessage, List<Integer>> getAccumulation() {
|
||||
return this.result;
|
||||
}
|
||||
|
||||
/**
|
||||
* add an accumulator to the list of accumulation theads.
|
||||
* this is mainly used for awaitTermination() and isTerminated()
|
||||
* @param a
|
||||
*/
|
||||
public void addAccumulator(SearchAccumulator a) {
|
||||
this.threads.add(a);
|
||||
}
|
||||
|
||||
/**
|
||||
* get the original query string
|
||||
* @return
|
||||
*/
|
||||
public String getQuery() {
|
||||
return this.query;
|
||||
}
|
||||
|
||||
/**
|
||||
* get the given time-out of the search request
|
||||
* @return
|
||||
*/
|
||||
public int getTimeout() {
|
||||
return this.timeout;
|
||||
}
|
||||
|
||||
/**
|
||||
* get the list of search results as scored map.
|
||||
* The results are combined using their appearance positions.
|
||||
* Every time this method is called the list is re-computed to reflect the latest results
|
||||
* @return a score map of urls
|
||||
*/
|
||||
public ScoreMap<String> getResults() {
|
||||
ScoreMap<String> scores = new ScoreMap<String>();
|
||||
int m = threads.size();
|
||||
for (Map.Entry<RSSMessage, List<Integer>> entry: this.result.entrySet()) {
|
||||
int a = 0;
|
||||
for (Integer i : entry.getValue()) a += i.intValue();
|
||||
scores.inc(entry.getKey().getLink(), a * m / entry.getValue().size());
|
||||
}
|
||||
return scores;
|
||||
}
|
||||
|
||||
/**
|
||||
* wait until all accumulation threads have terminated
|
||||
*/
|
||||
public void waitTermination() {
|
||||
for (SearchAccumulator t: threads) try {t.join();} catch (InterruptedException e) {}
|
||||
}
|
||||
|
||||
/**
|
||||
* return true if all accumulation threads have terminated
|
||||
* @return
|
||||
*/
|
||||
public boolean isTerminated() {
|
||||
for (SearchAccumulator t: threads) if (t.isAlive()) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* return a hash code of the search hub.
|
||||
* This is computed using only the query string because that identifies the object
|
||||
*/
|
||||
public int hashCode() {
|
||||
return query.hashCode();
|
||||
}
|
||||
|
||||
/**
|
||||
* test method to add a list of SRU RSS services.
|
||||
* such services are provided by YaCy peers
|
||||
* @param search
|
||||
* @param rssServices
|
||||
* @param count
|
||||
* @param verify
|
||||
* @param global
|
||||
*/
|
||||
public static void addSRURSSServices(SearchHub search, String[] rssServices, int count, boolean verify, boolean global) {
|
||||
for (String service: rssServices) {
|
||||
SearchSRURSS accumulator = new SearchSRURSS(search, service, count, verify, global);
|
||||
accumulator.start();
|
||||
search.addAccumulator(accumulator);
|
||||
}
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (String s: args) sb.append(s).append(' ');
|
||||
String query = sb.toString().trim();
|
||||
SearchHub search = new SearchHub(query, 10000);
|
||||
addSRURSSServices(search, SRURSSServicesList, 100, false, false);
|
||||
try {Thread.sleep(100);} catch (InterruptedException e1) {}
|
||||
search.waitTermination();
|
||||
ScoreMap<String> result = search.getResults();
|
||||
Iterator<String> i = result.keys(true);
|
||||
String u;
|
||||
while (i.hasNext()) {
|
||||
u = i.next();
|
||||
System.out.println("[" + result.get(u) + "] " + u);
|
||||
}
|
||||
try {HTTPClient.closeConnectionManager();} catch (InterruptedException e) {}
|
||||
}
|
||||
}
|
201
source/net/yacy/cora/services/SearchSRURSS.java
Normal file
201
source/net/yacy/cora/services/SearchSRURSS.java
Normal file
|
@ -0,0 +1,201 @@
|
|||
/**
|
||||
* SearchSRURSS
|
||||
* Copyright 2010 by Michael Peter Christen
|
||||
* First released 06.01.2011 at http://yacy.net
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with this program in the file lgpl21.txt
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
package net.yacy.cora.services;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.MalformedURLException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.BlockingQueue;
|
||||
import java.util.concurrent.LinkedBlockingQueue;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import org.apache.http.entity.mime.content.ContentBody;
|
||||
import org.apache.http.entity.mime.content.StringBody;
|
||||
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.cora.document.RSSFeed;
|
||||
import net.yacy.cora.document.RSSMessage;
|
||||
import net.yacy.cora.document.RSSReader;
|
||||
import net.yacy.cora.protocol.http.HTTPConnector;
|
||||
|
||||
public class SearchSRURSS extends Thread implements SearchAccumulator {
|
||||
|
||||
private final static int recordsPerSession = 10;
|
||||
|
||||
final String urlBase;
|
||||
final String query;
|
||||
final long timeoutInit;
|
||||
final int maximumRecordsInit;
|
||||
final boolean verify;
|
||||
final boolean global;
|
||||
final Map<RSSMessage, List<Integer>> result;
|
||||
|
||||
private final BlockingQueue<RSSMessage> results;
|
||||
|
||||
/**
 * Create an accumulator for one SRU RSS service. The search is not started here,
 * it is done in run().
 * @param result the map where run() records the positions of the result messages
 * @param query the query string
 * @param timeoutInit time-out in milliseconds; used for the search and for polling the result queue
 * @param urlBase the url base of the SRU RSS service
 * @param maximumRecordsInit the maximum number of records that is handed over to searchSRURSS
 * @param verify handed over to searchSRURSS
 * @param global handed over to searchSRURSS
 */
public SearchSRURSS(
        final Map<RSSMessage, List<Integer>> result,
        final String query,
        final long timeoutInit,
        final String urlBase,
        final int maximumRecordsInit,
        final boolean verify,
        final boolean global) {
    // request properties
    this.urlBase = urlBase;
    this.query = query;
    this.timeoutInit = timeoutInit;
    this.maximumRecordsInit = maximumRecordsInit;
    this.verify = verify;
    this.global = global;
    // target of the accumulation and the queue between loader and accumulator
    this.result = result;
    this.results = new LinkedBlockingQueue<RSSMessage>();
}
|
||||
|
||||
/**
 * Create an accumulator for one SRU RSS service that takes the accumulation map,
 * the query string and the time-out from a search hub.
 * @param search the hub that provides accumulation map, query and time-out
 * @param urlBase the url base of the SRU RSS service
 * @param maximumRecordsInit the maximum number of records that is handed over to searchSRURSS
 * @param verify handed over to searchSRURSS
 * @param global handed over to searchSRURSS
 */
public SearchSRURSS(
        final SearchHub search,
        final String urlBase,
        final int maximumRecordsInit,
        final boolean verify,
        final boolean global) {
    this(
        search.getAccumulation(),
        search.getQuery(),
        search.getTimeout(),
        urlBase,
        maximumRecordsInit,
        verify,
        global);
}
|
||||
|
||||
public void run() {
|
||||
searchSRURSS(results, urlBase, query, timeoutInit, maximumRecordsInit, verify, global);
|
||||
int p = 1;
|
||||
RSSMessage message;
|
||||
try {
|
||||
while ((message = results.poll(timeoutInit, TimeUnit.MILLISECONDS)) != RSSMessage.POISON) {
|
||||
if (message == null) break;
|
||||
List<Integer> m = result.get(message.getLink());
|
||||
if (m == null) m = new ArrayList<Integer>();
|
||||
m.add(new Integer(p++));
|
||||
result.put(message, m);
|
||||
}
|
||||
} catch (InterruptedException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
public static Thread searchSRURSS(
|
||||
final BlockingQueue<RSSMessage> queue,
|
||||
final String urlBase,
|
||||
final String query,
|
||||
final long timeoutInit,
|
||||
final int maximumRecordsInit,
|
||||
final boolean verify,
|
||||
final boolean global) {
|
||||
Thread job = new Thread() {
|
||||
public void run() {
|
||||
int startRecord = 0;
|
||||
RSSMessage message;
|
||||
int maximumRecords = maximumRecordsInit;
|
||||
long timeout = timeoutInit;
|
||||
mainloop: while (timeout > 0 && maximumRecords > 0) {
|
||||
long st = System.currentTimeMillis();
|
||||
RSSFeed feed;
|
||||
try {
|
||||
feed = loadSRURSS(urlBase, query, timeout, startRecord, recordsPerSession, verify, global);
|
||||
} catch (IOException e1) {
|
||||
break mainloop;
|
||||
}
|
||||
if (feed == null || feed.isEmpty()) break mainloop;
|
||||
maximumRecords -= feed.size();
|
||||
innerloop: while (!feed.isEmpty()) {
|
||||
message = feed.pollMessage();
|
||||
if (message == null) break innerloop;
|
||||
try {
|
||||
queue.put(message);
|
||||
} catch (InterruptedException e) {
|
||||
break innerloop;
|
||||
}
|
||||
}
|
||||
startRecord += recordsPerSession;
|
||||
timeout -= System.currentTimeMillis() - st;
|
||||
}
|
||||
try { queue.put(RSSMessage.POISON); } catch (InterruptedException e) {}
|
||||
}
|
||||
};
|
||||
job.start();
|
||||
return job;
|
||||
}
|
||||
|
||||
/**
|
||||
* send a query to a yacy public search interface
|
||||
* @param rssSearchServiceURL the target url base (everything before the ? that follows the SRU request syntax properties). can null, then the local peer is used
|
||||
* @param query the query as string
|
||||
* @param startRecord number of first record
|
||||
* @param maximumRecords maximum number of records
|
||||
* @param verify if true, result entries are verified using the snippet fetch (slow); if false simply the result is returned
|
||||
* @param global if true also search results from other peers are included
|
||||
* @param timeout milliseconds that are waited at maximum for a search result
|
||||
* @return
|
||||
*/
|
||||
public static RSSFeed loadSRURSS(
|
||||
String rssSearchServiceURL,
|
||||
String query,
|
||||
long timeout,
|
||||
int startRecord,
|
||||
int maximumRecords,
|
||||
boolean verify,
|
||||
boolean global) throws IOException {
|
||||
MultiProtocolURI uri = null;
|
||||
try {
|
||||
uri = new MultiProtocolURI(rssSearchServiceURL);
|
||||
} catch (MalformedURLException e) {
|
||||
throw new IOException("cora.Search failed asking peer '" + rssSearchServiceURL + "': bad url, " + e.getMessage());
|
||||
}
|
||||
|
||||
// send request
|
||||
try {
|
||||
final LinkedHashMap<String,ContentBody> parts = new LinkedHashMap<String,ContentBody>();
|
||||
parts.put("query", new StringBody(query));
|
||||
parts.put("startRecord", new StringBody(Integer.toString(startRecord)));
|
||||
parts.put("maximumRecords", new StringBody(Long.toString(maximumRecords)));
|
||||
parts.put("verify", new StringBody(verify ? "true" : "false"));
|
||||
parts.put("resource", new StringBody(global ? "global" : "local"));
|
||||
final byte[] result = HTTPConnector.getConnector(MultiProtocolURI.yacybotUserAgent).post(new MultiProtocolURI(rssSearchServiceURL), (int) timeout, uri.getHost(), parts);
|
||||
//String debug = new String(result); System.out.println("*** DEBUG: " + debug);
|
||||
final RSSReader reader = RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, result);
|
||||
if (reader == null) {
|
||||
throw new IOException("cora.Search failed asking peer '" + uri.getHost() + "': probably bad response from remote peer (1), reader == null");
|
||||
}
|
||||
final RSSFeed feed = reader.getFeed();
|
||||
if (feed == null) {
|
||||
// case where the rss reader does not understand the content
|
||||
throw new IOException("cora.Search failed asking peer '" + uri.getHost() + "': probably bad response from remote peer (2)");
|
||||
}
|
||||
return feed;
|
||||
} catch (final IOException e) {
|
||||
throw new IOException("cora.Search error asking peer '" + uri.getHost() + "':" + e.toString());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -166,8 +166,7 @@ public class SnippetExtractor {
|
|||
assert maxpos >= minpos;
|
||||
final int newlen = Math.max(10, maxpos - minpos + 10);
|
||||
final int around = (maxLength - newlen) / 2;
|
||||
assert minpos - around < sentence.length() : "maxpos = " + maxpos + ", minpos = " + minpos + ", around = " + around + ", sentence.length() = " + sentence.length();
|
||||
//assert ((maxpos + around) <= sentence.length()) && ((maxpos + around) <= sentence.length()) : "maxpos = " + maxpos + ", minpos = " + minpos + ", around = " + around + ", sentence.length() = " + sentence.length();
|
||||
assert minpos - around < sentence.length() : "maxpos = " + maxpos + ", minpos = " + minpos + ", around = " + around + ", sentence.length() = " + sentence.length(); //maxpos = 435, minpos = 17, around = -124, sentence.length() = 44
|
||||
sentence = "[..] " + sentence.substring(minpos - around, ((maxpos + around) > sentence.length()) ? sentence.length() : (maxpos + around)).trim() + " [..]";
|
||||
minpos = around;
|
||||
maxpos = sentence.length() - around - 5;
|
||||
|
|
Loading…
Reference in New Issue
Block a user