diff --git a/defaults/yacy.init b/defaults/yacy.init index 7ff695c60..fc5c7af5b 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -919,6 +919,7 @@ about.body = # search heuristics heuristic.site = false heuristic.scroogle = false +heuristic.blekko = false # colours for generic design color_background = #FFFFFF diff --git a/htroot/ConfigHeuristics_p.html b/htroot/ConfigHeuristics_p.html index 8aec6a5c2..90f96655f 100644 --- a/htroot/ConfigHeuristics_p.html +++ b/htroot/ConfigHeuristics_p.html @@ -43,11 +43,12 @@

+
- +

When using this heuristic, then every search request line is used for a call to scroogle. @@ -56,6 +57,19 @@

+
+
+ + + + +

+ When using this heuristic, then every search request line is used for a call to blekko. + 20 results are taken from blekko and loaded simultaneously, parsed and indexed immediately. +

+
+
+ #%env/templates/footer.template%# diff --git a/htroot/ConfigHeuristics_p.java b/htroot/ConfigHeuristics_p.java index 7a89a7bb5..7d62fec95 100644 --- a/htroot/ConfigHeuristics_p.java +++ b/htroot/ConfigHeuristics_p.java @@ -47,10 +47,13 @@ public class ConfigHeuristics_p { if (post.containsKey("site_off")) sb.setConfig("heuristic.site", false); if (post.containsKey("scroogle_on")) sb.setConfig("heuristic.scroogle", true); if (post.containsKey("scroogle_off")) sb.setConfig("heuristic.scroogle", false); + if (post.containsKey("blekko_on")) sb.setConfig("heuristic.blekko", true); + if (post.containsKey("blekko_off")) sb.setConfig("heuristic.blekko", false); } prop.put("site.checked", sb.getConfigBool("heuristic.site", false) ? 1 : 0); prop.put("scroogle.checked", sb.getConfigBool("heuristic.scroogle", false) ? 1 : 0); + prop.put("blekko.checked", sb.getConfigBool("heuristic.blekko", false) ? 1 : 0); return prop; } diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index a85ec6fa2..825e69dc4 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -287,20 +287,20 @@ public class yacysearch { final RankingProfile ranking = sb.getRanking(); - if (querystring.indexOf("NEAR") >= 0) { - querystring = querystring.replace("NEAR", ""); + if (querystring.indexOf("/near") >= 0) { + querystring = querystring.replace("/near", ""); ranking.coeff_worddistance = RankingProfile.COEFF_MAX; } - if (querystring.indexOf("RECENT") >= 0) { - querystring = querystring.replace("RECENT", ""); + if (querystring.indexOf("/date") >= 0) { + querystring = querystring.replace("/date", ""); ranking.coeff_date = RankingProfile.COEFF_MAX; } - int lrp = querystring.indexOf("LANGUAGE:"); + int lrp = querystring.indexOf("/language/"); String lr = ""; if (lrp >= 0) { if (querystring.length() >= (lrp + 11)) lr = querystring.substring(lrp + 9, lrp + 11); - querystring = querystring.replace("LANGUAGE:" + lr, ""); + querystring = querystring.replace("/language/" + lr, ""); lr = 
lr.toLowerCase(); } int inurl = querystring.indexOf("inurl:"); @@ -347,11 +347,16 @@ public class yacysearch { sitehash = DigestURI.domhash(sitehost); } - int heuristic = querystring.indexOf("heuristic:scroogle"); - if (heuristic >= 0) { + int heuristicScroogle = querystring.indexOf("heuristic:scroogle"); + if (heuristicScroogle >= 0) { querystring = querystring.replace("heuristic:scroogle", ""); } + int heuristicBlekko = querystring.indexOf("heuristic:blekko"); + if (heuristicBlekko >= 0) { + querystring = querystring.replace("heuristic:blekko", ""); + } + int authori = querystring.indexOf("author:"); String authorhash = null; if (authori >= 0) { @@ -525,8 +530,11 @@ public class yacysearch { final SearchEvent theSearch = SearchEventCache.getEvent(theQuery, sb.peers, sb.crawlResults, (sb.isRobinsonMode()) ? sb.clusterhashes : null, false, sb.loader); try {Thread.sleep(global ? 100 : 10);} catch (InterruptedException e1) {} // wait a little time to get first results in the search - if (sitehost != null && sb.getConfigBool("heuristic.site", false) && authenticated) sb.heuristicSite(theSearch, sitehost); - if ((heuristic >= 0 || sb.getConfigBool("heuristic.scroogle", false)) && authenticated) sb.heuristicScroogle(theSearch); + if (offset == 0) { + if (sitehost != null && sb.getConfigBool("heuristic.site", false) && authenticated) sb.heuristicSite(theSearch, sitehost); + if ((heuristicScroogle >= 0 || sb.getConfigBool("heuristic.scroogle", false)) && authenticated) sb.heuristicScroogle(theSearch); + if ((heuristicBlekko >= 0 || sb.getConfigBool("heuristic.blekko", false)) && authenticated) sb.heuristicRSS("http://blekko.com/ws/$+/rss", theSearch, "blekko"); + } // generate result object //serverLog.logFine("LOCAL_SEARCH", "SEARCH TIME AFTER ORDERING OF SEARCH RESULTS: " + (System.currentTimeMillis() - timestamp) + " ms"); diff --git a/source/de/anomic/search/RankingProcess.java b/source/de/anomic/search/RankingProcess.java index f3dd579d8..2a7259378 100644 --- 
a/source/de/anomic/search/RankingProcess.java +++ b/source/de/anomic/search/RankingProcess.java @@ -616,6 +616,7 @@ public final class RankingProcess extends Thread { int ic = count; while (ic-- > 0 && i.hasNext()) { word = i.next(); + if (word == null) continue; termHash = Word.word2hash(word); c = this.query.getSegment().termIndex().count(termHash); if (c > 0) { diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java index b0c6c0b62..253f5e577 100644 --- a/source/de/anomic/search/Switchboard.java +++ b/source/de/anomic/search/Switchboard.java @@ -76,7 +76,9 @@ import java.util.zip.ZipEntry; import java.util.zip.ZipInputStream; import net.yacy.cora.document.MultiProtocolURI; +import net.yacy.cora.document.RSSFeed; import net.yacy.cora.document.RSSMessage; +import net.yacy.cora.document.RSSReader; import net.yacy.cora.protocol.ConnectionInfo; import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.HeaderFramework; @@ -865,6 +867,7 @@ public final class Switchboard extends serverSwitch { // remove heuristics setConfig("heuristic.site", false); setConfig("heuristic.scroogle", false); + setConfig("heuristic.blekko", false); // relocate this.crawlQueues.relocate(this.queuesRoot); // cannot be closed because the busy threads are working with that object @@ -2320,7 +2323,7 @@ public final class Switchboard extends serverSwitch { } }.start(); } - + public final void heuristicScroogle(final SearchEvent searchEvent) { new Thread() { @Override @@ -2336,6 +2339,7 @@ public final class Switchboard extends serverSwitch { try { url = new DigestURI(MultiProtocolURI.unescape(urlString)); } catch (MalformedURLException e1) { + Log.logWarning("heuristicScroogle", "url not well-formed: '" + urlString + "'"); return; } @@ -2357,6 +2361,59 @@ public final class Switchboard extends serverSwitch { }.start(); } + // blekko pattern: http://blekko.com/ws/$+/rss + public final void heuristicRSS(final String urlpattern, final SearchEvent 
searchEvent, final String feedName) { + final int p = urlpattern.indexOf('$'); + if (p < 0) return; + new Thread() { + @Override + public void run() { + String query = searchEvent.getQuery().queryString(true); + int meta = query.indexOf("heuristic:"); + if (meta >= 0) { + final int q = query.indexOf(' ', meta); + if (q >= 0) query = query.substring(0, meta) + query.substring(q + 1); else query = query.substring(0, meta); + } + + final String urlString = urlpattern.substring(0, p) + query.trim().replaceAll(" ", "+") + urlpattern.substring(p + 1); + final DigestURI url; + try { + url = new DigestURI(MultiProtocolURI.unescape(urlString)); + } catch (MalformedURLException e1) { + Log.logWarning("heuristicRSS", "url not well-formed: '" + urlString + "'"); + return; + } + + // if we have an url then try to load the rss + RSSReader rss = null; + try { + Response response = sb.loader.load(sb.loader.request(url, true, false), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE); + byte[] resource = response == null ? null : response.getContent(); + //System.out.println("BLEKKO: " + new String(resource)); + rss = resource == null ? 
null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource); + } catch (IOException e) { + Log.logException(e); + } + if (rss == null) { + Log.logInfo("heuristicRSS", "rss result not parsed from " + feedName); + return; + } + + final Map links = new TreeMap(); + MultiProtocolURI uri; + for (RSSMessage message: rss.getFeed()) try { + uri = new MultiProtocolURI(message.getLink()); + links.put(uri, message.getTitle()); + } catch (MalformedURLException e) { + } + + Log.logInfo("heuristicRSS", "Heuristic: adding " + links.size() + " links from '" + feedName + "' rss feed"); + // add all pages to the index + addAllToIndex(null, links, searchEvent, feedName); + } + }.start(); + } + public int currentPPM() { return EventTracker.countEvents(EventTracker.EClass.INDEX, 20000) * 3; } diff --git a/source/net/yacy/cora/document/RSSReader.java b/source/net/yacy/cora/document/RSSReader.java index 78ef85565..e4555cda4 100644 --- a/source/net/yacy/cora/document/RSSReader.java +++ b/source/net/yacy/cora/document/RSSReader.java @@ -94,7 +94,7 @@ public class RSSReader extends DefaultHandler { if (a.length < 100) { throw new IOException("response=" + new String(a)); } - if (!equals(a, "