- added http://blekko.com as a search heuristic (like scroogle). This was easy since blekko also delivers its search results as an RSS feed

- renamed YaCy's search result modifier keywords RECENT, NEAR and LANGUAGE: to follow the blekko slashtag naming scheme. YaCy now supports the following blekko-like built-in slashtags:
/date
 - for search results ordered by date (most recent up)
 /near
 - for search results where search words appear near to each other (closest up)
 /language/<lang>
 - for sorting by language, where results in the wanted language are ranked up. Example: /language/de
  

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7350 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
orbiter 2010-11-29 18:08:20 +00:00
parent a9f754c45f
commit cc6499bf8d
7 changed files with 97 additions and 13 deletions

View File

@ -919,6 +919,7 @@ about.body =
# search heuristics
heuristic.site = false
heuristic.scroogle = false
heuristic.blekko = false
# colours for generic design
color_background = #FFFFFF

View File

@ -43,11 +43,12 @@
</p>
</fieldset>
</form>
<form id="HeuristicFormScroogle" method="post" action="ConfigHeuristics_p.html" enctype="multipart/form-data" accept-charset="UTF-8">
<fieldset>
<legend>
<input type="checkbox" name="scroogle_check" id="scroogle" onclick="window.location.href='ConfigHeuristics_p.html?#(scroogle.checked)#scroogle_on=::scroogle_off=#(/scroogle.checked)#'" value="scroogle"#(scroogle.checked)#:: checked="checked"#(/scroogle.checked)# />
<label for="scroogle">scroogle: load external search result list</label>
<label for="scroogle">scroogle: load external search result list from <a href="http://scroogle.org">scroogle</a></label>
</legend>
<p>
When using this heuristic, then every search request line is used for a call to scroogle.
@ -56,6 +57,19 @@
</fieldset>
</form>
<!-- blekko heuristic switch: the checkbox id must match the "for" attribute of its label -->
<form id="HeuristicFormBlekko" method="post" action="ConfigHeuristics_p.html" enctype="multipart/form-data" accept-charset="UTF-8">
<fieldset>
<legend>
<input type="checkbox" name="blekko_check" id="blekko" onclick="window.location.href='ConfigHeuristics_p.html?#(blekko.checked)#blekko_on=::blekko_off=#(/blekko.checked)#'" value="blekko"#(blekko.checked)#:: checked="checked"#(/blekko.checked)# />
<label for="blekko">blekko: load external search result list from <a href="http://blekko.com">blekko</a></label>
</legend>
<p>
When using this heuristic, then every search request line is used for a call to blekko.
20 results are taken from blekko and loaded simultaneously, parsed and indexed immediately.
</p>
</fieldset>
</form>
#%env/templates/footer.template%#
</body>

View File

@ -47,10 +47,13 @@ public class ConfigHeuristics_p {
if (post.containsKey("site_off")) sb.setConfig("heuristic.site", false);
if (post.containsKey("scroogle_on")) sb.setConfig("heuristic.scroogle", true);
if (post.containsKey("scroogle_off")) sb.setConfig("heuristic.scroogle", false);
if (post.containsKey("blekko_on")) sb.setConfig("heuristic.blekko", true);
if (post.containsKey("blekko_off")) sb.setConfig("heuristic.blekko", false);
}
prop.put("site.checked", sb.getConfigBool("heuristic.site", false) ? 1 : 0);
prop.put("scroogle.checked", sb.getConfigBool("heuristic.scroogle", false) ? 1 : 0);
prop.put("blekko.checked", sb.getConfigBool("heuristic.blekko", false) ? 1 : 0);
return prop;
}

View File

@ -287,20 +287,20 @@ public class yacysearch {
final RankingProfile ranking = sb.getRanking();
if (querystring.indexOf("NEAR") >= 0) {
querystring = querystring.replace("NEAR", "");
if (querystring.indexOf("/near") >= 0) {
querystring = querystring.replace("/near", "");
ranking.coeff_worddistance = RankingProfile.COEFF_MAX;
}
if (querystring.indexOf("RECENT") >= 0) {
querystring = querystring.replace("RECENT", "");
if (querystring.indexOf("/date") >= 0) {
querystring = querystring.replace("/date", "");
ranking.coeff_date = RankingProfile.COEFF_MAX;
}
int lrp = querystring.indexOf("LANGUAGE:");
int lrp = querystring.indexOf("/language/");
String lr = "";
if (lrp >= 0) {
// the two-letter language code starts right after the 10-character "/language/" prefix
if (querystring.length() >= (lrp + 12)) {
    lr = querystring.substring(lrp + 10, lrp + 12);
}
querystring = querystring.replace("LANGUAGE:" + lr, "");
querystring = querystring.replace("/language/" + lr, "");
lr = lr.toLowerCase();
}
int inurl = querystring.indexOf("inurl:");
@ -347,11 +347,16 @@ public class yacysearch {
sitehash = DigestURI.domhash(sitehost);
}
int heuristic = querystring.indexOf("heuristic:scroogle");
if (heuristic >= 0) {
int heuristicScroogle = querystring.indexOf("heuristic:scroogle");
if (heuristicScroogle >= 0) {
querystring = querystring.replace("heuristic:scroogle", "");
}
int heuristicBlekko = querystring.indexOf("heuristic:blekko");
if (heuristicBlekko >= 0) {
querystring = querystring.replace("heuristic:blekko", "");
}
int authori = querystring.indexOf("author:");
String authorhash = null;
if (authori >= 0) {
@ -525,8 +530,11 @@ public class yacysearch {
final SearchEvent theSearch = SearchEventCache.getEvent(theQuery, sb.peers, sb.crawlResults, (sb.isRobinsonMode()) ? sb.clusterhashes : null, false, sb.loader);
try {Thread.sleep(global ? 100 : 10);} catch (InterruptedException e1) {} // wait a little time to get first results in the search
if (sitehost != null && sb.getConfigBool("heuristic.site", false) && authenticated) sb.heuristicSite(theSearch, sitehost);
if ((heuristic >= 0 || sb.getConfigBool("heuristic.scroogle", false)) && authenticated) sb.heuristicScroogle(theSearch);
if (offset == 0) {
if (sitehost != null && sb.getConfigBool("heuristic.site", false) && authenticated) sb.heuristicSite(theSearch, sitehost);
if ((heuristicScroogle >= 0 || sb.getConfigBool("heuristic.scroogle", false)) && authenticated) sb.heuristicScroogle(theSearch);
if ((heuristicBlekko >= 0 || sb.getConfigBool("heuristic.blekko", false)) && authenticated) sb.heuristicRSS("http://blekko.com/ws/$+/rss", theSearch, "blekko");
}
// generate result object
//serverLog.logFine("LOCAL_SEARCH", "SEARCH TIME AFTER ORDERING OF SEARCH RESULTS: " + (System.currentTimeMillis() - timestamp) + " ms");

View File

@ -616,6 +616,7 @@ public final class RankingProcess extends Thread {
int ic = count;
while (ic-- > 0 && i.hasNext()) {
word = i.next();
if (word == null) continue;
termHash = Word.word2hash(word);
c = this.query.getSegment().termIndex().count(termHash);
if (c > 0) {

View File

@ -76,7 +76,9 @@ import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.RSSFeed;
import net.yacy.cora.document.RSSMessage;
import net.yacy.cora.document.RSSReader;
import net.yacy.cora.protocol.ConnectionInfo;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.HeaderFramework;
@ -865,6 +867,7 @@ public final class Switchboard extends serverSwitch {
// remove heuristics
setConfig("heuristic.site", false);
setConfig("heuristic.scroogle", false);
setConfig("heuristic.blekko", false);
// relocate
this.crawlQueues.relocate(this.queuesRoot); // cannot be closed because the busy threads are working with that object
@ -2320,7 +2323,7 @@ public final class Switchboard extends serverSwitch {
}
}.start();
}
public final void heuristicScroogle(final SearchEvent searchEvent) {
new Thread() {
@Override
@ -2336,6 +2339,7 @@ public final class Switchboard extends serverSwitch {
try {
url = new DigestURI(MultiProtocolURI.unescape(urlString));
} catch (MalformedURLException e1) {
Log.logWarning("heuristicScroogle", "url not well-formed: '" + urlString + "'");
return;
}
@ -2357,6 +2361,59 @@ public final class Switchboard extends serverSwitch {
}.start();
}
/**
 * Heuristic: ask an external search service that delivers its results as an RSS feed
 * and hand every result link to the indexer.
 * The work is done in a newly started thread; this method returns immediately.
 *
 * @param urlpattern  feed url with one '$' placeholder that is replaced by the search words
 *                    joined with '+'; blekko pattern: http://blekko.com/ws/$+/rss
 * @param searchEvent the search event whose query string is sent to the feed and which is
 *                    passed on to addAllToIndex together with the collected links
 * @param feedName    short name of the feed; used in log messages and passed to addAllToIndex
 */
public final void heuristicRSS(final String urlpattern, final SearchEvent searchEvent, final String feedName) {
    final int p = urlpattern.indexOf('$');
    if (p < 0) {
        // without a placeholder there is no place to put the query
        Log.logWarning("heuristicRSS", "url pattern for '" + feedName + "' has no '$' placeholder: '" + urlpattern + "'");
        return;
    }

    new Thread() {
        @Override
        public void run() {
            String query = searchEvent.getQuery().queryString(true);

            // cut every "heuristic:<name>" modifier out of the query, not only the first one
            int meta;
            while ((meta = query.indexOf("heuristic:")) >= 0) {
                final int q = query.indexOf(' ', meta);
                query = (q >= 0) ? query.substring(0, meta) + query.substring(q + 1) : query.substring(0, meta);
            }
            query = query.trim();
            if (query.length() == 0) {
                // nothing to search for; do not send a request with an empty query
                Log.logInfo("heuristicRSS", "no search words left for '" + feedName + "', request skipped");
                return;
            }

            // collapse any run of whitespace into a single '+' separator
            final String urlString = urlpattern.substring(0, p) + query.replaceAll("\\s+", "+") + urlpattern.substring(p + 1);
            final DigestURI url;
            try {
                url = new DigestURI(MultiProtocolURI.unescape(urlString));
            } catch (MalformedURLException e1) {
                Log.logWarning("heuristicRSS", "url not well-formed: '" + urlString + "'");
                return;
            }

            // if we have an url then try to load the rss
            RSSReader rss = null;
            try {
                final Response response = sb.loader.load(sb.loader.request(url, true, false), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
                final byte[] resource = response == null ? null : response.getContent();
                rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource);
            } catch (IOException e) {
                Log.logException(e);
            }
            if (rss == null) {
                Log.logInfo("heuristicRSS", "rss result not parsed from " + feedName);
                return;
            }

            // collect link -> title of every feed item; items with unusable links are skipped
            final Map<MultiProtocolURI, String> links = new TreeMap<MultiProtocolURI, String>();
            for (final RSSMessage message : rss.getFeed()) {
                final String link = message.getLink();
                if (link == null) {
                    continue; // defensive: an item without a link cannot be loaded
                }
                try {
                    links.put(new MultiProtocolURI(link), message.getTitle());
                } catch (MalformedURLException e) {
                    Log.logInfo("heuristicRSS", "ignoring malformed link from '" + feedName + "': '" + link + "'");
                }
            }

            Log.logInfo("heuristicRSS", "Heuristic: adding " + links.size() + " links from '" + feedName + "' rss feed");
            // add all pages to the index
            addAllToIndex(null, links, searchEvent, feedName);
        }
    }.start();
}
/**
 * Computes the current indexing speed figure from the INDEX event counter.
 *
 * @return the number of INDEX events reported by EventTracker for the value 20000, times three
 */
public int currentPPM() {
    // NOTE(review): 20000 is presumably a time window in milliseconds, so that the
    // factor 3 scales the count to one minute; confirm against EventTracker.countEvents
    final int indexEvents = EventTracker.countEvents(EventTracker.EClass.INDEX, 20000);
    return indexEvents * 3;
}

View File

@ -94,7 +94,7 @@ public class RSSReader extends DefaultHandler {
if (a.length < 100) {
throw new IOException("response=" + new String(a));
}
if (!equals(a, "<?xml".getBytes())) {
if (!equals(a, "<?xml".getBytes()) && !equals(a, "<rss".getBytes())) {
throw new IOException("response does not contain valid xml");
}
final String end = new String(a, a.length - 80, 80);