mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
- added http://blekko.com as a search heuristic (like scroogle). This was easy since they also deliver their search results as an RSS feed
- renamed YaCy's search result modifier keywords RECENT, NEAR and LANGUAGE: to the blekko slashtag naming scheme. YaCy now supports the following blekko-like built-in slashtags: /date - for search results ordered by date (most recent first); /near - for search results where the search words appear near each other (closest first); /language/<lang> - for sorting by language, where the wanted language is ranked first. Example: /language/de git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7350 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
parent
a9f754c45f
commit
cc6499bf8d
|
@ -919,6 +919,7 @@ about.body =
|
|||
# search heuristics
|
||||
heuristic.site = false
|
||||
heuristic.scroogle = false
|
||||
heuristic.blekko = false
|
||||
|
||||
# colours for generic design
|
||||
color_background = #FFFFFF
|
||||
|
|
|
@ -43,11 +43,12 @@
|
|||
</p>
|
||||
</fieldset>
|
||||
</form>
|
||||
|
||||
<form id="HeuristicFormScroogle" method="post" action="ConfigHeuristics_p.html" enctype="multipart/form-data" accept-charset="UTF-8">
|
||||
<fieldset>
|
||||
<legend>
|
||||
<input type="checkbox" name="scroogle_check" id="scroogle" onclick="window.location.href='ConfigHeuristics_p.html?#(scroogle.checked)#scroogle_on=::scroogle_off=#(/scroogle.checked)#'" value="scroogle"#(scroogle.checked)#:: checked="checked"#(/scroogle.checked)# />
|
||||
<label for="scroogle">scroogle: load external search result list</label>
|
||||
<label for="scroogle">scroogle: load external search result list from <a href="http://scroogle.org">scroogle</a></label>
|
||||
</legend>
|
||||
<p>
|
||||
When using this heuristic, then every search request line is used for a call to scroogle.
|
||||
|
@ -56,6 +57,19 @@
|
|||
</fieldset>
|
||||
</form>
|
||||
|
||||
<form id="HeuristicFormBlekko" method="post" action="ConfigHeuristics_p.html" enctype="multipart/form-data" accept-charset="UTF-8">
|
||||
<fieldset>
|
||||
<legend>
|
||||
<!-- the id must be unique on the page and match the for="blekko" of the label below -->
<input type="checkbox" name="blekko_check" id="blekko" onclick="window.location.href='ConfigHeuristics_p.html?#(blekko.checked)#blekko_on=::blekko_off=#(/blekko.checked)#'" value="blekko"#(blekko.checked)#:: checked="checked"#(/blekko.checked)# />
|
||||
<label for="blekko">blekko: load external search result list from <a href="http://blekko.com">blekko</a></label>
|
||||
</legend>
|
||||
<p>
|
||||
When using this heuristic, then every search request line is used for a call to blekko.
|
||||
20 results are taken from blekko and loaded simultaneously, parsed and indexed immediately.
|
||||
</p>
|
||||
</fieldset>
|
||||
</form>
|
||||
|
||||
|
||||
#%env/templates/footer.template%#
|
||||
</body>
|
||||
|
|
|
@ -47,10 +47,13 @@ public class ConfigHeuristics_p {
|
|||
if (post.containsKey("site_off")) sb.setConfig("heuristic.site", false);
|
||||
if (post.containsKey("scroogle_on")) sb.setConfig("heuristic.scroogle", true);
|
||||
if (post.containsKey("scroogle_off")) sb.setConfig("heuristic.scroogle", false);
|
||||
if (post.containsKey("blekko_on")) sb.setConfig("heuristic.blekko", true);
|
||||
if (post.containsKey("blekko_off")) sb.setConfig("heuristic.blekko", false);
|
||||
}
|
||||
|
||||
prop.put("site.checked", sb.getConfigBool("heuristic.site", false) ? 1 : 0);
|
||||
prop.put("scroogle.checked", sb.getConfigBool("heuristic.scroogle", false) ? 1 : 0);
|
||||
prop.put("blekko.checked", sb.getConfigBool("heuristic.blekko", false) ? 1 : 0);
|
||||
|
||||
return prop;
|
||||
}
|
||||
|
|
|
@ -287,20 +287,20 @@ public class yacysearch {
|
|||
|
||||
final RankingProfile ranking = sb.getRanking();
|
||||
|
||||
if (querystring.indexOf("NEAR") >= 0) {
|
||||
querystring = querystring.replace("NEAR", "");
|
||||
if (querystring.indexOf("/near") >= 0) {
|
||||
querystring = querystring.replace("/near", "");
|
||||
ranking.coeff_worddistance = RankingProfile.COEFF_MAX;
|
||||
}
|
||||
if (querystring.indexOf("RECENT") >= 0) {
|
||||
querystring = querystring.replace("RECENT", "");
|
||||
if (querystring.indexOf("/date") >= 0) {
|
||||
querystring = querystring.replace("/date", "");
|
||||
ranking.coeff_date = RankingProfile.COEFF_MAX;
|
||||
}
|
||||
int lrp = querystring.indexOf("LANGUAGE:");
|
||||
int lrp = querystring.indexOf("/language/");
|
||||
String lr = "";
|
||||
if (lrp >= 0) {
|
||||
// The "/language/" prefix is 10 characters long, so the two-letter language code
// occupies [lrp + 10, lrp + 12). With the offsets of the old 9-character "LANGUAGE:"
// prefix, "/language/de" produced lr = "/d" and the modifier was never removed.
if (querystring.length() >= (lrp + 12)) {
    lr = querystring.substring(lrp + 10, lrp + 12);
}
|
||||
querystring = querystring.replace("LANGUAGE:" + lr, "");
|
||||
querystring = querystring.replace("/language/" + lr, "");
|
||||
lr = lr.toLowerCase();
|
||||
}
|
||||
int inurl = querystring.indexOf("inurl:");
|
||||
|
@ -347,11 +347,16 @@ public class yacysearch {
|
|||
sitehash = DigestURI.domhash(sitehost);
|
||||
}
|
||||
|
||||
int heuristic = querystring.indexOf("heuristic:scroogle");
|
||||
if (heuristic >= 0) {
|
||||
int heuristicScroogle = querystring.indexOf("heuristic:scroogle");
|
||||
if (heuristicScroogle >= 0) {
|
||||
querystring = querystring.replace("heuristic:scroogle", "");
|
||||
}
|
||||
|
||||
int heuristicBlekko = querystring.indexOf("heuristic:blekko");
|
||||
if (heuristicBlekko >= 0) {
|
||||
querystring = querystring.replace("heuristic:blekko", "");
|
||||
}
|
||||
|
||||
int authori = querystring.indexOf("author:");
|
||||
String authorhash = null;
|
||||
if (authori >= 0) {
|
||||
|
@ -525,8 +530,11 @@ public class yacysearch {
|
|||
final SearchEvent theSearch = SearchEventCache.getEvent(theQuery, sb.peers, sb.crawlResults, (sb.isRobinsonMode()) ? sb.clusterhashes : null, false, sb.loader);
|
||||
try {Thread.sleep(global ? 100 : 10);} catch (InterruptedException e1) {} // wait a little time to get first results in the search
|
||||
|
||||
if (sitehost != null && sb.getConfigBool("heuristic.site", false) && authenticated) sb.heuristicSite(theSearch, sitehost);
|
||||
if ((heuristic >= 0 || sb.getConfigBool("heuristic.scroogle", false)) && authenticated) sb.heuristicScroogle(theSearch);
|
||||
if (offset == 0) {
|
||||
if (sitehost != null && sb.getConfigBool("heuristic.site", false) && authenticated) sb.heuristicSite(theSearch, sitehost);
|
||||
if ((heuristicScroogle >= 0 || sb.getConfigBool("heuristic.scroogle", false)) && authenticated) sb.heuristicScroogle(theSearch);
|
||||
if ((heuristicBlekko >= 0 || sb.getConfigBool("heuristic.blekko", false)) && authenticated) sb.heuristicRSS("http://blekko.com/ws/$+/rss", theSearch, "blekko");
|
||||
}
|
||||
|
||||
// generate result object
|
||||
//serverLog.logFine("LOCAL_SEARCH", "SEARCH TIME AFTER ORDERING OF SEARCH RESULTS: " + (System.currentTimeMillis() - timestamp) + " ms");
|
||||
|
|
|
@ -616,6 +616,7 @@ public final class RankingProcess extends Thread {
|
|||
int ic = count;
|
||||
while (ic-- > 0 && i.hasNext()) {
|
||||
word = i.next();
|
||||
if (word == null) continue;
|
||||
termHash = Word.word2hash(word);
|
||||
c = this.query.getSegment().termIndex().count(termHash);
|
||||
if (c > 0) {
|
||||
|
|
|
@ -76,7 +76,9 @@ import java.util.zip.ZipEntry;
|
|||
import java.util.zip.ZipInputStream;
|
||||
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.cora.document.RSSFeed;
|
||||
import net.yacy.cora.document.RSSMessage;
|
||||
import net.yacy.cora.document.RSSReader;
|
||||
import net.yacy.cora.protocol.ConnectionInfo;
|
||||
import net.yacy.cora.protocol.Domains;
|
||||
import net.yacy.cora.protocol.HeaderFramework;
|
||||
|
@ -865,6 +867,7 @@ public final class Switchboard extends serverSwitch {
|
|||
// remove heuristics
|
||||
setConfig("heuristic.site", false);
|
||||
setConfig("heuristic.scroogle", false);
|
||||
setConfig("heuristic.blekko", false);
|
||||
|
||||
// relocate
|
||||
this.crawlQueues.relocate(this.queuesRoot); // cannot be closed because the busy threads are working with that object
|
||||
|
@ -2320,7 +2323,7 @@ public final class Switchboard extends serverSwitch {
|
|||
}
|
||||
}.start();
|
||||
}
|
||||
|
||||
|
||||
public final void heuristicScroogle(final SearchEvent searchEvent) {
|
||||
new Thread() {
|
||||
@Override
|
||||
|
@ -2336,6 +2339,7 @@ public final class Switchboard extends serverSwitch {
|
|||
try {
|
||||
url = new DigestURI(MultiProtocolURI.unescape(urlString));
|
||||
} catch (MalformedURLException e1) {
|
||||
Log.logWarning("heuristicScroogle", "url not well-formed: '" + urlString + "'");
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -2357,6 +2361,59 @@ public final class Switchboard extends serverSwitch {
|
|||
}.start();
|
||||
}
|
||||
|
||||
/**
 * Loads an RSS search-result feed for the query of the given search event and passes the
 * links found in that feed to {@code addAllToIndex}.
 *
 * The feed URL is built by replacing the first '$' in {@code urlpattern} with the query
 * string (spaces replaced by '+'). Example pattern for blekko:
 * {@code http://blekko.com/ws/$+/rss}
 *
 * If the pattern contains no '$' the method returns without doing anything. Otherwise all
 * work is done in a newly started thread, so this method returns immediately.
 *
 * @param urlpattern  feed URL template; its first '$' marks where the query is inserted
 * @param searchEvent supplies the query string and is forwarded to {@code addAllToIndex}
 * @param feedName    name used in log messages and forwarded to {@code addAllToIndex}
 */
|
||||
public final void heuristicRSS(final String urlpattern, final SearchEvent searchEvent, final String feedName) {
|
||||
// position of the query placeholder; only the first '$' is considered
final int p = urlpattern.indexOf('$');
|
||||
if (p < 0) return;
|
||||
new Thread() {
|
||||
@Override
|
||||
public void run() {
|
||||
String query = searchEvent.getQuery().queryString(true);
|
||||
// cut a "heuristic:..." token out of the query: everything from "heuristic:"
// up to and including the next space, or up to the end if no space follows
int meta = query.indexOf("heuristic:");
|
||||
if (meta >= 0) {
|
||||
final int q = query.indexOf(' ', meta);
|
||||
if (q >= 0) query = query.substring(0, meta) + query.substring(q + 1); else query = query.substring(0, meta);
|
||||
}
|
||||
|
||||
// substitute the placeholder with the trimmed query, spaces turned into '+'
// NOTE(review): no other characters are escaped here - confirm whether the query needs URL-encoding
final String urlString = urlpattern.substring(0, p) + query.trim().replaceAll(" ", "+") + urlpattern.substring(p + 1);
|
||||
final DigestURI url;
|
||||
try {
|
||||
url = new DigestURI(MultiProtocolURI.unescape(urlString));
|
||||
} catch (MalformedURLException e1) {
|
||||
Log.logWarning("heuristicRSS", "url not well-formed: '" + urlString + "'");
|
||||
return;
|
||||
}
|
||||
|
||||
// if we have an url then try to load the rss
|
||||
// rss stays null if loading fails, no content is returned, or an IOException is thrown
RSSReader rss = null;
|
||||
try {
|
||||
// loaded with the NOCACHE strategy; the meaning of the Long.MAX_VALUE argument is not visible here - TODO confirm
Response response = sb.loader.load(sb.loader.request(url, true, false), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
|
||||
byte[] resource = response == null ? null : response.getContent();
|
||||
//System.out.println("BLEKKO: " + new String(resource));
|
||||
rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource);
|
||||
} catch (IOException e) {
|
||||
Log.logException(e);
|
||||
}
|
||||
if (rss == null) {
|
||||
Log.logInfo("heuristicRSS", "rss result not parsed from " + feedName);
|
||||
return;
|
||||
}
|
||||
|
||||
// collect link -> title for every feed message whose link is a well-formed URL
final Map<MultiProtocolURI, String> links = new TreeMap<MultiProtocolURI, String>();
|
||||
MultiProtocolURI uri;
|
||||
for (RSSMessage message: rss.getFeed()) try {
|
||||
uri = new MultiProtocolURI(message.getLink());
|
||||
links.put(uri, message.getTitle());
|
||||
} catch (MalformedURLException e) {
|
||||
// NOTE(review): messages with a malformed link are skipped without any log output
}
|
||||
|
||||
Log.logInfo("heuristicRSS", "Heuristic: adding " + links.size() + " links from '" + feedName + "' rss feed");
|
||||
// add all pages to the index
|
||||
addAllToIndex(null, links, searchEvent, feedName);
|
||||
}
|
||||
}.start();
|
||||
}
|
||||
|
||||
public int currentPPM() {
|
||||
return EventTracker.countEvents(EventTracker.EClass.INDEX, 20000) * 3;
|
||||
}
|
||||
|
|
|
@ -94,7 +94,7 @@ public class RSSReader extends DefaultHandler {
|
|||
if (a.length < 100) {
|
||||
throw new IOException("response=" + new String(a));
|
||||
}
|
||||
if (!equals(a, "<?xml".getBytes())) {
|
||||
if (!equals(a, "<?xml".getBytes()) && !equals(a, "<rss".getBytes())) {
|
||||
throw new IOException("response does not contain valid xml");
|
||||
}
|
||||
final String end = new String(a, a.length - 80, 80);
|
||||
|
|
Loading…
Reference in New Issue
Block a user