add search result heuristic: add a depth-1 crawl job for every displayed search result (crawling every externally linked page of the displayed result pages)

This commit is contained in:
reger 2012-07-01 00:12:20 +02:00
parent 03280fb161
commit 067728bccc
5 changed files with 124 additions and 9 deletions
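For reference, the two configuration switches introduced by this commit, both defaulting to false and normally toggled on the ConfigHeuristics_p.html page rather than edited by hand. A minimal sketch of an enabled, local-crawl-only setup in the settings file:

heuristic.searchresults = true
heuristic.searchresults.crawlglobal = false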

View File

@ -1015,6 +1015,8 @@ about.body =
# search heuristics
heuristic.site = false
heuristic.blekko = false
heuristic.searchresults = false
heuristic.searchresults.crawlglobal = false
# colours for generic design
color_background = #FFFFFF
@ -1054,7 +1056,7 @@ federated.service.solr.indexing.commitWithinMs = 1000
federated.service.solr.indexing.sharding = MODULO_HOST_MD5
federated.service.solr.indexing.schemefile = solr.keys.default.list
# the lazy attribute causes that fields containing "" or 0 are not added and not written
federated.service.solr.indexing.lazy = true
federated.service.solr.indexing.lazy = true
# the indexing engine in YaCy can be switched off or on
# (off may make sense if federated.service.solr.indexing.enabled = true)
@ -1095,4 +1097,4 @@ interaction.dontimportbookmarks =
interaction.autocrawler.enabled = false
interaction.autocrawler.domainfilter = .*
interaction.autocrawler.categoryfilter = .*
interaction.autocrawler.categoryfilter = .*

View File

@ -43,6 +43,34 @@
</p>
</fieldset>
</form>
<form id="HeuristicFormSearchResult" method="post" action="ConfigHeuristics_p.html" enctype="multipart/form-data" accept-charset="UTF-8">
<fieldset>
<table>
<tr>
<td>
<legend>
<input type="checkbox" name="searchresult_check" id="searchresult" onclick="window.location.href='ConfigHeuristics_p.html?#(searchresult.checked)#searchresult_on=::searchresult_off=#(/searchresult.checked)#'" value="searchresult"#(searchresult.checked)#:: checked="checked"#(/searchresult.checked)# />
<label for="searchresult">search-result: shallow crawl on all displayed search results</label>
</legend>
</td>
<td>
<legend>
<input type="checkbox" name="searchresultglobal_check" id="searchresultglobal" onclick="window.location.href='ConfigHeuristics_p.html?#(searchresultglobal.checked)#searchresultglobal_on=::searchresultglobal_off=#(/searchresultglobal.checked)#'" value="siteresultglobal"#(searchresultglobal.checked)#:: checked="checked"#(/searchresultglobal.checked)# />
<label for="searchresultglobal">add as global crawl job</label>
</legend>
</td>
</tr>
</table>
<p>
When a search is made, all displayed result links are crawled with a depth-1 crawl.
This means that right after the search request, every result page is loaded together with every page that is linked on it.
If you check 'add as global crawl job', the pages to be crawled are added to the global crawl queue (remote peers can pick up pages to be crawled).
The default is to add the links to the local crawl queue (your peer crawls the linked pages).
</p>
</fieldset>
</form>
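<!-- Illustrative note, not part of the committed template: the #(searchresult.checked)# ... :: ... #(/searchresult.checked)#
     conditionals are filled by ConfigHeuristics_p.java, which sets prop.put("searchresult.checked", 0 or 1) from the stored
     heuristic.searchresults value; when the value is 1 the second branch is rendered, so checked="checked" appears and the
     checkbox reflects the saved setting (this assumes YaCy's usual 0/1 template conditional behaviour). -->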
<form id="HeuristicFormBlekko" method="post" action="ConfigHeuristics_p.html" enctype="multipart/form-data" accept-charset="UTF-8">
<fieldset>
<legend>
@ -55,8 +83,7 @@
</p>
</fieldset>
</form>
#%env/templates/footer.template%#
</body>
</html>

View File

@ -45,13 +45,19 @@ public class ConfigHeuristics_p {
if (post.containsKey("site_on")) sb.setConfig("heuristic.site", true);
if (post.containsKey("site_off")) sb.setConfig("heuristic.site", false);
if (post.containsKey("searchresult_on")) sb.setConfig("heuristic.searchresults", true);
if (post.containsKey("searchresult_off")) sb.setConfig("heuristic.searchresults", false);
if (post.containsKey("searchresultglobal_on")) sb.setConfig("heuristic.searchresults.crawlglobal", true);
if (post.containsKey("searchresultglobal_off")) sb.setConfig("heuristic.searchresults.crawlglobal", false);
if (post.containsKey("blekko_on")) sb.setConfig("heuristic.blekko", true);
if (post.containsKey("blekko_off")) sb.setConfig("heuristic.blekko", false);
}
prop.put("site.checked", sb.getConfigBool("heuristic.site", false) ? 1 : 0);
prop.put("searchresult.checked", sb.getConfigBool("heuristic.searchresults", false) ? 1 : 0);
prop.put("searchresultglobal.checked", sb.getConfigBool("heuristic.searchresults.crawlglobal", false) ? 1 : 0);
prop.put("blekko.checked", sb.getConfigBool("heuristic.blekko", false) ? 1 : 0);
return prop;
}
}

View File

@ -248,6 +248,7 @@ public class yacysearchitem {
prop.put("content_loc_lat", result.lat());
prop.put("content_loc_lon", result.lon());
}
if (sb.getConfigBool("heuristic.searchresults",false)) sb.heuristicSearchResults(resultUrlstring);
theQuery.transmitcount = item + 1;
return prop;
}

View File

@ -160,6 +160,7 @@ import de.anomic.crawler.CrawlQueues;
import de.anomic.crawler.CrawlStacker;
import de.anomic.crawler.CrawlSwitchboard;
import de.anomic.crawler.NoticedURL;
import de.anomic.crawler.NoticedURL.StackType;
import de.anomic.crawler.ResourceObserver;
import de.anomic.crawler.ResultImages;
import de.anomic.crawler.ResultURLs;
@ -2573,7 +2574,7 @@ public final class Switchboard extends serverSwitch
"denied by profile rule, process case="
+ processCase
+ ", profile name = "
+ queueEntry.profile().name());
+ queueEntry.profile().name());
return;
}
@ -2785,6 +2786,44 @@ public final class Switchboard extends serverSwitch
}.start();
}
/**
* add url to Crawler - which itself loads the URL, parses the content and adds it to the index
* transparent alternative to "addToIndex": includes the crawler's duplicate check and shows the URL in the crawl monitor,
* but doesn't return results for an ongoing search
*
* @param url the url that shall be indexed
* @param asglobal true adds the url to the global crawl queue (for remote crawling), false to the local crawler
*/
public void addToCrawler(final DigestURI url, final boolean asglobal) {
if ( this.index.exists(url.hash()) ) {
return; // don't do double-work
}
final Request request = this.loader.request(url, true, true);
final CrawlProfile profile = sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
final String acceptedError = this.crawlStacker.checkAcceptance(url, profile, 0);
if (acceptedError != null) {
this.log.logInfo("addToCrawler: cannot load "
+ url.toNormalform(false, false)
+ ": "
+ acceptedError);
return;
}
final String s;
if (asglobal) {
s = sb.crawlQueues.noticeURL.push(StackType.GLOBAL, request);
} else {
s = sb.crawlQueues.noticeURL.push(StackType.LOCAL, request);
}
if (s != null) {
Switchboard.this.log.logInfo("addToCrawler: failed to add "
+ url.toNormalform(false, false)
+ ": "
+ s);
}
}
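// Usage sketch (illustrative, not part of this commit): push a single external link found on a
// search result page to the local crawl queue instead of indexing it directly:
//   addToCrawler(new DigestURI("http://example.net/linked-page.html"), false);
// the URL is hypothetical; heuristicSearchResults() below does this for every external link of a result page.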
public class receiptSending implements Runnable
{
private final Seed initiatorPeer;
@ -3125,8 +3164,8 @@ public final class Switchboard extends serverSwitch
final Map<MultiProtocolURI, String> links;
searchEvent.getRankingResult().oneFeederStarted();
try {
links = Switchboard.this.loader.loadLinks(url, CacheStrategy.NOCACHE);
try {
links = Switchboard.this.loader.loadLinks(url, CacheStrategy.NOCACHE);
if ( links != null ) {
final Iterator<MultiProtocolURI> i = links.keySet().iterator();
while ( i.hasNext() ) {
@ -3139,7 +3178,7 @@ public final class Switchboard extends serverSwitch
addAllToIndex(url, links, searchEvent, "site");
}
} catch ( final Throwable e ) {
Log.logException(e);
Log.logException(e);
} finally {
searchEvent.getRankingResult().oneFeederTerminated();
}
@ -3147,6 +3186,46 @@ public final class Switchboard extends serverSwitch
}.start();
}
public final void heuristicSearchResults(final String host) {
new Thread() {
@Override
public void run() {
// load the search result page (the 'host' parameter is a full URL string) and collect its links
final DigestURI startUrl;
try {
startUrl = new DigestURI(host);
} catch (final MalformedURLException e) {
Log.logException(e);
return;
}
final Map<MultiProtocolURI, String> links;
DigestURI url;
try {
links = Switchboard.this.loader.loadLinks(startUrl, CacheStrategy.IFFRESH);
if (links != null) {
if (links.size() < 1000) { // limit to 1000 to skip large index pages
final Iterator<MultiProtocolURI> i = links.keySet().iterator();
final boolean globalcrawljob = sb.getConfigBool("heuristic.searchresults.crawlglobal",false);
while (i.hasNext()) {
url = new DigestURI(i.next());
boolean islocal = url.getHost().contentEquals(startUrl.getHost());
// add all external links or links to different page to crawler
if ( !islocal ) {// || (!startUrl.getPath().endsWith(url.getPath()))) {
addToCrawler(url,globalcrawljob);
}
}
}
}
} catch (final Throwable e) {
Log.logException(e);
}
}
}.start();
}
// blekko pattern: http://blekko.com/ws/$+/rss
public final void heuristicRSS(
final String urlpattern,