diff --git a/defaults/yacy.init b/defaults/yacy.init index 84e4656a0..6479bd42e 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -1015,6 +1015,8 @@ about.body = # search heuristics heuristic.site = false heuristic.blekko = false +heuristic.searchresults = false +heuristic.searchresults.crawlglobal = false # colours for generic design color_background = #FFFFFF @@ -1054,7 +1056,7 @@ federated.service.solr.indexing.commitWithinMs = 1000 federated.service.solr.indexing.sharding = MODULO_HOST_MD5 federated.service.solr.indexing.schemefile = solr.keys.default.list # the lazy attribute causes that fields containing "" or 0 are not added and not written -federated.service.solr.indexing.lazy = true +federated.service.solr.indexing.lazy = true # the indexing engine in YaCy can be switched off or on # (off may make sense if federated.service.solr.indexing.enabled = true) @@ -1095,4 +1097,4 @@ interaction.dontimportbookmarks = interaction.autocrawler.enabled = false interaction.autocrawler.domainfilter = .* -interaction.autocrawler.categoryfilter = .* +interaction.autocrawler.categoryfilter = .* diff --git a/htroot/ConfigHeuristics_p.html b/htroot/ConfigHeuristics_p.html index 0778c9396..57fc7cd5e 100644 --- a/htroot/ConfigHeuristics_p.html +++ b/htroot/ConfigHeuristics_p.html @@ -43,6 +43,34 @@

+ +
+
+ + + + + +
+ + + + + + + + + +
+

+ When a search is made, all displayed result links are crawled with a depth-1 crawl. + This means: right after the search request, every result page is loaded, along with every page linked from it. + If you check 'add as global crawl job', the pages to be crawled are added to the global crawl queue (remote peers can pick up pages to be crawled). + The default is to add the links to the local crawl queue (your peer crawls the linked pages). +

+
+
+
@@ -55,8 +83,7 @@

- - + #%env/templates/footer.template%# diff --git a/htroot/ConfigHeuristics_p.java b/htroot/ConfigHeuristics_p.java index 3ea6031cd..4dc7ff57b 100644 --- a/htroot/ConfigHeuristics_p.java +++ b/htroot/ConfigHeuristics_p.java @@ -45,13 +45,19 @@ public class ConfigHeuristics_p { if (post.containsKey("site_on")) sb.setConfig("heuristic.site", true); if (post.containsKey("site_off")) sb.setConfig("heuristic.site", false); + if (post.containsKey("searchresult_on")) sb.setConfig("heuristic.searchresults", true); + if (post.containsKey("searchresult_off")) sb.setConfig("heuristic.searchresults", false); + if (post.containsKey("searchresultglobal_on")) sb.setConfig("heuristic.searchresults.crawlglobal", true); + if (post.containsKey("searchresultglobal_off")) sb.setConfig("heuristic.searchresults.crawlglobal", false); if (post.containsKey("blekko_on")) sb.setConfig("heuristic.blekko", true); if (post.containsKey("blekko_off")) sb.setConfig("heuristic.blekko", false); } prop.put("site.checked", sb.getConfigBool("heuristic.site", false) ? 1 : 0); + prop.put("searchresult.checked", sb.getConfigBool("heuristic.searchresults", false) ? 1 : 0); + prop.put("searchresultglobal.checked", sb.getConfigBool("heuristic.searchresults.crawlglobal", false) ? 1 : 0); prop.put("blekko.checked", sb.getConfigBool("heuristic.blekko", false) ? 
1 : 0); - + return prop; } } diff --git a/htroot/yacysearchitem.java b/htroot/yacysearchitem.java index d1fe7ec0f..3a69f2007 100644 --- a/htroot/yacysearchitem.java +++ b/htroot/yacysearchitem.java @@ -248,6 +248,7 @@ public class yacysearchitem { prop.put("content_loc_lat", result.lat()); prop.put("content_loc_lon", result.lon()); } + if (sb.getConfigBool("heuristic.searchresults",false)) sb.heuristicSearchResults(resultUrlstring); theQuery.transmitcount = item + 1; return prop; } diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 0167a2e38..40269af83 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -160,6 +160,7 @@ import de.anomic.crawler.CrawlQueues; import de.anomic.crawler.CrawlStacker; import de.anomic.crawler.CrawlSwitchboard; import de.anomic.crawler.NoticedURL; +import de.anomic.crawler.NoticedURL.StackType; import de.anomic.crawler.ResourceObserver; import de.anomic.crawler.ResultImages; import de.anomic.crawler.ResultURLs; @@ -2573,7 +2574,7 @@ public final class Switchboard extends serverSwitch "denied by profile rule, process case=" + processCase + ", profile name = " - + queueEntry.profile().name()); + + queueEntry.profile().name()); return; } @@ -2785,6 +2786,44 @@ public final class Switchboard extends serverSwitch }.start(); } + /** + * add url to Crawler - which itself loads the URL, parses the content and adds it to the index + * transparent alternative to "addToIndex" including, double in crawler check, display in crawl monitor + * but doesn't return results for a ongoing search + * + * @param url the url that shall be indexed + * @param asglobal true adds the url to global crawl queue (for remote crawling), false to the local crawler + */ + public void addToCrawler(final DigestURI url, final boolean asglobal) { + + if ( this.index.exists(url.hash()) ) { + return; // don't do double-work + } + final Request request = this.loader.request(url, 
true, true); + final CrawlProfile profile = sb.crawler.getActive(ASCII.getBytes(request.profileHandle())); + final String acceptedError = this.crawlStacker.checkAcceptance(url, profile, 0); + if (acceptedError != null) { + this.log.logInfo("addToCrawler: cannot load " + + url.toNormalform(false, false) + + ": " + + acceptedError); + return; + } + final String s; + if (asglobal) { + s = sb.crawlQueues.noticeURL.push(StackType.GLOBAL, request); + } else { + s = sb.crawlQueues.noticeURL.push(StackType.LOCAL, request); + } + + if (s != null) { + Switchboard.this.log.logInfo("addToCrawler: failed to add " + + url.toNormalform(false, false) + + ": " + + s); + } + } + public class receiptSending implements Runnable { private final Seed initiatorPeer; @@ -3125,8 +3164,8 @@ public final class Switchboard extends serverSwitch final Map links; searchEvent.getRankingResult().oneFeederStarted(); - try { - links = Switchboard.this.loader.loadLinks(url, CacheStrategy.NOCACHE); + try { + links = Switchboard.this.loader.loadLinks(url, CacheStrategy.NOCACHE); if ( links != null ) { final Iterator i = links.keySet().iterator(); while ( i.hasNext() ) { @@ -3139,7 +3178,7 @@ public final class Switchboard extends serverSwitch addAllToIndex(url, links, searchEvent, "site"); } } catch ( final Throwable e ) { - Log.logException(e); + Log.logException(e); } finally { searchEvent.getRankingResult().oneFeederTerminated(); } @@ -3147,6 +3186,46 @@ public final class Switchboard extends serverSwitch }.start(); } + public final void heuristicSearchResults(final String host) { + new Thread() { + + @Override + public void run() { + + // get the links for a specific site + final DigestURI startUrl; + try { + startUrl = new DigestURI(host); + } catch (final MalformedURLException e) { + Log.logException(e); + return; + } + + final Map links; + DigestURI url; + try { + links = Switchboard.this.loader.loadLinks(startUrl, CacheStrategy.IFFRESH); + if (links != null) { + if (links.size() < 1000) { // 
limit to 1000 to skip large index pages + final Iterator i = links.keySet().iterator(); + final boolean globalcrawljob = sb.getConfigBool("heuristic.searchresults.crawlglobal",false); + while (i.hasNext()) { + url = new DigestURI(i.next()); + boolean islocal = url.getHost().contentEquals(startUrl.getHost()); + // add all external links or links to different page to crawler + if ( !islocal ) {// || (!startUrl.getPath().endsWith(url.getPath()))) { + addToCrawler(url,globalcrawljob); + } + } + } + } + } catch (final Throwable e) { + Log.logException(e); + } + } + }.start(); + } + // blekko pattern: http://blekko.com/ws/$+/rss public final void heuristicRSS( final String urlpattern,