add search result heuristic: add a depth-1 crawl job for every displayed search result (crawling every externally linked page of the displayed result pages)

This commit is contained in:
reger 2012-07-01 00:12:20 +02:00
parent 03280fb161
commit 067728bccc
5 changed files with 124 additions and 9 deletions
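For reference, the two configuration switches introduced by this commit, both defaulting to false and normally toggled on the ConfigHeuristics_p.html page rather than edited by hand. A minimal sketch of an enabled, local-crawl-only setup in the settings file:

heuristic.searchresults = true
heuristic.searchresults.crawlglobal = false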

View File

@ -1015,6 +1015,8 @@ about.body =
# search heuristics
heuristic.site = false
heuristic.blekko = false
heuristic.searchresults = false
heuristic.searchresults.crawlglobal = false
# colours for generic design
color_background = #FFFFFF
@ -1054,7 +1056,7 @@ federated.service.solr.indexing.commitWithinMs = 1000
federated.service.solr.indexing.sharding = MODULO_HOST_MD5
federated.service.solr.indexing.schemefile = solr.keys.default.list
# the lazy attribute causes that fields containing "" or 0 are not added and not written
federated.service.solr.indexing.lazy = true
federated.service.solr.indexing.lazy = true
# the indexing engine in YaCy can be switched off or on
# (off may make sense if federated.service.solr.indexing.enabled = true)
@ -1095,4 +1097,4 @@ interaction.dontimportbookmarks =
interaction.autocrawler.enabled = false
interaction.autocrawler.domainfilter = .*
interaction.autocrawler.categoryfilter = .*
interaction.autocrawler.categoryfilter = .*

View File

@ -43,6 +43,34 @@
</p>
</fieldset>
</form>
<form id="HeuristicFormSearchResult" method="post" action="ConfigHeuristics_p.html" enctype="multipart/form-data" accept-charset="UTF-8">
<fieldset>
<table>
<tr>
<td>
<legend>
<input type="checkbox" name="searchresult_check" id="searchresult" onclick="window.location.href='ConfigHeuristics_p.html?#(searchresult.checked)#searchresult_on=::searchresult_off=#(/searchresult.checked)#'" value="searchresult"#(searchresult.checked)#:: checked="checked"#(/searchresult.checked)# />
<label for="searchresult">search-result: shallow crawl on all displayed search results</label>
</legend>
</td>
<td>
<legend>
<input type="checkbox" name="searchresultglobal_check" id="searchresultglobal" onclick="window.location.href='ConfigHeuristics_p.html?#(searchresultglobal.checked)#searchresultglobal_on=::searchresultglobal_off=#(/searchresultglobal.checked)#'" value="siteresultglobal"#(searchresultglobal.checked)#:: checked="checked"#(/searchresultglobal.checked)# />
<label for="searchresultglobal">add as global crawl job</label>
</legend>
</td>
</tr>
</table>
<p>
When a search is made, all displayed result links are crawled with a depth-1 crawl.
This means that right after the search request, every result page is loaded together with every page that is linked on it.
If you check 'add as global crawl job', the pages to be crawled are added to the global crawl queue (remote peers can pick up pages to be crawled).
The default is to add the links to the local crawl queue (your peer crawls the linked pages).
</p>
</fieldset>
</form>
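<!-- Illustrative note, not part of the committed template: the #(searchresult.checked)# ... :: ... #(/searchresult.checked)#
     conditionals are filled by ConfigHeuristics_p.java, which sets prop.put("searchresult.checked", 0 or 1) from the stored
     heuristic.searchresults value; when the value is 1 the second branch is rendered, so checked="checked" appears and the
     checkbox reflects the saved setting (this assumes YaCy's usual 0/1 template conditional behaviour). -->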
<form id="HeuristicFormBlekko" method="post" action="ConfigHeuristics_p.html" enctype="multipart/form-data" accept-charset="UTF-8">
<fieldset>
<legend>
@ -55,8 +83,7 @@
</p>
</fieldset>
</form>
#%env/templates/footer.template%#
</body>
</html>

View File

@ -45,13 +45,19 @@ public class ConfigHeuristics_p {
if (post.containsKey("site_on")) sb.setConfig("heuristic.site", true);
if (post.containsKey("site_off")) sb.setConfig("heuristic.site", false);
if (post.containsKey("searchresult_on")) sb.setConfig("heuristic.searchresults", true);
if (post.containsKey("searchresult_off")) sb.setConfig("heuristic.searchresults", false);
if (post.containsKey("searchresultglobal_on")) sb.setConfig("heuristic.searchresults.crawlglobal", true);
if (post.containsKey("searchresultglobal_off")) sb.setConfig("heuristic.searchresults.crawlglobal", false);
if (post.containsKey("blekko_on")) sb.setConfig("heuristic.blekko", true);
if (post.containsKey("blekko_off")) sb.setConfig("heuristic.blekko", false);
}
prop.put("site.checked", sb.getConfigBool("heuristic.site", false) ? 1 : 0);
prop.put("searchresult.checked", sb.getConfigBool("heuristic.searchresults", false) ? 1 : 0);
prop.put("searchresultglobal.checked", sb.getConfigBool("heuristic.searchresults.crawlglobal", false) ? 1 : 0);
prop.put("blekko.checked", sb.getConfigBool("heuristic.blekko", false) ? 1 : 0);
return prop;
}
}

View File

@ -248,6 +248,7 @@ public class yacysearchitem {
prop.put("content_loc_lat", result.lat());
prop.put("content_loc_lon", result.lon());
}
if (sb.getConfigBool("heuristic.searchresults",false)) sb.heuristicSearchResults(resultUrlstring);
theQuery.transmitcount = item + 1;
return prop;
}

View File

@ -160,6 +160,7 @@ import de.anomic.crawler.CrawlQueues;
import de.anomic.crawler.CrawlStacker;
import de.anomic.crawler.CrawlSwitchboard;
import de.anomic.crawler.NoticedURL;
import de.anomic.crawler.NoticedURL.StackType;
import de.anomic.crawler.ResourceObserver;
import de.anomic.crawler.ResultImages;
import de.anomic.crawler.ResultURLs;
@ -2573,7 +2574,7 @@ public final class Switchboard extends serverSwitch
"denied by profile rule, process case="
+ processCase
+ ", profile name = "
+ queueEntry.profile().name());
+ queueEntry.profile().name());
return;
}
@ -2785,6 +2786,44 @@ public final class Switchboard extends serverSwitch
}.start();
}
/**
* add url to Crawler - which itself loads the URL, parses the content and adds it to the index
* transparent alternative to "addToIndex": includes the crawler's duplicate check and shows the URL in the crawl monitor,
* but doesn't return results for an ongoing search
*
* @param url the url that shall be indexed
* @param asglobal true adds the url to the global crawl queue (for remote crawling), false to the local crawler
*/
public void addToCrawler(final DigestURI url, final boolean asglobal) {
if ( this.index.exists(url.hash()) ) {
return; // don't do double-work
}
final Request request = this.loader.request(url, true, true);
final CrawlProfile profile = sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
final String acceptedError = this.crawlStacker.checkAcceptance(url, profile, 0);
if (acceptedError != null) {
this.log.logInfo("addToCrawler: cannot load "
+ url.toNormalform(false, false)
+ ": "
+ acceptedError);
return;
}
final String s;
if (asglobal) {
s = sb.crawlQueues.noticeURL.push(StackType.GLOBAL, request);
} else {
s = sb.crawlQueues.noticeURL.push(StackType.LOCAL, request);
}
if (s != null) {
Switchboard.this.log.logInfo("addToCrawler: failed to add "
+ url.toNormalform(false, false)
+ ": "
+ s);
}
}
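// Usage sketch (illustrative, not part of this commit): push a single external link found on a
// search result page to the local crawl queue instead of indexing it directly:
//   addToCrawler(new DigestURI("http://example.net/linked-page.html"), false);
// the URL is hypothetical; heuristicSearchResults() below does this for every external link of a result page.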
public class receiptSending implements Runnable
{
private final Seed initiatorPeer;
@ -3125,8 +3164,8 @@ public final class Switchboard extends serverSwitch
final Map<MultiProtocolURI, String> links;
searchEvent.getRankingResult().oneFeederStarted();
try {
links = Switchboard.this.loader.loadLinks(url, CacheStrategy.NOCACHE);
try {
links = Switchboard.this.loader.loadLinks(url, CacheStrategy.NOCACHE);
if ( links != null ) {
final Iterator<MultiProtocolURI> i = links.keySet().iterator();
while ( i.hasNext() ) {
@ -3139,7 +3178,7 @@ public final class Switchboard extends serverSwitch
addAllToIndex(url, links, searchEvent, "site");
}
} catch ( final Throwable e ) {
Log.logException(e);
Log.logException(e);
} finally {
searchEvent.getRankingResult().oneFeederTerminated();
}
@ -3147,6 +3186,46 @@ public final class Switchboard extends serverSwitch
}.start();
}
public final void heuristicSearchResults(final String host) {
new Thread() {
@Override
public void run() {
// load the search result page (the 'host' parameter is a full URL string) and collect its links
final DigestURI startUrl;
try {
startUrl = new DigestURI(host);
} catch (final MalformedURLException e) {
Log.logException(e);
return;
}
final Map<MultiProtocolURI, String> links;
DigestURI url;
try {
links = Switchboard.this.loader.loadLinks(startUrl, CacheStrategy.IFFRESH);
if (links != null) {
if (links.size() < 1000) { // limit to 1000 to skip large index pages
final Iterator<MultiProtocolURI> i = links.keySet().iterator();
final boolean globalcrawljob = sb.getConfigBool("heuristic.searchresults.crawlglobal",false);
while (i.hasNext()) {
url = new DigestURI(i.next());
boolean islocal = url.getHost().contentEquals(startUrl.getHost());
// add all external links or links to different page to crawler
if ( !islocal ) {// || (!startUrl.getPath().endsWith(url.getPath()))) {
addToCrawler(url,globalcrawljob);
}
}
}
}
} catch (final Throwable e) {
Log.logException(e);
}
}
}.start();
}
// blekko pattern: http://blekko.com/ws/$+/rss
public final void heuristicRSS(
final String urlpattern,