Mirror of https://github.com/yacy/yacy_search_server.git, synced 2024-09-19 00:01:41 +02:00
add search result heuristic: adds a depth-1 crawl job for every displayed search result (crawling every externally linked page of the displayed search result pages)
This commit is contained in:
parent 03280fb161
commit 067728bccc
@@ -1015,6 +1015,8 @@ about.body =
# search heuristics
heuristic.site = false
heuristic.blekko = false
heuristic.searchresults = false
heuristic.searchresults.crawlglobal = false

# colours for generic design
color_background = #FFFFFF
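The two new keys above default to off. As a point of reference, a minimal sketch of how such flags are read at runtime, using the getConfigBool accessor that appears in the servlet and Switchboard hunks further down (sb is the Switchboard instance, as elsewhere in this diff):

    // read the new heuristic switches with their shipped defaults
    final boolean resultHeuristic = sb.getConfigBool("heuristic.searchresults", false);
    final boolean crawlGlobal = sb.getConfigBool("heuristic.searchresults.crawlglobal", false);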
@@ -1054,7 +1056,7 @@ federated.service.solr.indexing.commitWithinMs = 1000
federated.service.solr.indexing.sharding = MODULO_HOST_MD5
federated.service.solr.indexing.schemefile = solr.keys.default.list
# the lazy attribute causes that fields containing "" or 0 are not added and not written
federated.service.solr.indexing.lazy = true

# the indexing engine in YaCy can be switched off or on
# (off may make sense if federated.service.solr.indexing.enabled = true)
@@ -1095,4 +1097,4 @@ interaction.dontimportbookmarks =

interaction.autocrawler.enabled = false
interaction.autocrawler.domainfilter = .*
interaction.autocrawler.categoryfilter = .*
@@ -43,6 +43,34 @@
</p>
</fieldset>
</form>

<form id="HeuristicFormSearchResult" method="post" action="ConfigHeuristics_p.html" enctype="multipart/form-data" accept-charset="UTF-8">
  <fieldset>
    <table>
      <tr>
        <td>
          <legend>
            <input type="checkbox" name="searchresult_check" id="searchresult" onclick="window.location.href='ConfigHeuristics_p.html?#(searchresult.checked)#searchresult_on=::searchresult_off=#(/searchresult.checked)#'" value="searchresult"#(searchresult.checked)#:: checked="checked"#(/searchresult.checked)# />
            <label for="searchresult">search-result: shallow crawl on all displayed search results</label>
          </legend>
        </td>
        <td>
          <legend>
            <input type="checkbox" name="searchresultglobal_check" id="searchresultglobal" onclick="window.location.href='ConfigHeuristics_p.html?#(searchresultglobal.checked)#searchresultglobal_on=::searchresultglobal_off=#(/searchresultglobal.checked)#'" value="searchresultglobal"#(searchresultglobal.checked)#:: checked="checked"#(/searchresultglobal.checked)# />
            <label for="searchresultglobal">add as global crawl job</label>
          </legend>
        </td>
      </tr>
    </table>
    <p>
      When a search is made, all displayed result links are crawled with a depth-1 crawl.
      This means that right after the search request, every result page is loaded, together with every page that is linked on that page.
      If you check 'add as global crawl job', the pages to be crawled are added to the global crawl queue (remote peers can pick up pages to be crawled).
      The default is to add the links to the local crawl queue (your peer crawls the linked pages).
    </p>
  </fieldset>
</form>
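The checkbox markup above uses YaCy's template conditional syntax: in #(key)#A::B#(/key)#, part A is rendered when the servlet puts 0 for key and part B when it puts 1, which is how the checked="checked" attribute and the _on=/_off= URL parameters are toggled. The matching servlet side is the prop.put("searchresult.checked", ...) call in the ConfigHeuristics_p.java hunk below.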

<form id="HeuristicFormBlekko" method="post" action="ConfigHeuristics_p.html" enctype="multipart/form-data" accept-charset="UTF-8">
  <fieldset>
    <legend>
@@ -55,8 +83,7 @@
    </p>
  </fieldset>
</form>


#%env/templates/footer.template%#
</body>
</html>
@@ -45,13 +45,19 @@ public class ConfigHeuristics_p {

            if (post.containsKey("site_on")) sb.setConfig("heuristic.site", true);
            if (post.containsKey("site_off")) sb.setConfig("heuristic.site", false);
            if (post.containsKey("searchresult_on")) sb.setConfig("heuristic.searchresults", true);
            if (post.containsKey("searchresult_off")) sb.setConfig("heuristic.searchresults", false);
            if (post.containsKey("searchresultglobal_on")) sb.setConfig("heuristic.searchresults.crawlglobal", true);
            if (post.containsKey("searchresultglobal_off")) sb.setConfig("heuristic.searchresults.crawlglobal", false);
            if (post.containsKey("blekko_on")) sb.setConfig("heuristic.blekko", true);
            if (post.containsKey("blekko_off")) sb.setConfig("heuristic.blekko", false);
        }

        prop.put("site.checked", sb.getConfigBool("heuristic.site", false) ? 1 : 0);
        prop.put("searchresult.checked", sb.getConfigBool("heuristic.searchresults", false) ? 1 : 0);
        prop.put("searchresultglobal.checked", sb.getConfigBool("heuristic.searchresults.crawlglobal", false) ? 1 : 0);
        prop.put("blekko.checked", sb.getConfigBool("heuristic.blekko", false) ? 1 : 0);

        return prop;
    }
}
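The on/off parameter pairs map directly onto the checkbox onclick handlers in the template above: ticking the search-result box requests ConfigHeuristics_p.html?searchresult_on=, which lands in the containsKey("searchresult_on") branch and persists heuristic.searchresults = true; unticking it sends searchresult_off= and writes false.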
@@ -248,6 +248,7 @@ public class yacysearchitem {
            prop.put("content_loc_lat", result.lat());
            prop.put("content_loc_lon", result.lon());
        }
        if (sb.getConfigBool("heuristic.searchresults", false)) sb.heuristicSearchResults(resultUrlstring);
        theQuery.transmitcount = item + 1;
        return prop;
    }
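Note that yacysearchitem passes the full result URL string: the heuristicSearchResults parameter is named host, but a complete URL is expected, since the Switchboard code below feeds it straight into new DigestURI(host).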
@@ -160,6 +160,7 @@ import de.anomic.crawler.CrawlQueues;
import de.anomic.crawler.CrawlStacker;
import de.anomic.crawler.CrawlSwitchboard;
import de.anomic.crawler.NoticedURL;
import de.anomic.crawler.NoticedURL.StackType;
import de.anomic.crawler.ResourceObserver;
import de.anomic.crawler.ResultImages;
import de.anomic.crawler.ResultURLs;
@@ -2573,7 +2574,7 @@ public final class Switchboard extends serverSwitch
                    "denied by profile rule, process case="
                        + processCase
                        + ", profile name = "
                        + queueEntry.profile().name());
                return;
            }

@@ -2785,6 +2786,44 @@ public final class Switchboard extends serverSwitch
        }.start();
    }

    /**
     * add url to Crawler - which itself loads the URL, parses the content and adds it to the index;
     * transparent alternative to "addToIndex": includes the double-in-crawler check and shows up in the crawl monitor,
     * but does not return results for an ongoing search
     *
     * @param url the url that shall be indexed
     * @param asglobal true adds the url to the global crawl queue (for remote crawling), false to the local crawler
     */
    public void addToCrawler(final DigestURI url, final boolean asglobal) {

        if (this.index.exists(url.hash())) {
            return; // don't do double-work
        }
        final Request request = this.loader.request(url, true, true);
        final CrawlProfile profile = sb.crawler.getActive(ASCII.getBytes(request.profileHandle()));
        final String acceptedError = this.crawlStacker.checkAcceptance(url, profile, 0);
        if (acceptedError != null) {
            this.log.logInfo("addToCrawler: cannot load "
                + url.toNormalform(false, false)
                + ": "
                + acceptedError);
            return;
        }
        final String s;
        if (asglobal) {
            s = sb.crawlQueues.noticeURL.push(StackType.GLOBAL, request);
        } else {
            s = sb.crawlQueues.noticeURL.push(StackType.LOCAL, request);
        }

        if (s != null) {
            Switchboard.this.log.logInfo("addToCrawler: failed to add "
                + url.toNormalform(false, false)
                + ": "
                + s);
        }
    }
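A minimal usage sketch for the new method (the URL is hypothetical; false selects the local crawl queue, true the global one, as described in the javadoc above):

    // queue one page for local crawling; the call is a no-op if the URL is already indexed
    sb.addToCrawler(new DigestURI("http://example.org/page.html"), false);

Note that new DigestURI(String) may throw MalformedURLException (see the try/catch around the same constructor in heuristicSearchResults below), so a caller would wrap this accordingly.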

    public class receiptSending implements Runnable
    {
        private final Seed initiatorPeer;
@@ -3125,8 +3164,8 @@ public final class Switchboard extends serverSwitch

                final Map<MultiProtocolURI, String> links;
                searchEvent.getRankingResult().oneFeederStarted();
                try {
                    links = Switchboard.this.loader.loadLinks(url, CacheStrategy.NOCACHE);
                    if (links != null) {
                        final Iterator<MultiProtocolURI> i = links.keySet().iterator();
                        while (i.hasNext()) {
@@ -3139,7 +3178,7 @@ public final class Switchboard extends serverSwitch
                            addAllToIndex(url, links, searchEvent, "site");
                        }
                    } catch (final Throwable e) {
                        Log.logException(e);
                    } finally {
                        searchEvent.getRankingResult().oneFeederTerminated();
                    }
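Unlike the site heuristic above, which feeds the loaded links back into the running search via addAllToIndex, the new search-result heuristic below only schedules crawl jobs through addToCrawler; as its javadoc notes, it does not return results to the ongoing search.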
@@ -3147,6 +3186,46 @@ public final class Switchboard extends serverSwitch
        }.start();
    }

    public final void heuristicSearchResults(final String host) {
        new Thread() {

            @Override
            public void run() {

                // get the links for a specific site
                final DigestURI startUrl;
                try {
                    startUrl = new DigestURI(host);
                } catch (final MalformedURLException e) {
                    Log.logException(e);
                    return;
                }

                final Map<MultiProtocolURI, String> links;
                DigestURI url;
                try {
                    links = Switchboard.this.loader.loadLinks(startUrl, CacheStrategy.IFFRESH);
                    if (links != null) {
                        if (links.size() < 1000) { // limit to 1000 to skip large index pages
                            final Iterator<MultiProtocolURI> i = links.keySet().iterator();
                            final boolean globalcrawljob = sb.getConfigBool("heuristic.searchresults.crawlglobal", false);
                            while (i.hasNext()) {
                                url = new DigestURI(i.next());
                                boolean islocal = url.getHost().contentEquals(startUrl.getHost());
                                // add all external links or links to different page to crawler
                                if (!islocal) {// || (!startUrl.getPath().endsWith(url.getPath()))) {
                                    addToCrawler(url, globalcrawljob);
                                }
                            }
                        }
                    }
                } catch (final Throwable e) {
                    Log.logException(e);
                }
            }
        }.start();
    }
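In effect, the heuristic performs the depth-1 step itself: it loads each result page once (with the IFFRESH cache strategy), skips pages carrying 1000 or more links as presumable index pages, and enqueues only links whose host differs from the result page's host. A condensed sketch of that filter, equivalent to the iterator loop above (names follow the method; the outer catch of the original is omitted here):

    for (final MultiProtocolURI link : links.keySet()) {
        final DigestURI u = new DigestURI(link);
        // only external links are handed to the crawler
        if (!u.getHost().contentEquals(startUrl.getHost())) {
            addToCrawler(u, globalcrawljob);
        }
    }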

    // blekko pattern: http://blekko.com/ws/$+/rss
    public final void heuristicRSS(
        final String urlpattern,