mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
added a 'greedy learning' mechanism which causes a 'fresh'
yacy to load linked web pages from search results until the total number of web pages reaches 15000. This gives fresh peers a 'boost' so that they build a personalized search index faster.
This commit is contained in:
parent
a5e328d7c5
commit
6115bef335
|
@ -1156,3 +1156,14 @@ interaction.autocrawler.categoryfilter = .*
|
|||
# host browser settings
|
||||
browser.autoload = false
|
||||
browser.load4everyone = false
|
||||
|
||||
|
||||
# greedy learning: fast information acquisition heuristic for new peers
|
||||
# to make greedy learning work, it must be enabled in the network definition
|
||||
# the user may switch it off at any time, but if the automatic learning limit is reached
|
||||
# then the active flag is set to false automatically and this will switch to that state
|
||||
# automatically by the cleanup process each time the user switches it on again.
|
||||
# While the switch is on, it will cause the user-submitted search to be done along
|
||||
# with some heuristics like: loading linked documents and adding a twitter search.
|
||||
# When the learning mode is finished, the user may switch on individual heuristics by himself.
|
||||
greedylearning.active = true
|
|
@ -73,11 +73,13 @@ network.unit.remotecrawl.speed = 300
|
|||
# addresses of seed-list bootstrap locations
|
||||
network.unit.bootstrap.seedlist0 = http://www.yacy.net/seed.txt
|
||||
network.unit.bootstrap.seedlist1 = http://home.arcor.de/hermens/yacy/seed.txt
|
||||
network.unit.bootstrap.seedlist2 = http://low.audioattack.de/yacy/seed.txt
|
||||
network.unit.bootstrap.seedlist3 = http://www.lulabad.de/seed.txt
|
||||
network.unit.bootstrap.seedlist4 = http://sixcooler.de/yacy/seed.txt
|
||||
network.unit.bootstrap.seedlist5 = http://headrift.dyndns.org/yacy/seed.txt
|
||||
network.unit.bootstrap.seedlist6 = http://dk5ras.dyndns.org/seed.txt
|
||||
network.unit.bootstrap.seedlist2 = http://www.lulabad.de/seed.txt
|
||||
network.unit.bootstrap.seedlist3 = http://sixcooler.de/yacy/seed.txt
|
||||
network.unit.bootstrap.seedlist4 = http://img.homepage.bluewin.ch/352348/seed.txt
|
||||
network.unit.bootstrap.seedlist5 = https://esbek.iv.net.pl/yacy/seed.txt
|
||||
network.unit.bootstrap.seedlist6 = http://yacy.seed.mylookr.com/seed.txt
|
||||
network.unit.bootstrap.seedlist7 = http://mary.dyndns.biz/yacy/seed.txt
|
||||
|
||||
|
||||
# each network may use different yacy distributions.
|
||||
# the auto-updater can access network-specific update locations
|
||||
|
@ -94,3 +96,7 @@ network.unit.protocol.control = uncontrolled
|
|||
# white/blacklists
|
||||
network.unit.access.whitelist = 10\..*,127\..*,172\.(1[6-9]|2[0-9]|3[0-1])\..*,169\.254\..*,192\.168\..*,localhost
|
||||
network.unit.access.blacklist =
|
||||
|
||||
# greedy learning: fast information acquisition heuristic for new peers
|
||||
greedylearning.enabled = true
|
||||
greedylearning.limit.doccount = 15000
|
||||
|
|
|
@ -31,4 +31,8 @@ network.unit.protocol.control = uncontrolled
|
|||
|
||||
# white/blacklists
|
||||
network.unit.access.whitelist = 10\..*,127\..*,172\.(1[6-9]|2[0-9]|3[0-1])\..*,169\.254\..*,192\.168\..*,localhost
|
||||
network.unit.access.blacklist =
|
||||
network.unit.access.blacklist =
|
||||
|
||||
# greedy learning: fast information acquisition heuristic for new peers
|
||||
greedylearning.enabled = false
|
||||
greedylearning.limit.doccount = 15000
|
|
@ -90,4 +90,8 @@ network.unit.protocol.control = uncontrolled
|
|||
|
||||
# white/blacklists
|
||||
network.unit.access.whitelist = 10\..*,127\..*,172\.(1[6-9]|2[0-9]|3[0-1])\..*,169\.254\..*,192\.168\..*,213.183.195.83,130.75.2.35,85.31.186.137,localhost
|
||||
network.unit.access.blacklist =
|
||||
network.unit.access.blacklist =
|
||||
|
||||
# greedy learning: fast information acquisition heuristic for new peers
|
||||
greedylearning.enabled = false
|
||||
greedylearning.limit.doccount = 15000
|
|
@ -28,4 +28,8 @@ network.unit.protocol.control = uncontrolled
|
|||
|
||||
# white/blacklists
|
||||
network.unit.access.whitelist = 10\..*,127\..*,172\.(1[6-9]|2[0-9]|3[0-1])\..*,169\.254\..*,192\.168\..*,localhost
|
||||
network.unit.access.blacklist =
|
||||
network.unit.access.blacklist =
|
||||
|
||||
# greedy learning: fast information acquisition heuristic for new peers
|
||||
greedylearning.enabled = false
|
||||
greedylearning.limit.doccount = 15000
|
|
@ -55,25 +55,25 @@ public class ConfigHeuristics_p {
|
|||
// store this call as api call
|
||||
sb.tables.recordAPICall(post, "ConfigHeuristics.html", WorkTables.TABLE_API_TYPE_CONFIGURATION, "heuristic settings");
|
||||
|
||||
if (post.containsKey("site_on")) sb.setConfig("heuristic.site", true);
|
||||
if (post.containsKey("site_off")) sb.setConfig("heuristic.site", false);
|
||||
if (post.containsKey("searchresult_on")) sb.setConfig("heuristic.searchresults", true);
|
||||
if (post.containsKey("searchresult_off")) sb.setConfig("heuristic.searchresults", false);
|
||||
if (post.containsKey("searchresultglobal_on")) sb.setConfig("heuristic.searchresults.crawlglobal", true);
|
||||
if (post.containsKey("searchresultglobal_off")) sb.setConfig("heuristic.searchresults.crawlglobal", false);
|
||||
if (post.containsKey("blekko_on")) sb.setConfig("heuristic.blekko", true);
|
||||
if (post.containsKey("blekko_off")) sb.setConfig("heuristic.blekko", false);
|
||||
if (post.containsKey("twitter_on")) sb.setConfig("heuristic.twitter", true);
|
||||
if (post.containsKey("twitter_off")) sb.setConfig("heuristic.twitter", false);
|
||||
if (post.containsKey("site_on")) sb.setConfig(SwitchboardConstants.HEURISTIC_SITE, true);
|
||||
if (post.containsKey("site_off")) sb.setConfig(SwitchboardConstants.HEURISTIC_SITE, false);
|
||||
if (post.containsKey("searchresult_on")) sb.setConfig(SwitchboardConstants.HEURISTIC_SEARCHRESULTS, true);
|
||||
if (post.containsKey("searchresult_off")) sb.setConfig(SwitchboardConstants.HEURISTIC_SEARCHRESULTS, false);
|
||||
if (post.containsKey("searchresultglobal_on")) sb.setConfig(SwitchboardConstants.HEURISTIC_SEARCHRESULTS_CRAWLGLOBAL, true);
|
||||
if (post.containsKey("searchresultglobal_off")) sb.setConfig(SwitchboardConstants.HEURISTIC_SEARCHRESULTS_CRAWLGLOBAL, false);
|
||||
if (post.containsKey("blekko_on")) sb.setConfig(SwitchboardConstants.HEURISTIC_BLEKKO, true);
|
||||
if (post.containsKey("blekko_off")) sb.setConfig(SwitchboardConstants.HEURISTIC_BLEKKO, false);
|
||||
if (post.containsKey("twitter_on")) sb.setConfig(SwitchboardConstants.HEURISTIC_TWITTER, true);
|
||||
if (post.containsKey("twitter_off")) sb.setConfig(SwitchboardConstants.HEURISTIC_TWITTER, false);
|
||||
if (post.containsKey("opensearch_on")) {
|
||||
sb.setConfig("heuristic.opensearch", true);
|
||||
sb.setConfig(SwitchboardConstants.HEURISTIC_OPENSEARCH, true);
|
||||
// re-read config (and create work table)
|
||||
OpenSearchConnector os = new OpenSearchConnector(sb, true);
|
||||
if (os.getSize() == 0) {
|
||||
osderrmsg = "no active search targets are configured";
|
||||
}
|
||||
}
|
||||
if (post.containsKey("opensearch_off")) sb.setConfig("heuristic.opensearch", false);
|
||||
if (post.containsKey("opensearch_off")) sb.setConfig(SwitchboardConstants.HEURISTIC_OPENSEARCH, false);
|
||||
if (post.containsKey("discoverosd")) {
|
||||
final boolean metafieldavailable = sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_rel_s.name())
|
||||
&& (sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_protocol_s.name()) && sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_urlstub_s.name()));
|
||||
|
@ -155,12 +155,12 @@ public class ConfigHeuristics_p {
|
|||
&& (sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_protocol_s.name()) && sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_urlstub_s.name()))
|
||||
&& sb.getConfigBool(SwitchboardConstants.CORE_SERVICE_WEBGRAPH, false);
|
||||
if (!showmetafieldbutton) prop.put("osdsolrfieldswitch",1);
|
||||
prop.put("site.checked", sb.getConfigBool("heuristic.site", false) ? 1 : 0);
|
||||
prop.put("searchresult.checked", sb.getConfigBool("heuristic.searchresults", false) ? 1 : 0);
|
||||
prop.put("searchresultglobal.checked", sb.getConfigBool("heuristic.searchresults.crawlglobal", false) ? 1 : 0);
|
||||
prop.put("blekko.checked", sb.getConfigBool("heuristic.blekko", false) ? 1 : 0);
|
||||
prop.put("twitter.checked", sb.getConfigBool("heuristic.twitter", false) ? 1 : 0);
|
||||
prop.put("opensearch.checked", sb.getConfigBool("heuristic.opensearch", false) ? 1 : 0);
|
||||
prop.put("site.checked", sb.getConfigBool(SwitchboardConstants.HEURISTIC_SITE, false) ? 1 : 0);
|
||||
prop.put("searchresult.checked", sb.getConfigBool(SwitchboardConstants.HEURISTIC_SEARCHRESULTS, false) ? 1 : 0);
|
||||
prop.put("searchresultglobal.checked", sb.getConfigBool(SwitchboardConstants.HEURISTIC_SEARCHRESULTS_CRAWLGLOBAL, false) ? 1 : 0);
|
||||
prop.put("blekko.checked", sb.getConfigBool(SwitchboardConstants.HEURISTIC_BLEKKO, false) ? 1 : 0);
|
||||
prop.put("twitter.checked", sb.getConfigBool(SwitchboardConstants.HEURISTIC_TWITTER, false) ? 1 : 0);
|
||||
prop.put("opensearch.checked", sb.getConfigBool(SwitchboardConstants.HEURISTIC_OPENSEARCH, false) ? 1 : 0);
|
||||
|
||||
// display config file content
|
||||
final File f = new File (sb.getDataPath(),"DATA/SETTINGS/heuristicopensearch.conf");
|
||||
|
@ -238,7 +238,7 @@ public class ConfigHeuristics_p {
|
|||
}
|
||||
|
||||
// re-read config (and create/update work table)
|
||||
if (sb.getConfigBool("heuristic.opensearch", true)) {
|
||||
if (sb.getConfigBool(SwitchboardConstants.HEURISTIC_OPENSEARCH, true)) {
|
||||
OpenSearchConnector os = new OpenSearchConnector(sb, true);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -91,9 +91,9 @@ public class ConfigNetwork_p
|
|||
boolean indexReceive = "on".equals(post.get("indexReceive", ""));
|
||||
if ( !indexReceive ) {
|
||||
// remove heuristics
|
||||
sb.setConfig("heuristic.site", false);
|
||||
sb.setConfig("heuristic.blekko", false);
|
||||
sb.setConfig("heuristic.twitter", false);
|
||||
sb.setConfig(SwitchboardConstants.HEURISTIC_SITE, false);
|
||||
sb.setConfig(SwitchboardConstants.HEURISTIC_BLEKKO, false);
|
||||
sb.setConfig(SwitchboardConstants.HEURISTIC_TWITTER, false);
|
||||
}
|
||||
final boolean robinsonmode = "robinson".equals(post.get("network", ""));
|
||||
if ( robinsonmode ) {
|
||||
|
|
|
@ -158,10 +158,13 @@ public class yacysearch {
|
|||
sb.getConfigBool(SwitchboardConstants.INDEX_RECEIVE_ALLOW, true)
|
||||
|| sb.getConfigBool(SwitchboardConstants.INDEX_RECEIVE_AUTODISABLED, true)
|
||||
|| clustersearch;
|
||||
boolean global = post == null || (post.get("resource", "local").equals("global") && sb.peers.sizeConnected() > 0 && indexReceiveGranted);
|
||||
prop.put("topmenu_resource-select", (sb.peers == null || sb.peers.sizeConnected() == 0 || !indexReceiveGranted) ? 0 : global ? 1 : 2);
|
||||
boolean p2pmode = sb.peers != null && sb.peers.sizeConnected() > 0 && indexReceiveGranted;
|
||||
boolean global = post == null || (post.get("resource", "local").equals("global") && p2pmode);
|
||||
boolean stealthmode = p2pmode && !global;
|
||||
prop.put("topmenu_resource-select", stealthmode ? 2 : global ? 1 : 0);
|
||||
|
||||
if ( post == null || indexSegment == null || env == null || !searchAllowed ) {
|
||||
if (indexSegment == null) Log.logInfo("yacysearch", "indexSegment == null");
|
||||
// we create empty entries for template strings
|
||||
prop.put("searchagain", "0");
|
||||
prop.put("former", "");
|
||||
|
@ -483,7 +486,7 @@ public class yacysearch {
|
|||
}
|
||||
|
||||
final int heuristicTwitter = querystring.indexOf("/heuristic/twitter", 0);
|
||||
if ( heuristicBlekko >= 0 ) {
|
||||
if ( heuristicTwitter >= 0 ) {
|
||||
querystring = querystring.replace("/heuristic/twitter", "");
|
||||
modifier.add("/heuristic/twitter");
|
||||
}
|
||||
|
@ -723,16 +726,16 @@ public class yacysearch {
|
|||
(int) sb.getConfigLong(SwitchboardConstants.DHT_BURST_MULTIWORD, 0));
|
||||
|
||||
if ( startRecord == 0 ) {
|
||||
if ( modifier.sitehost != null && sb.getConfigBool("heuristic.site", false) && authenticated ) {
|
||||
if ( modifier.sitehost != null && sb.getConfigBool(SwitchboardConstants.HEURISTIC_SITE, false) && authenticated && !stealthmode) {
|
||||
sb.heuristicSite(theSearch, modifier.sitehost);
|
||||
}
|
||||
if ( (heuristicBlekko >= 0 || sb.getConfigBool("heuristic.blekko", false)) && authenticated ) {
|
||||
if ( (heuristicBlekko >= 0 || sb.getConfigBool(SwitchboardConstants.HEURISTIC_BLEKKO, false)) && authenticated && !stealthmode ) {
|
||||
sb.heuristicRSS("http://blekko.com/ws/$+/rss", theSearch, "blekko");
|
||||
}
|
||||
if ( (heuristicTwitter >= 0 || sb.getConfigBool("heuristic.twitter", false)) && authenticated ) {
|
||||
if ( (heuristicTwitter >= 0 || sb.getConfigBool(SwitchboardConstants.HEURISTIC_TWITTER, false)) && authenticated && !stealthmode ) {
|
||||
sb.heuristicRSS("http://search.twitter.com/search.rss?rpp=50&q=$", theSearch, "twitter");
|
||||
}
|
||||
if (sb.getConfigBool("heuristic.opensearch", false) && authenticated) {
|
||||
if (sb.getConfigBool(SwitchboardConstants.HEURISTIC_OPENSEARCH, false) && authenticated && !stealthmode) {
|
||||
OpenSearchConnector.query(sb, theSearch);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -255,7 +255,17 @@ public class yacysearchitem {
|
|||
prop.put("content_loc_lat", result.lat());
|
||||
prop.put("content_loc_lon", result.lon());
|
||||
}
|
||||
if (sb.getConfigBool("heuristic.searchresults",false)) sb.heuristicSearchResults(resultUrlstring);
|
||||
final boolean clustersearch = sb.isRobinsonMode() && sb.getConfig(SwitchboardConstants.CLUSTER_MODE, "").equals(SwitchboardConstants.CLUSTER_MODE_PUBLIC_CLUSTER);
|
||||
final boolean indexReceiveGranted =
|
||||
sb.getConfigBool(SwitchboardConstants.INDEX_RECEIVE_ALLOW, true)
|
||||
|| sb.getConfigBool(SwitchboardConstants.INDEX_RECEIVE_AUTODISABLED, true)
|
||||
|| clustersearch;
|
||||
boolean p2pmode = sb.peers != null && sb.peers.sizeConnected() > 0 && indexReceiveGranted;
|
||||
boolean global = post == null || (post.get("resource", "local").equals("global") && p2pmode);
|
||||
boolean stealthmode = p2pmode && !global;
|
||||
if ((sb.getConfigBool(SwitchboardConstants.HEURISTIC_SEARCHRESULTS, false) ||
|
||||
(sb.getConfigBool(SwitchboardConstants.GREEDYLEARNING_ACTIVE, false) && sb.getConfigBool(SwitchboardConstants.GREEDYLEARNING_ENABLED, false))) &&
|
||||
!stealthmode) sb.heuristicSearchResults(resultUrlstring);
|
||||
theSearch.query.transmitcount = item + 1;
|
||||
return prop;
|
||||
}
|
||||
|
|
|
@ -1303,9 +1303,9 @@ public final class Switchboard extends serverSwitch {
|
|||
ResultURLs.clearStacks();
|
||||
|
||||
// remove heuristics
|
||||
setConfig("heuristic.site", false);
|
||||
setConfig("heuristic.blekko", false);
|
||||
setConfig("heuristic.twitter", false);
|
||||
setConfig(SwitchboardConstants.HEURISTIC_SITE, false);
|
||||
setConfig(SwitchboardConstants.HEURISTIC_BLEKKO, false);
|
||||
setConfig(SwitchboardConstants.HEURISTIC_TWITTER, false);
|
||||
|
||||
// relocate
|
||||
this.peers.relocate(
|
||||
|
@ -2041,6 +2041,15 @@ public final class Switchboard extends serverSwitch {
|
|||
setConfig("adminAccount", "");
|
||||
}
|
||||
|
||||
// stop greedylearning if limit is reached
|
||||
if (getConfigBool(SwitchboardConstants.GREEDYLEARNING_ACTIVE, false)) {
|
||||
long cs = this.index.fulltext().collectionSize();
|
||||
if (cs > getConfigInt(SwitchboardConstants.GREEDYLEARNING_LIMIT_DOCCOUNT, 0)) {
|
||||
setConfig(SwitchboardConstants.GREEDYLEARNING_ACTIVE, false);
|
||||
log.logInfo("finishing greedy learning phase, size=" +cs);
|
||||
}
|
||||
}
|
||||
|
||||
// refresh recrawl dates
|
||||
try {
|
||||
CrawlProfile selentry;
|
||||
|
@ -2265,6 +2274,7 @@ public final class Switchboard extends serverSwitch {
|
|||
// if no crawl is running and processing is activated:
|
||||
// execute the (post-) processing steps for all entries that have a process tag assigned
|
||||
if (this.crawlQueues.coreCrawlJobSize() == 0) {
|
||||
if (this.crawlQueues.noticeURL.isEmpty()) this.crawlQueues.noticeURL.clear(); // flushes more caches
|
||||
index.fulltext().getDefaultConfiguration().postprocessing(index);
|
||||
index.fulltext().getWebgraphConfiguration().postprocessing(index);
|
||||
}
|
||||
|
@ -3371,7 +3381,7 @@ public final class Switchboard extends serverSwitch {
|
|||
}.start();
|
||||
}
|
||||
|
||||
public final void heuristicSearchResults(final String host) {
|
||||
public final void heuristicSearchResults(final String url) {
|
||||
new Thread() {
|
||||
|
||||
@Override
|
||||
|
@ -3380,7 +3390,7 @@ public final class Switchboard extends serverSwitch {
|
|||
// get the links for a specific site
|
||||
final DigestURI startUrl;
|
||||
try {
|
||||
startUrl = new DigestURI(host);
|
||||
startUrl = new DigestURI(url);
|
||||
} catch (final MalformedURLException e) {
|
||||
Log.logException(e);
|
||||
return;
|
||||
|
@ -3393,7 +3403,7 @@ public final class Switchboard extends serverSwitch {
|
|||
if (links != null) {
|
||||
if (links.size() < 1000) { // limit to 1000 to skip large index pages
|
||||
final Iterator<DigestURI> i = links.keySet().iterator();
|
||||
final boolean globalcrawljob = Switchboard.this.getConfigBool("heuristic.searchresults.crawlglobal",false);
|
||||
final boolean globalcrawljob = Switchboard.this.getConfigBool(SwitchboardConstants.HEURISTIC_SEARCHRESULTS_CRAWLGLOBAL,false);
|
||||
Collection<DigestURI> urls = new ArrayList<DigestURI>();
|
||||
while (i.hasNext()) {
|
||||
url = i.next();
|
||||
|
|
|
@ -498,8 +498,25 @@ public final class SwitchboardConstants {
|
|||
/**
|
||||
* system tray
|
||||
*/
|
||||
public static final String TRAY_ICON_ENABLED = "tray.icon.enabled";
|
||||
public static final String TRAY_ICON_FORCED = "tray.icon.force";
|
||||
public static final String TRAY_ICON_LABEL = "tray.icon.label";
|
||||
public static final String TRAY_MENU_ENABLED = "tray.menu.enabled";
|
||||
public static final String TRAY_ICON_ENABLED = "tray.icon.enabled";
|
||||
public static final String TRAY_ICON_FORCED = "tray.icon.force";
|
||||
public static final String TRAY_ICON_LABEL = "tray.icon.label";
|
||||
public static final String TRAY_MENU_ENABLED = "tray.menu.enabled";
|
||||
|
||||
/*
|
||||
* search heuristics
|
||||
*/
|
||||
public static final String HEURISTIC_SITE = "heuristic.site";
|
||||
public static final String HEURISTIC_SEARCHRESULTS = "heuristic.searchresults";
|
||||
public static final String HEURISTIC_SEARCHRESULTS_CRAWLGLOBAL = "heuristic.searchresults.crawlglobal";
|
||||
public static final String HEURISTIC_BLEKKO = "heuristic.blekko";
|
||||
public static final String HEURISTIC_TWITTER = "heuristic.twitter";
|
||||
public static final String HEURISTIC_OPENSEARCH = "heuristic.opensearch";
|
||||
|
||||
/*
|
||||
* automatic learning heuristic
|
||||
*/
|
||||
public static final String GREEDYLEARNING_ENABLED = "greedylearning.enabled";
|
||||
public static final String GREEDYLEARNING_LIMIT_DOCCOUNT = "greedylearning.limit.doccount";
|
||||
public static final String GREEDYLEARNING_ACTIVE = "greedylearning.active";
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue
Block a user