added a 'greedy learning' mechanism which causes a 'fresh'

YaCy peer to load linked web pages from search results until the total
number of web pages reaches 15000. This shall give fresh peers a 'boost'
to get a personalized search index faster.
This commit is contained in:
Michael Peter Christen 2013-06-11 14:42:30 +02:00
parent a5e328d7c5
commit 6115bef335
11 changed files with 117 additions and 48 deletions

View File

@ -1156,3 +1156,14 @@ interaction.autocrawler.categoryfilter = .*
# host browser settings
browser.autoload = false
browser.load4everyone = false
# greedy learning: fast information acquisition heuristic for new peers
# to make greedy learning work, it must be enabled in the network definition
# the user may switch it off at any time. Once the automatic learning limit is reached,
# the cleanup process sets the active flag to false automatically; it will do so again
# each time the user switches it back on.
# While the switch is on, user-submitted searches are accompanied by
# some heuristics like: loading linked documents and adding a twitter search.
# When the learning mode is finished, the user may switch on individual heuristics himself.
greedylearning.active = true

View File

@ -73,11 +73,13 @@ network.unit.remotecrawl.speed = 300
# addresses of seed-list bootstrap locations
network.unit.bootstrap.seedlist0 = http://www.yacy.net/seed.txt
network.unit.bootstrap.seedlist1 = http://home.arcor.de/hermens/yacy/seed.txt
network.unit.bootstrap.seedlist2 = http://low.audioattack.de/yacy/seed.txt
network.unit.bootstrap.seedlist3 = http://www.lulabad.de/seed.txt
network.unit.bootstrap.seedlist4 = http://sixcooler.de/yacy/seed.txt
network.unit.bootstrap.seedlist5 = http://headrift.dyndns.org/yacy/seed.txt
network.unit.bootstrap.seedlist6 = http://dk5ras.dyndns.org/seed.txt
network.unit.bootstrap.seedlist2 = http://www.lulabad.de/seed.txt
network.unit.bootstrap.seedlist3 = http://sixcooler.de/yacy/seed.txt
network.unit.bootstrap.seedlist4 = http://img.homepage.bluewin.ch/352348/seed.txt
network.unit.bootstrap.seedlist5 = https://esbek.iv.net.pl/yacy/seed.txt
network.unit.bootstrap.seedlist6 = http://yacy.seed.mylookr.com/seed.txt
network.unit.bootstrap.seedlist7 = http://mary.dyndns.biz/yacy/seed.txt
# each network may use different yacy distributions.
# the auto-updater can access network-specific update locations
@ -94,3 +96,7 @@ network.unit.protocol.control = uncontrolled
# white/blacklists
network.unit.access.whitelist = 10\..*,127\..*,172\.(1[6-9]|2[0-9]|3[0-1])\..*,169\.254\..*,192\.168\..*,localhost
network.unit.access.blacklist =
# greedy learning: fast information acquisition heuristic for new peers
greedylearning.enabled = true
greedylearning.limit.doccount = 15000

View File

@ -31,4 +31,8 @@ network.unit.protocol.control = uncontrolled
# white/blacklists
network.unit.access.whitelist = 10\..*,127\..*,172\.(1[6-9]|2[0-9]|3[0-1])\..*,169\.254\..*,192\.168\..*,localhost
network.unit.access.blacklist =
network.unit.access.blacklist =
# greedy learning: fast information acquisition heuristic for new peers
greedylearning.enabled = false
greedylearning.limit.doccount = 15000

View File

@ -90,4 +90,8 @@ network.unit.protocol.control = uncontrolled
# white/blacklists
network.unit.access.whitelist = 10\..*,127\..*,172\.(1[6-9]|2[0-9]|3[0-1])\..*,169\.254\..*,192\.168\..*,213.183.195.83,130.75.2.35,85.31.186.137,localhost
network.unit.access.blacklist =
network.unit.access.blacklist =
# greedy learning: fast information acquisition heuristic for new peers
greedylearning.enabled = false
greedylearning.limit.doccount = 15000

View File

@ -28,4 +28,8 @@ network.unit.protocol.control = uncontrolled
# white/blacklists
network.unit.access.whitelist = 10\..*,127\..*,172\.(1[6-9]|2[0-9]|3[0-1])\..*,169\.254\..*,192\.168\..*,localhost
network.unit.access.blacklist =
network.unit.access.blacklist =
# greedy learning: fast information acquisition heuristic for new peers
greedylearning.enabled = false
greedylearning.limit.doccount = 15000

View File

@ -55,25 +55,25 @@ public class ConfigHeuristics_p {
// store this call as api call
sb.tables.recordAPICall(post, "ConfigHeuristics.html", WorkTables.TABLE_API_TYPE_CONFIGURATION, "heuristic settings");
if (post.containsKey("site_on")) sb.setConfig("heuristic.site", true);
if (post.containsKey("site_off")) sb.setConfig("heuristic.site", false);
if (post.containsKey("searchresult_on")) sb.setConfig("heuristic.searchresults", true);
if (post.containsKey("searchresult_off")) sb.setConfig("heuristic.searchresults", false);
if (post.containsKey("searchresultglobal_on")) sb.setConfig("heuristic.searchresults.crawlglobal", true);
if (post.containsKey("searchresultglobal_off")) sb.setConfig("heuristic.searchresults.crawlglobal", false);
if (post.containsKey("blekko_on")) sb.setConfig("heuristic.blekko", true);
if (post.containsKey("blekko_off")) sb.setConfig("heuristic.blekko", false);
if (post.containsKey("twitter_on")) sb.setConfig("heuristic.twitter", true);
if (post.containsKey("twitter_off")) sb.setConfig("heuristic.twitter", false);
if (post.containsKey("site_on")) sb.setConfig(SwitchboardConstants.HEURISTIC_SITE, true);
if (post.containsKey("site_off")) sb.setConfig(SwitchboardConstants.HEURISTIC_SITE, false);
if (post.containsKey("searchresult_on")) sb.setConfig(SwitchboardConstants.HEURISTIC_SEARCHRESULTS, true);
if (post.containsKey("searchresult_off")) sb.setConfig(SwitchboardConstants.HEURISTIC_SEARCHRESULTS, false);
if (post.containsKey("searchresultglobal_on")) sb.setConfig(SwitchboardConstants.HEURISTIC_SEARCHRESULTS_CRAWLGLOBAL, true);
if (post.containsKey("searchresultglobal_off")) sb.setConfig(SwitchboardConstants.HEURISTIC_SEARCHRESULTS_CRAWLGLOBAL, false);
if (post.containsKey("blekko_on")) sb.setConfig(SwitchboardConstants.HEURISTIC_BLEKKO, true);
if (post.containsKey("blekko_off")) sb.setConfig(SwitchboardConstants.HEURISTIC_BLEKKO, false);
if (post.containsKey("twitter_on")) sb.setConfig(SwitchboardConstants.HEURISTIC_TWITTER, true);
if (post.containsKey("twitter_off")) sb.setConfig(SwitchboardConstants.HEURISTIC_TWITTER, false);
if (post.containsKey("opensearch_on")) {
sb.setConfig("heuristic.opensearch", true);
sb.setConfig(SwitchboardConstants.HEURISTIC_OPENSEARCH, true);
// re-read config (and create work table)
OpenSearchConnector os = new OpenSearchConnector(sb, true);
if (os.getSize() == 0) {
osderrmsg = "no active search targets are configured";
}
}
if (post.containsKey("opensearch_off")) sb.setConfig("heuristic.opensearch", false);
if (post.containsKey("opensearch_off")) sb.setConfig(SwitchboardConstants.HEURISTIC_OPENSEARCH, false);
if (post.containsKey("discoverosd")) {
final boolean metafieldavailable = sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_rel_s.name())
&& (sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_protocol_s.name()) && sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_urlstub_s.name()));
@ -155,12 +155,12 @@ public class ConfigHeuristics_p {
&& (sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_protocol_s.name()) && sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_urlstub_s.name()))
&& sb.getConfigBool(SwitchboardConstants.CORE_SERVICE_WEBGRAPH, false);
if (!showmetafieldbutton) prop.put("osdsolrfieldswitch",1);
prop.put("site.checked", sb.getConfigBool("heuristic.site", false) ? 1 : 0);
prop.put("searchresult.checked", sb.getConfigBool("heuristic.searchresults", false) ? 1 : 0);
prop.put("searchresultglobal.checked", sb.getConfigBool("heuristic.searchresults.crawlglobal", false) ? 1 : 0);
prop.put("blekko.checked", sb.getConfigBool("heuristic.blekko", false) ? 1 : 0);
prop.put("twitter.checked", sb.getConfigBool("heuristic.twitter", false) ? 1 : 0);
prop.put("opensearch.checked", sb.getConfigBool("heuristic.opensearch", false) ? 1 : 0);
prop.put("site.checked", sb.getConfigBool(SwitchboardConstants.HEURISTIC_SITE, false) ? 1 : 0);
prop.put("searchresult.checked", sb.getConfigBool(SwitchboardConstants.HEURISTIC_SEARCHRESULTS, false) ? 1 : 0);
prop.put("searchresultglobal.checked", sb.getConfigBool(SwitchboardConstants.HEURISTIC_SEARCHRESULTS_CRAWLGLOBAL, false) ? 1 : 0);
prop.put("blekko.checked", sb.getConfigBool(SwitchboardConstants.HEURISTIC_BLEKKO, false) ? 1 : 0);
prop.put("twitter.checked", sb.getConfigBool(SwitchboardConstants.HEURISTIC_TWITTER, false) ? 1 : 0);
prop.put("opensearch.checked", sb.getConfigBool(SwitchboardConstants.HEURISTIC_OPENSEARCH, false) ? 1 : 0);
// display config file content
final File f = new File (sb.getDataPath(),"DATA/SETTINGS/heuristicopensearch.conf");
@ -238,7 +238,7 @@ public class ConfigHeuristics_p {
}
// re-read config (and create/update work table)
if (sb.getConfigBool("heuristic.opensearch", true)) {
if (sb.getConfigBool(SwitchboardConstants.HEURISTIC_OPENSEARCH, true)) {
OpenSearchConnector os = new OpenSearchConnector(sb, true);
}
}

View File

@ -91,9 +91,9 @@ public class ConfigNetwork_p
boolean indexReceive = "on".equals(post.get("indexReceive", ""));
if ( !indexReceive ) {
// remove heuristics
sb.setConfig("heuristic.site", false);
sb.setConfig("heuristic.blekko", false);
sb.setConfig("heuristic.twitter", false);
sb.setConfig(SwitchboardConstants.HEURISTIC_SITE, false);
sb.setConfig(SwitchboardConstants.HEURISTIC_BLEKKO, false);
sb.setConfig(SwitchboardConstants.HEURISTIC_TWITTER, false);
}
final boolean robinsonmode = "robinson".equals(post.get("network", ""));
if ( robinsonmode ) {

View File

@ -158,10 +158,13 @@ public class yacysearch {
sb.getConfigBool(SwitchboardConstants.INDEX_RECEIVE_ALLOW, true)
|| sb.getConfigBool(SwitchboardConstants.INDEX_RECEIVE_AUTODISABLED, true)
|| clustersearch;
boolean global = post == null || (post.get("resource", "local").equals("global") && sb.peers.sizeConnected() > 0 && indexReceiveGranted);
prop.put("topmenu_resource-select", (sb.peers == null || sb.peers.sizeConnected() == 0 || !indexReceiveGranted) ? 0 : global ? 1 : 2);
boolean p2pmode = sb.peers != null && sb.peers.sizeConnected() > 0 && indexReceiveGranted;
boolean global = post == null || (post.get("resource", "local").equals("global") && p2pmode);
boolean stealthmode = p2pmode && !global;
prop.put("topmenu_resource-select", stealthmode ? 2 : global ? 1 : 0);
if ( post == null || indexSegment == null || env == null || !searchAllowed ) {
if (indexSegment == null) Log.logInfo("yacysearch", "indexSegment == null");
// we create empty entries for template strings
prop.put("searchagain", "0");
prop.put("former", "");
@ -483,7 +486,7 @@ public class yacysearch {
}
final int heuristicTwitter = querystring.indexOf("/heuristic/twitter", 0);
if ( heuristicBlekko >= 0 ) {
if ( heuristicTwitter >= 0 ) {
querystring = querystring.replace("/heuristic/twitter", "");
modifier.add("/heuristic/twitter");
}
@ -723,16 +726,16 @@ public class yacysearch {
(int) sb.getConfigLong(SwitchboardConstants.DHT_BURST_MULTIWORD, 0));
if ( startRecord == 0 ) {
if ( modifier.sitehost != null && sb.getConfigBool("heuristic.site", false) && authenticated ) {
if ( modifier.sitehost != null && sb.getConfigBool(SwitchboardConstants.HEURISTIC_SITE, false) && authenticated && !stealthmode) {
sb.heuristicSite(theSearch, modifier.sitehost);
}
if ( (heuristicBlekko >= 0 || sb.getConfigBool("heuristic.blekko", false)) && authenticated ) {
if ( (heuristicBlekko >= 0 || sb.getConfigBool(SwitchboardConstants.HEURISTIC_BLEKKO, false)) && authenticated && !stealthmode ) {
sb.heuristicRSS("http://blekko.com/ws/$+/rss", theSearch, "blekko");
}
if ( (heuristicTwitter >= 0 || sb.getConfigBool("heuristic.twitter", false)) && authenticated ) {
if ( (heuristicTwitter >= 0 || sb.getConfigBool(SwitchboardConstants.HEURISTIC_TWITTER, false)) && authenticated && !stealthmode ) {
sb.heuristicRSS("http://search.twitter.com/search.rss?rpp=50&q=$", theSearch, "twitter");
}
if (sb.getConfigBool("heuristic.opensearch", false) && authenticated) {
if (sb.getConfigBool(SwitchboardConstants.HEURISTIC_OPENSEARCH, false) && authenticated && !stealthmode) {
OpenSearchConnector.query(sb, theSearch);
}
}

View File

@ -255,7 +255,17 @@ public class yacysearchitem {
prop.put("content_loc_lat", result.lat());
prop.put("content_loc_lon", result.lon());
}
if (sb.getConfigBool("heuristic.searchresults",false)) sb.heuristicSearchResults(resultUrlstring);
final boolean clustersearch = sb.isRobinsonMode() && sb.getConfig(SwitchboardConstants.CLUSTER_MODE, "").equals(SwitchboardConstants.CLUSTER_MODE_PUBLIC_CLUSTER);
final boolean indexReceiveGranted =
sb.getConfigBool(SwitchboardConstants.INDEX_RECEIVE_ALLOW, true)
|| sb.getConfigBool(SwitchboardConstants.INDEX_RECEIVE_AUTODISABLED, true)
|| clustersearch;
boolean p2pmode = sb.peers != null && sb.peers.sizeConnected() > 0 && indexReceiveGranted;
boolean global = post == null || (post.get("resource", "local").equals("global") && p2pmode);
boolean stealthmode = p2pmode && !global;
if ((sb.getConfigBool(SwitchboardConstants.HEURISTIC_SEARCHRESULTS, false) ||
(sb.getConfigBool(SwitchboardConstants.GREEDYLEARNING_ACTIVE, false) && sb.getConfigBool(SwitchboardConstants.GREEDYLEARNING_ENABLED, false))) &&
!stealthmode) sb.heuristicSearchResults(resultUrlstring);
theSearch.query.transmitcount = item + 1;
return prop;
}

View File

@ -1303,9 +1303,9 @@ public final class Switchboard extends serverSwitch {
ResultURLs.clearStacks();
// remove heuristics
setConfig("heuristic.site", false);
setConfig("heuristic.blekko", false);
setConfig("heuristic.twitter", false);
setConfig(SwitchboardConstants.HEURISTIC_SITE, false);
setConfig(SwitchboardConstants.HEURISTIC_BLEKKO, false);
setConfig(SwitchboardConstants.HEURISTIC_TWITTER, false);
// relocate
this.peers.relocate(
@ -2041,6 +2041,15 @@ public final class Switchboard extends serverSwitch {
setConfig("adminAccount", "");
}
// stop greedylearning if limit is reached
if (getConfigBool(SwitchboardConstants.GREEDYLEARNING_ACTIVE, false)) {
long cs = this.index.fulltext().collectionSize();
if (cs > getConfigInt(SwitchboardConstants.GREEDYLEARNING_LIMIT_DOCCOUNT, 0)) {
setConfig(SwitchboardConstants.GREEDYLEARNING_ACTIVE, false);
log.logInfo("finishing greedy learning phase, size=" +cs);
}
}
// refresh recrawl dates
try {
CrawlProfile selentry;
@ -2265,6 +2274,7 @@ public final class Switchboard extends serverSwitch {
// if no crawl is running and processing is activated:
// execute the (post-) processing steps for all entries that have a process tag assigned
if (this.crawlQueues.coreCrawlJobSize() == 0) {
if (this.crawlQueues.noticeURL.isEmpty()) this.crawlQueues.noticeURL.clear(); // flushes more caches
index.fulltext().getDefaultConfiguration().postprocessing(index);
index.fulltext().getWebgraphConfiguration().postprocessing(index);
}
@ -3371,7 +3381,7 @@ public final class Switchboard extends serverSwitch {
}.start();
}
public final void heuristicSearchResults(final String host) {
public final void heuristicSearchResults(final String url) {
new Thread() {
@Override
@ -3380,7 +3390,7 @@ public final class Switchboard extends serverSwitch {
// get the links for a specific site
final DigestURI startUrl;
try {
startUrl = new DigestURI(host);
startUrl = new DigestURI(url);
} catch (final MalformedURLException e) {
Log.logException(e);
return;
@ -3393,7 +3403,7 @@ public final class Switchboard extends serverSwitch {
if (links != null) {
if (links.size() < 1000) { // limit to 1000 to skip large index pages
final Iterator<DigestURI> i = links.keySet().iterator();
final boolean globalcrawljob = Switchboard.this.getConfigBool("heuristic.searchresults.crawlglobal",false);
final boolean globalcrawljob = Switchboard.this.getConfigBool(SwitchboardConstants.HEURISTIC_SEARCHRESULTS_CRAWLGLOBAL,false);
Collection<DigestURI> urls = new ArrayList<DigestURI>();
while (i.hasNext()) {
url = i.next();

View File

@ -498,8 +498,25 @@ public final class SwitchboardConstants {
/**
* system tray
*/
public static final String TRAY_ICON_ENABLED = "tray.icon.enabled";
public static final String TRAY_ICON_FORCED = "tray.icon.force";
public static final String TRAY_ICON_LABEL = "tray.icon.label";
public static final String TRAY_MENU_ENABLED = "tray.menu.enabled";
public static final String TRAY_ICON_ENABLED = "tray.icon.enabled";
public static final String TRAY_ICON_FORCED = "tray.icon.force";
public static final String TRAY_ICON_LABEL = "tray.icon.label";
public static final String TRAY_MENU_ENABLED = "tray.menu.enabled";
/*
* search heuristics
*/
public static final String HEURISTIC_SITE = "heuristic.site";
public static final String HEURISTIC_SEARCHRESULTS = "heuristic.searchresults";
public static final String HEURISTIC_SEARCHRESULTS_CRAWLGLOBAL = "heuristic.searchresults.crawlglobal";
public static final String HEURISTIC_BLEKKO = "heuristic.blekko";
public static final String HEURISTIC_TWITTER = "heuristic.twitter";
public static final String HEURISTIC_OPENSEARCH = "heuristic.opensearch";
/*
* automatic learning heuristic
*/
public static final String GREEDYLEARNING_ENABLED = "greedylearning.enabled";
public static final String GREEDYLEARNING_LIMIT_DOCCOUNT = "greedylearning.limit.doccount";
public static final String GREEDYLEARNING_ACTIVE = "greedylearning.active";
}