changed behavior of crawl stacker

- final flush only when tabletype = RAM
- prestacker (dns prefetch) only if tabletype = RAM and the stacker's busysleep <= 100
- the maximum number of entries in the stacker is configurable in yacy.init (stacker.slots)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4186 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
orbiter 2007-10-31 11:32:40 +00:00
parent 18144043e6
commit 55c87b3b12
8 changed files with 26 additions and 30 deletions

View File

@ -204,11 +204,8 @@ public class PerformanceQueues_p {
// getting the current crawler pool configuration
int maxActive = Integer.parseInt(post.get("Crawler Pool_maxActive","8"));
// accept new crawler pool settings
plasmaSwitchboard.crawlSlots = maxActive;
// storing the new values into configfile
switchboard.setConfig("crawler.MaxActiveThreads",maxActive);
switchboard.setConfig(plasmaSwitchboard.CRAWLER_THREADS_ACTIVE_MAX,maxActive);
//switchboard.setConfig("crawler.MinIdleThreads",minIdle);
/*

View File

@ -328,14 +328,14 @@ public class Status {
// Queue information
int indexingJobCount = sb.getThread("80_indexing").getJobCount()+sb.indexingTasksInProcess.size();
int indexingMaxCount = plasmaSwitchboard.indexingSlots;
int indexingMaxCount = (int) sb.getConfigLong(plasmaSwitchboard.INDEXER_SLOTS, 30);
int indexingPercent = (indexingMaxCount==0)?0:indexingJobCount*100/indexingMaxCount;
prop.putNum("indexingQueueSize", indexingJobCount);
prop.putNum("indexingQueueMax", indexingMaxCount);
prop.put("indexingQueuePercent",(indexingPercent>100) ? 100 : indexingPercent);
int loaderJobCount = sb.crawlQueues.size();
int loaderMaxCount = plasmaSwitchboard.crawlSlots;
int loaderMaxCount = Integer.parseInt(sb.getConfig(plasmaSwitchboard.CRAWLER_THREADS_ACTIVE_MAX, "10"));
int loaderPercent = (loaderMaxCount==0)?0:loaderJobCount*100/loaderMaxCount;
prop.putNum("loaderQueueSize", loaderJobCount);
prop.putNum("loaderQueueMax", loaderMaxCount);

View File

@ -88,7 +88,7 @@ public class queues_p {
//indexing queue
prop.putNum("indexingSize", sb.getThread(plasmaSwitchboard.INDEXER).getJobCount()+sb.indexingTasksInProcess.size());
prop.putNum("indexingMax", plasmaSwitchboard.indexingSlots);
prop.putNum("indexingMax", (int) sb.getConfigLong(plasmaSwitchboard.INDEXER_SLOTS, 30));
prop.putNum("urlpublictextSize", sb.wordIndex.loadedURL.size());
prop.putNum("rwipublictextSize", sb.wordIndex.size());
if ((sb.sbQueue.size() == 0) && (sb.indexingTasksInProcess.size() == 0)) {
@ -140,7 +140,7 @@ public class queues_p {
//loader queue
prop.put("loaderSize", Integer.toString(sb.crawlQueues.size()));
prop.put("loaderMax", Integer.toString(plasmaSwitchboard.crawlSlots));
prop.put("loaderMax", sb.getConfig(plasmaSwitchboard.CRAWLER_THREADS_ACTIVE_MAX, "10"));
if (sb.crawlQueues.size() == 0) {
prop.put("list-loader", "0");
} else {

View File

@ -134,12 +134,12 @@ public class plasmaCrawlQueues {
//log.logDebug("CoreCrawl: queue is empty");
return false;
}
if (sb.sbQueue.size() >= plasmaSwitchboard.indexingSlots) {
if (sb.sbQueue.size() >= (int) sb.getConfigLong(plasmaSwitchboard.INDEXER_SLOTS, 30)) {
log.logFine("CoreCrawl: too many processes in indexing queue, dismissed (" +
"sbQueueSize=" + sb.sbQueue.size() + ")");
return false;
}
if (this.size() >= plasmaSwitchboard.crawlSlots) {
if (this.size() >= sb.getConfigLong(plasmaSwitchboard.CRAWLER_THREADS_ACTIVE_MAX, 10)) {
log.logFine("CoreCrawl: too many processes in loader queue, dismissed (" +
"cacheLoader=" + this.size() + ")");
return false;
@ -230,12 +230,12 @@ public class plasmaCrawlQueues {
// check local indexing queues
// in case the placing of remote crawl fails, there must be space in the local queue to work off the remote crawl
if (sb.sbQueue.size() >= plasmaSwitchboard.indexingSlots * 2) {
if (sb.sbQueue.size() >= (int) sb.getConfigLong(plasmaSwitchboard.INDEXER_SLOTS, 30) * 2) {
log.logFine("LimitCrawl: too many processes in indexing queue, dismissed (" +
"sbQueueSize=" + sb.sbQueue.size() + ")");
return false;
}
if (this.size() >= plasmaSwitchboard.crawlSlots) {
if (this.size() >= sb.getConfigLong(plasmaSwitchboard.CRAWLER_THREADS_ACTIVE_MAX, 10)) {
log.logFine("LimitCrawl: too many processes in loader queue, dismissed (" +
"cacheLoader=" + this.size() + ")");
return false;
@ -318,12 +318,12 @@ public class plasmaCrawlQueues {
//log.logDebug("GlobalCrawl: queue is empty");
return false;
}
if (sb.sbQueue.size() >= plasmaSwitchboard.indexingSlots) {
if (sb.sbQueue.size() >= (int) sb.getConfigLong(plasmaSwitchboard.INDEXER_SLOTS, 30)) {
log.logFine("GlobalCrawl: too many processes in indexing queue, dismissed (" +
"sbQueueSize=" + sb.sbQueue.size() + ")");
return false;
}
if (this.size() >= plasmaSwitchboard.crawlSlots) {
if (this.size() >= sb.getConfigLong(plasmaSwitchboard.CRAWLER_THREADS_ACTIVE_MAX, 10)) {
log.logFine("GlobalCrawl: too many processes in loader queue, dismissed (" +
"cacheLoader=" + this.size() + ")");
return false;

View File

@ -85,12 +85,14 @@ public final class plasmaCrawlStacker extends Thread {
private File cacheStacksPath;
private long preloadTime;
private int dbtype;
private boolean prequeue;
// objects for the prefetch task
private ArrayList dnsfetchHosts = new ArrayList();
public plasmaCrawlStacker(plasmaSwitchboard sb, File dbPath, long preloadTime, int dbtype) {
public plasmaCrawlStacker(plasmaSwitchboard sb, File dbPath, long preloadTime, int dbtype, boolean prequeue) {
this.sb = sb;
this.prequeue = prequeue;
// init the message list
this.urlEntryHashCache = new LinkedList();
@ -168,13 +170,11 @@ public final class plasmaCrawlStacker extends Thread {
}
public void close() {
try {
if (this.dbtype == QUEUE_DB_TYPE_RAM) {
this.log.logFine("Shutdown. Flushing remaining " + size() + " crawl stacker job entries. please wait.");
while (size() > 0) {
if (!job()) break;
}
} catch (Exception e1) {
this.log.logSevere("Unable to shutdown all remaining stackCrawl threads", e1);
}
terminateDNSPrefetcher();
@ -240,7 +240,7 @@ public final class plasmaCrawlStacker extends Thread {
synchronized(this.urlEntryHashCache) {
kelondroRow.Entry oldValue;
prefetchHost(nexturl.getHost());
if (prequeue) prefetchHost(nexturl.getHost());
try {
oldValue = this.urlEntryCache.put(newEntry.toRow());
} catch (IOException e) {

View File

@ -175,9 +175,7 @@ import de.anomic.yacy.yacySeed;
public final class plasmaSwitchboard extends serverAbstractSwitch implements serverSwitch {
// load slots
public static int crawlSlots = 10;
public static int indexingSlots = 30;
public static int stackCrawlSlots = 2000;
public static int xstackCrawlSlots = 2000;
private int dhtTransferIndexCount = 100;
@ -410,6 +408,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
public static final String CRAWLSTACK_METHOD_FREEMEM = null;
public static final String CRAWLSTACK_IDLESLEEP = "82_crawlstack_idlesleep";
public static final String CRAWLSTACK_BUSYSLEEP = "82_crawlstack_busysleep";
public static final String CRAWLSTACK_SLOTS = "stacker.slots";
// 90_cleanup
/**
@ -1170,9 +1169,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// create queue
this.sbQueue = new plasmaSwitchboardQueue(this.wordIndex.loadedURL, new File(this.plasmaPath, "switchboardQueue2.stack"), this.profilesActiveCrawls);
// setting the indexing queue slots
indexingSlots = (int) getConfigLong(INDEXER_SLOTS, 30);
// create in process list
this.indexingTasksInProcess = new HashMap();
@ -1204,7 +1200,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// start a loader
log.logConfig("Starting Crawl Loader");
crawlSlots = Integer.parseInt(getConfig(CRAWLER_THREADS_ACTIVE_MAX, "10"));
this.crawlQueues = new plasmaCrawlQueues(this, plasmaPath);
/*
@ -1307,7 +1302,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
// initializing the stackCrawlThread
this.crawlStacker = new plasmaCrawlStacker(this, this.plasmaPath, ramPreNURL_time, (int) getConfigLong("tableTypeForPreNURL", 0));
this.crawlStacker = new plasmaCrawlStacker(this, this.plasmaPath, ramPreNURL_time, (int) getConfigLong("tableTypeForPreNURL", 0), (((int) getConfigLong("tableTypeForPreNURL", 0) == 0) && (getConfigLong(CRAWLSTACK_BUSYSLEEP, 0) <= 100)));
//this.sbStackCrawlThread = new plasmaStackCrawlThread(this,this.plasmaPath,ramPreNURL);
//this.sbStackCrawlThread.start();
@ -1850,7 +1845,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
return doneSomething; // nothing to do
}
if (crawlStacker.size() >= stackCrawlSlots) {
if (crawlStacker.size() >= getConfigLong(CRAWLSTACK_SLOTS, 2000)) {
log.logFine("deQueue: too many processes in stack crawl thread queue (" + "stackCrawlQueue=" + crawlStacker.size() + ")");
return doneSomething;
}

View File

@ -60,7 +60,7 @@ public final class plasmaWordIndex implements indexRI {
// environment constants
public static final long wCacheMaxAge = 1000 * 60 * 30; // milliseconds; 30 minutes
public static final int wCacheMaxChunk = 1000; // number of references for each urlhash
public static final int wCacheMaxChunk = 400; // maximum number of references for each urlhash
public static final int lowcachedivisor = 320;
public static final int maxCollectionPartition = 7; // should be 7

View File

@ -734,6 +734,9 @@ crawler.MaxActiveThreads = 30
indexer.slots = 40
indexer.slots__pro = 80
# maximum size of stacker queue
stacker.slots = 2000
# specifies if yacy should set its own referer if no referer URL
# was set by the client.
useYacyReferer = true
@ -888,6 +891,7 @@ currentSkin=
# temporary flag for new database structure. set only true for testing
# ALL DATA THAT IS CREATED WITH THIS FLAG ON WILL BE VOID IN A FINAL VERSION
# table-types: RAM = 0, TREE = 1, FLEX = 2;
# if you set this to a non-RAM value, you should increase the stacker.slots value
tableTypeForPreNURL=0
# flag to show if pages shall be usable for non-admin users