mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
changed behavior of crawl stacker
- final flush only when tabletype = RAM - prestacker (dns prefetch) only if tabletype = RAM and busytime <= 100 - maximum number of entries in stacker is configurable in yacy.init (stacker.slots) git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4186 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
parent
18144043e6
commit
55c87b3b12
|
@ -204,11 +204,8 @@ public class PerformanceQueues_p {
|
|||
// getting the current crawler pool configuration
|
||||
int maxActive = Integer.parseInt(post.get("Crawler Pool_maxActive","8"));
|
||||
|
||||
// accept new crawler pool settings
|
||||
plasmaSwitchboard.crawlSlots = maxActive;
|
||||
|
||||
// storing the new values into configfile
|
||||
switchboard.setConfig("crawler.MaxActiveThreads",maxActive);
|
||||
switchboard.setConfig(plasmaSwitchboard.CRAWLER_THREADS_ACTIVE_MAX,maxActive);
|
||||
//switchboard.setConfig("crawler.MinIdleThreads",minIdle);
|
||||
|
||||
/*
|
||||
|
|
|
@ -328,14 +328,14 @@ public class Status {
|
|||
|
||||
// Queue information
|
||||
int indexingJobCount = sb.getThread("80_indexing").getJobCount()+sb.indexingTasksInProcess.size();
|
||||
int indexingMaxCount = plasmaSwitchboard.indexingSlots;
|
||||
int indexingMaxCount = (int) sb.getConfigLong(plasmaSwitchboard.INDEXER_SLOTS, 30);
|
||||
int indexingPercent = (indexingMaxCount==0)?0:indexingJobCount*100/indexingMaxCount;
|
||||
prop.putNum("indexingQueueSize", indexingJobCount);
|
||||
prop.putNum("indexingQueueMax", indexingMaxCount);
|
||||
prop.put("indexingQueuePercent",(indexingPercent>100) ? 100 : indexingPercent);
|
||||
|
||||
int loaderJobCount = sb.crawlQueues.size();
|
||||
int loaderMaxCount = plasmaSwitchboard.crawlSlots;
|
||||
int loaderMaxCount = Integer.parseInt(sb.getConfig(plasmaSwitchboard.CRAWLER_THREADS_ACTIVE_MAX, "10"));
|
||||
int loaderPercent = (loaderMaxCount==0)?0:loaderJobCount*100/loaderMaxCount;
|
||||
prop.putNum("loaderQueueSize", loaderJobCount);
|
||||
prop.putNum("loaderQueueMax", loaderMaxCount);
|
||||
|
|
|
@ -88,7 +88,7 @@ public class queues_p {
|
|||
|
||||
//indexing queue
|
||||
prop.putNum("indexingSize", sb.getThread(plasmaSwitchboard.INDEXER).getJobCount()+sb.indexingTasksInProcess.size());
|
||||
prop.putNum("indexingMax", plasmaSwitchboard.indexingSlots);
|
||||
prop.putNum("indexingMax", (int) sb.getConfigLong(plasmaSwitchboard.INDEXER_SLOTS, 30));
|
||||
prop.putNum("urlpublictextSize", sb.wordIndex.loadedURL.size());
|
||||
prop.putNum("rwipublictextSize", sb.wordIndex.size());
|
||||
if ((sb.sbQueue.size() == 0) && (sb.indexingTasksInProcess.size() == 0)) {
|
||||
|
@ -140,7 +140,7 @@ public class queues_p {
|
|||
|
||||
//loader queue
|
||||
prop.put("loaderSize", Integer.toString(sb.crawlQueues.size()));
|
||||
prop.put("loaderMax", Integer.toString(plasmaSwitchboard.crawlSlots));
|
||||
prop.put("loaderMax", sb.getConfig(plasmaSwitchboard.CRAWLER_THREADS_ACTIVE_MAX, "10"));
|
||||
if (sb.crawlQueues.size() == 0) {
|
||||
prop.put("list-loader", "0");
|
||||
} else {
|
||||
|
|
|
@ -134,12 +134,12 @@ public class plasmaCrawlQueues {
|
|||
//log.logDebug("CoreCrawl: queue is empty");
|
||||
return false;
|
||||
}
|
||||
if (sb.sbQueue.size() >= plasmaSwitchboard.indexingSlots) {
|
||||
if (sb.sbQueue.size() >= (int) sb.getConfigLong(plasmaSwitchboard.INDEXER_SLOTS, 30)) {
|
||||
log.logFine("CoreCrawl: too many processes in indexing queue, dismissed (" +
|
||||
"sbQueueSize=" + sb.sbQueue.size() + ")");
|
||||
return false;
|
||||
}
|
||||
if (this.size() >= plasmaSwitchboard.crawlSlots) {
|
||||
if (this.size() >= sb.getConfigLong(plasmaSwitchboard.CRAWLER_THREADS_ACTIVE_MAX, 10)) {
|
||||
log.logFine("CoreCrawl: too many processes in loader queue, dismissed (" +
|
||||
"cacheLoader=" + this.size() + ")");
|
||||
return false;
|
||||
|
@ -230,12 +230,12 @@ public class plasmaCrawlQueues {
|
|||
|
||||
// check local indexing queues
|
||||
// in case the placing of remote crawl fails, there must be space in the local queue to work off the remote crawl
|
||||
if (sb.sbQueue.size() >= plasmaSwitchboard.indexingSlots * 2) {
|
||||
if (sb.sbQueue.size() >= (int) sb.getConfigLong(plasmaSwitchboard.INDEXER_SLOTS, 30) * 2) {
|
||||
log.logFine("LimitCrawl: too many processes in indexing queue, dismissed (" +
|
||||
"sbQueueSize=" + sb.sbQueue.size() + ")");
|
||||
return false;
|
||||
}
|
||||
if (this.size() >= plasmaSwitchboard.crawlSlots) {
|
||||
if (this.size() >= sb.getConfigLong(plasmaSwitchboard.CRAWLER_THREADS_ACTIVE_MAX, 10)) {
|
||||
log.logFine("LimitCrawl: too many processes in loader queue, dismissed (" +
|
||||
"cacheLoader=" + this.size() + ")");
|
||||
return false;
|
||||
|
@ -318,12 +318,12 @@ public class plasmaCrawlQueues {
|
|||
//log.logDebug("GlobalCrawl: queue is empty");
|
||||
return false;
|
||||
}
|
||||
if (sb.sbQueue.size() >= plasmaSwitchboard.indexingSlots) {
|
||||
if (sb.sbQueue.size() >= (int) sb.getConfigLong(plasmaSwitchboard.INDEXER_SLOTS, 30)) {
|
||||
log.logFine("GlobalCrawl: too many processes in indexing queue, dismissed (" +
|
||||
"sbQueueSize=" + sb.sbQueue.size() + ")");
|
||||
return false;
|
||||
}
|
||||
if (this.size() >= plasmaSwitchboard.crawlSlots) {
|
||||
if (this.size() >= sb.getConfigLong(plasmaSwitchboard.CRAWLER_THREADS_ACTIVE_MAX, 10)) {
|
||||
log.logFine("GlobalCrawl: too many processes in loader queue, dismissed (" +
|
||||
"cacheLoader=" + this.size() + ")");
|
||||
return false;
|
||||
|
|
|
@ -85,12 +85,14 @@ public final class plasmaCrawlStacker extends Thread {
|
|||
private File cacheStacksPath;
|
||||
private long preloadTime;
|
||||
private int dbtype;
|
||||
|
||||
private boolean prequeue;
|
||||
|
||||
// objects for the prefetch task
|
||||
private ArrayList dnsfetchHosts = new ArrayList();
|
||||
|
||||
public plasmaCrawlStacker(plasmaSwitchboard sb, File dbPath, long preloadTime, int dbtype) {
|
||||
public plasmaCrawlStacker(plasmaSwitchboard sb, File dbPath, long preloadTime, int dbtype, boolean prequeue) {
|
||||
this.sb = sb;
|
||||
this.prequeue = prequeue;
|
||||
|
||||
// init the message list
|
||||
this.urlEntryHashCache = new LinkedList();
|
||||
|
@ -168,13 +170,11 @@ public final class plasmaCrawlStacker extends Thread {
|
|||
}
|
||||
|
||||
public void close() {
|
||||
try {
|
||||
if (this.dbtype == QUEUE_DB_TYPE_RAM) {
|
||||
this.log.logFine("Shutdown. Flushing remaining " + size() + " crawl stacker job entries. please wait.");
|
||||
while (size() > 0) {
|
||||
if (!job()) break;
|
||||
}
|
||||
} catch (Exception e1) {
|
||||
this.log.logSevere("Unable to shutdown all remaining stackCrawl threads", e1);
|
||||
}
|
||||
terminateDNSPrefetcher();
|
||||
|
||||
|
@ -240,7 +240,7 @@ public final class plasmaCrawlStacker extends Thread {
|
|||
|
||||
synchronized(this.urlEntryHashCache) {
|
||||
kelondroRow.Entry oldValue;
|
||||
prefetchHost(nexturl.getHost());
|
||||
if (prequeue) prefetchHost(nexturl.getHost());
|
||||
try {
|
||||
oldValue = this.urlEntryCache.put(newEntry.toRow());
|
||||
} catch (IOException e) {
|
||||
|
|
|
@ -175,9 +175,7 @@ import de.anomic.yacy.yacySeed;
|
|||
public final class plasmaSwitchboard extends serverAbstractSwitch implements serverSwitch {
|
||||
|
||||
// load slots
|
||||
public static int crawlSlots = 10;
|
||||
public static int indexingSlots = 30;
|
||||
public static int stackCrawlSlots = 2000;
|
||||
public static int xstackCrawlSlots = 2000;
|
||||
|
||||
private int dhtTransferIndexCount = 100;
|
||||
|
||||
|
@ -410,6 +408,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
|
|||
public static final String CRAWLSTACK_METHOD_FREEMEM = null;
|
||||
public static final String CRAWLSTACK_IDLESLEEP = "82_crawlstack_idlesleep";
|
||||
public static final String CRAWLSTACK_BUSYSLEEP = "82_crawlstack_busysleep";
|
||||
public static final String CRAWLSTACK_SLOTS = "stacker.slots";
|
||||
|
||||
// 90_cleanup
|
||||
/**
|
||||
|
@ -1170,9 +1169,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
|
|||
// create queue
|
||||
this.sbQueue = new plasmaSwitchboardQueue(this.wordIndex.loadedURL, new File(this.plasmaPath, "switchboardQueue2.stack"), this.profilesActiveCrawls);
|
||||
|
||||
// setting the indexing queue slots
|
||||
indexingSlots = (int) getConfigLong(INDEXER_SLOTS, 30);
|
||||
|
||||
// create in process list
|
||||
this.indexingTasksInProcess = new HashMap();
|
||||
|
||||
|
@ -1204,7 +1200,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
|
|||
|
||||
// start a loader
|
||||
log.logConfig("Starting Crawl Loader");
|
||||
crawlSlots = Integer.parseInt(getConfig(CRAWLER_THREADS_ACTIVE_MAX, "10"));
|
||||
this.crawlQueues = new plasmaCrawlQueues(this, plasmaPath);
|
||||
|
||||
/*
|
||||
|
@ -1307,7 +1302,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
|
|||
}
|
||||
|
||||
// initializing the stackCrawlThread
|
||||
this.crawlStacker = new plasmaCrawlStacker(this, this.plasmaPath, ramPreNURL_time, (int) getConfigLong("tableTypeForPreNURL", 0));
|
||||
this.crawlStacker = new plasmaCrawlStacker(this, this.plasmaPath, ramPreNURL_time, (int) getConfigLong("tableTypeForPreNURL", 0), (((int) getConfigLong("tableTypeForPreNURL", 0) == 0) && (getConfigLong(CRAWLSTACK_BUSYSLEEP, 0) <= 100)));
|
||||
//this.sbStackCrawlThread = new plasmaStackCrawlThread(this,this.plasmaPath,ramPreNURL);
|
||||
//this.sbStackCrawlThread.start();
|
||||
|
||||
|
@ -1850,7 +1845,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
|
|||
return doneSomething; // nothing to do
|
||||
}
|
||||
|
||||
if (crawlStacker.size() >= stackCrawlSlots) {
|
||||
if (crawlStacker.size() >= getConfigLong(CRAWLSTACK_SLOTS, 2000)) {
|
||||
log.logFine("deQueue: too many processes in stack crawl thread queue (" + "stackCrawlQueue=" + crawlStacker.size() + ")");
|
||||
return doneSomething;
|
||||
}
|
||||
|
|
|
@ -60,7 +60,7 @@ public final class plasmaWordIndex implements indexRI {
|
|||
|
||||
// environment constants
|
||||
public static final long wCacheMaxAge = 1000 * 60 * 30; // milliseconds; 30 minutes
|
||||
public static final int wCacheMaxChunk = 1000; // number of references for each urlhash
|
||||
public static final int wCacheMaxChunk = 400; // maximum number of references for each urlhash
|
||||
public static final int lowcachedivisor = 320;
|
||||
public static final int maxCollectionPartition = 7; // should be 7
|
||||
|
||||
|
|
|
@ -734,6 +734,9 @@ crawler.MaxActiveThreads = 30
|
|||
indexer.slots = 40
|
||||
indexer.slots__pro = 80
|
||||
|
||||
# maximum size of stacker queue
|
||||
stacker.slots = 2000
|
||||
|
||||
# specifies if yacy should set its own referer if no referer URL
|
||||
# was set by the client.
|
||||
useYacyReferer = true
|
||||
|
@ -888,6 +891,7 @@ currentSkin=
|
|||
# temporary flag for new database structure. set only true for testing
|
||||
# ALL DATA THAT IS CREATED WITH THIS FLAG ON WILL BE VOID IN A FINAL VERSION
|
||||
# table-types: RAM = 0, TREE = 1, FLEX = 2;
|
||||
# if you set this to a non-RAM value, you should increase the stacker.slots value
|
||||
tableTypeForPreNURL=0
|
||||
|
||||
# flag to show if pages shall be usable for non-admin users
|
||||
|
|
Loading…
Reference in New Issue
Block a user