yacy_search_server/yacy.init

709 lines
26 KiB
Plaintext
Raw Normal View History

###
### YACY Init File
###
# These properties will be loaded upon installation.
# They are used only once for set-up.
# If you make changes to this file and want these to make any effect,
# you must delete the httpProxy.conf file in DATA/SETTINGS
# ----------------------------------------------------------------------------
# the http service configurations
# port number where the server should bind to
# e.g. 8080
# #eth0:8080
# 192.168.0.1:8080
port = 8080
# time-out of client control socket in milliseconds
# since this applies only to the client-proxy connection,
# it can be rather short
# milliseconds
clientTimeout = 30000
# maximal number of httpd sessions
# a client may open several connections at one, and the httpdMaxActiveSessions value sets
# a limit on the number of concurrent connections
httpdMaxActiveSessions = 150
httpdMaxIdleSessions = 75
httpdMinIdleSessions = 5
# default root path for the file server
# may be overridden by the htdocs parameter
# users shall be encouraged to use the htdocs path for individual content,
# not this path defined here
htRootPath = htroot
htTemplatePath = htroot/env/templates
# individual htroot folder
# every user may publicise her/his own web pages
# these pages shall be placed in the path defined here
# the htdocs path shares its content with the htroot path
htDocsPath = DATA/HTDOCS
# the default files (typically index.html), if no file name is given
# The complete path to this file is created by combination with the rootPath
# you can set a list of defaults, separated by comma
# the first one is priorized
defaultFiles = index.html,default.html,search.html,console.html,control.html,welcome.html,wiki.html,forum.html,blog.html,email.html,content.html,monitor.html,share.html,dir.html,readme.txt
# locale-options: yacy supports localization.
# Web pages for special languages are located in the htLocalePath
# The htLocaleLang defines a list of language options as <dir>/<named-language>
# the <dir> must exist as sub-path to htLocalePath
# the htLocaleSelection selects from the given locales, value=one-of-<dir>
htDefaultPath=htroot
htLocalePath=DATA/HTROOT/locale
htLocaleLang=default/English,de/Deutsch,fr/Fran&ccedil;ais,nl/Nederlands,it/Italiano,es/Espa&ntilde;ol,pt/Portug&ecirc;s,fi/Suomi,se/Svenska,dk/Dansk,gr/E&lambda;&lambda;&eta;v&iota;&kappa;&alpha;
htLocaleSelection=default
# virtual host for httpdFileServlet access
# for example http://<fileHost>/ shall access the file servlet and
# return the defaultFile at rootPath
# either way, http://<fileHost>/ denotes the same as http://localhost:<port>/
# for the preconfigured value 'localpeer', the URL is:
# http://localpeer/
fileHost = localpeer
# root path for message files
messPath = C:/AnomicServer
# specify the path to the MIME matching file table
mimeConfig = httpd.mime
# specify the path to message resource file
messConfig = httpd.messages
# proxy use. This server can also act as an caching proxy.
# to enable that function, set proxy=true
proxy=true
# a path to the proxy's file cache.
# This will be used if the server is addressed as a proxy
proxyCache = DATA/HTCACHE
# the proxy's maximum disc cache size in megabytes
# there should be enough space for the browsing load of an internet caffee
# running at 56kbit/s modem speed (this time not unusual)
# during 3 days, 8 hours a day
# necessary space = 3 * 8 * 60 * 60 * 56 / 8 = 604800 KB = ca. 590 MB
# since 600 MB is not much these days (it's below one GB!)
# we recommend using that space
#proxyCacheSize = 600
#for testing:
proxyCacheSize = 200
# the following mime-types are the whitelist for indexing
Multiple updates regarding the yacy seedUpload facility, optional content parsers, thread pool configuration ... Please help me testing if everything works correct. *) Migration of yacy seedUpload functionality See: http://www.yacy-forum.de/viewtopic.php?t=256 - new uploaders can now be easily introduced because of a new modulare uploader system - default uploaders are: none, file, ftp - adding optional uploader for scp - each uploader provides its own configuration file that will be included into the settings page using the new template include feature - Each uploader can define its libx dependencies. If not all needed libs are available, the uploader is deactivated automatically. *) Migration of optional parsers See: http://www.yacy-forum.de/viewtopic.php?t=198 - Parsers can now also define there libx dependencies - adding parser for bzip compressed content - adding parser for gzip compressed content - adding parser for zip files - adding parser for tar files - adding parser to detect the mime-type of a file this is needed by the bzip/gzip Parser.java - adding parser for rtf files - removing extra configuration file yacy.parser the list of enabled parsers is now stored in the main config file *) Adding configuration option in the performance dialog to configure See: http://www.yacy-forum.de/viewtopic.php?t=267 - maxActive / maxIdle / minIdle values for httpd-session-threadpool - maxActive / maxIdle / minIdle values for crawler-threadpool *) Changing Crawling Filter behaviour See: http://www.yacy-forum.de/viewtopic.php?p=2631 *) Replacing some hardcoded strings with the proper constants of the httpHeader class *) Adding new libs to libx directory. This libs are - needed by new content parsers - needed by new optional seed uploader - needed by SOAP API (which will be committed later) git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@126 6c8d7289-2bf4-0310-a012-ef5d649a1542
2005-05-17 10:25:04 +02:00
#
# parseableRealtimeMimeTypes: specifies mime-types that can be indexed on the fly
# parseableMime: specifies mime-types that can be indexed but not on the fly
parseableRealtimeMimeTypes=application/xhtml+xml,text/html,text/plain
parseableMimeTypes=
parseableMimeTypes.CRAWLER=
parseableMimeTypes.PROXY=
parseableMimeTypes.ICAP=
parseableMimeTypes.URLREDIRECTOR=
# media extension string
# a comma-separated list of extensions that denote media file formats
# this is important to recognize <a href> - tags as not-html reference
# These files will be excluded from indexing _(Please keep extensions in alphabetical order)_
mediaExt=7z,ace,arj,asf,asx,avi,bin,bz2,css,db,dcm,deb,doc,dll,dmg,gif,gz,hqx,ico,img,iso,jar,jpe,jpg,jpeg,lx,lxl,mpeg,mov,mp3,mpg,ogg,png,pdf,ppt,ps,ram,rar,rm,rpm,scr,sit,so,swf,sxc,sxd,sxi,sxw,tar,tbz,tgz,torrent,war,wmv,xcf,xls,zip
*) Asynchronous queuing of crawl job URLs (stackCrawl) various checks like the blacklist check or the robots.txt disallow check are now done by a separate thread to unburden the indexer thread(s) TODO: maybe we have to introduce a threadpool here if it turn out that this single thread is a bottleneck because of the time consuming robots.txt downloads *) improved index transfer The index selection and transmission is done in parallel now to improve index transfer performance. TODO: maybe we could speed up performance by unsing multiple transmission threads in parallel instead of only a single one. *) gzip encoded post requests it is now configureable if a gzip encoded post request should be send on intex transfer/distribution *) storage Peer (very experimentell and not optimized yet) Now it's possible to send the result of the yacy indexer thread to a remote peer istead of storing the indexed words locally. This could be done by setting the property "storagePeerHash" in the yacy config file - Please note that if the index transfer fails, the index ist stored locally. - TODO: currently this index transfer is done by the indexer thread. To seedup the indexer a) this transmission should be done in parallel and b) multiple chunks should be bundled and transfered together *) general performance improvements - better memory cleanup after http request processing has finished - replacing some string concatenations with stringBuffers - replacing BufferedInputStreams with serverByteBuffer - replacing vectors with arraylists wherever possible - replacing hashtables with hashmaps wherever possible This was done because function calls to verctor or hashtable functions take 3 time longer than calls to functions of arraylists or hashmaps. TODO: we should take a look on the class serverObject which is inherited from hashmap Do we realy need a synchronization for this class? TODO: replace arraylists with linkedLists if random access to the list elements is not needed *) Robots Parser supports if-modified-since downloads now If the downloaded robots.txt file is older than 7 days the robots parser tries to download the robots.txt with the if-modified-since header to avoid unnecessary downloads if the file was not changed. Additionally the ETag header is used to detect changes. *) Crawler: better handling of unsupported mimeTypes + FileExtension *) Bugfix: plasmaWordIndexEntity was not closed correctly in - query.java - plasmaswitchboard.java *) function minimizeUrlDB added to yacy.java this function tests the current urlHashDB for unused urls ATTENTION: please don't use this function at the moment because it causes the wordIndexDB to flush all words into the word directory! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@853 6c8d7289-2bf4-0310-a012-ef5d649a1542
2005-10-05 12:45:33 +02:00
parseableExt=html,htm,txt,php,shtml,asp,aspx,jsp
# Promotion Strings
# These strings appear in the Web Mask of the YACY search client
# Set these Strings to cusomize your peer and give any message to
# other peer users
promoteSearchPageGreeting =
# the path to the PLASMA database, especially the reverse word index
dbPath=DATA/PLASMADB
# the path to the LISTS files. Most lists are used to filter web content
listsPath=DATA/LISTS
# the path to the SKINS files.
skinPath=DATA/SKINS
# the yellow-list; URL's elements
# (the core of an URL; like 'yahoo' in 'de.yahoo.com')
# appearing in this list will not get a manipulated user agent string
proxyYellowList=yacy.yellow
# the black-list; URLs appearing in this list will not be loaded;
# instead always a 404 is returned
# all these files will be placed in the listsPath
proxyBlackLists=url.default.black
proxyBlackListsActive=url.default.black
proxyBlackListsShared=url.default.black
proxyCookieBlackList=cookie.default.black
proxyCookieWhiteList=cookie.default.black
# the blue-list;
# no search result is locally presented that has any word of the bluelist
# in the search words, the URL or the URL's description
plasmaBlueList=yacy.blue
# this proxy may in turn again access another proxy
# if you wish to do that, specify it here
# if you want to switch on the proxy use, set remoteProxyUse=true
# remoteProxyNoProxy is a no-proxy pattern list for the remote proxy
remoteProxyUse=false
remoteProxyUse4Yacy=true
remoteProxyUse4SSL=true
remoteProxyHost=192.168.2.2
remoteProxyPort=4239
remoteProxyUser=
remoteProxyPwd=
remoteProxyNoProxy=192.*,10.*,127.*,localhost
# the proxy may filter the content of transferred web pages
# the bluelist removes specific keywords from web pages
proxyBlueList=yacy.blue
# security settigns
# we provide proxy and server security through a 2-stage security gate:
# 1st stage: firewall-like access control trough ip filter for clients
# 2nd stage: password settings for proxy, server and server administrators
# by default, these settings are weak to simplify set-up and testing
# every user/adiministrator shall be encouraged to change these settings
# your can change them also online during run-time on
# http://localhost:8080/
# proxyClient: client-ip's that may connect the proxy for proxy service
# if several ip's are allowed then they must be separated by a ','
# any ip may contain the wildcard-sign '*'
#proxyClient=192.168.0.4
proxyClient=localhost,127.0.0.1,192.168.*,10.*
# serverClient: client-ip's that may connect to the web server,
# thus are allowed to use the search service
# if you set this to another value, search requst from others
# are blocked, but you will also be blocked from using others
# search services.
serverClient=*
### proxyAccount: a user:password - pair for proxy authentification
### leave empty for no authenication
### example:
##proxyAccount=jim:knopf
##proxyAccount=
##proxyAccountBase64MD5=
# use_proxyAccounts: set to true to restrict proxy-access to some identified users.
#use User_p.html to create some Users.
use_proxyAccounts=false
# serverAccount: a user:password - pair for web server access
# this is the access to the 'public' pages on the server
# should be always open, but you get the option here
# if set to a user:password, you get a conflict with the administration account
# future versions will check if the server is unprotected,
# because the p2p-index-sharing function will use the http server for
# data exchange.
# example
#serverAccount=dicke:berta
serverAccount=
serverAccountBase64MD5=
# adminAccount: a user:password - pair for administration of
# settings through the web interface
# should be set to a secret. By default it is without a password
# but you are encouraged to set it to another value on the page
# http://localhost:8080/
#adminAccount=admin:anomic
adminAccount=
adminAccountBase64MD5=
# peer-to-peer construction for distributed search
# we have several stages:
# 1st: a file within every distribution that has a list of URLs:
# -> this is the superseed file
# 2nd: the files that can be retrieved by the superseeds' URLs
# are called seed list-files.
# -> the seed list-files contain IP/port combinations of running
# AnomicHTTPProxies
# 3rd: the peers that are targeted within the seed files are called superpeers
# 4th: the superpeers hold and share a list of all client/search/crawl peers
#
# some superpeers should be able to create again seed list-files.
# These superpeers must upload their IP or their list of peer-IP's to a
# ftp location to create the seed list-file.
# Everyone who do so should mail his/her new seed location to mc<at>anomic.de
# The seed list-file location will then be included in the superseed file.
# This superseed file is available then at two localtions:
# - it is included in every distribution and
# - updated through a specific URL-location
# we see the file name and the URL of the superseed here:
superseedFile=superseed.txt
superseedLocation=http://www.yacy.net/superseed.txt
Multiple updates regarding the yacy seedUpload facility, optional content parsers, thread pool configuration ... Please help me testing if everything works correct. *) Migration of yacy seedUpload functionality See: http://www.yacy-forum.de/viewtopic.php?t=256 - new uploaders can now be easily introduced because of a new modulare uploader system - default uploaders are: none, file, ftp - adding optional uploader for scp - each uploader provides its own configuration file that will be included into the settings page using the new template include feature - Each uploader can define its libx dependencies. If not all needed libs are available, the uploader is deactivated automatically. *) Migration of optional parsers See: http://www.yacy-forum.de/viewtopic.php?t=198 - Parsers can now also define there libx dependencies - adding parser for bzip compressed content - adding parser for gzip compressed content - adding parser for zip files - adding parser for tar files - adding parser to detect the mime-type of a file this is needed by the bzip/gzip Parser.java - adding parser for rtf files - removing extra configuration file yacy.parser the list of enabled parsers is now stored in the main config file *) Adding configuration option in the performance dialog to configure See: http://www.yacy-forum.de/viewtopic.php?t=267 - maxActive / maxIdle / minIdle values for httpd-session-threadpool - maxActive / maxIdle / minIdle values for crawler-threadpool *) Changing Crawling Filter behaviour See: http://www.yacy-forum.de/viewtopic.php?p=2631 *) Replacing some hardcoded strings with the proper constants of the httpHeader class *) Adding new libs to libx directory. This libs are - needed by new content parsers - needed by new optional seed uploader - needed by SOAP API (which will be committed later) git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@126 6c8d7289-2bf4-0310-a012-ef5d649a1542
2005-05-17 10:25:04 +02:00
# if you are running a principal peer, you must update the following variables
Multiple updates regarding the yacy seedUpload facility, optional content parsers, thread pool configuration ... Please help me testing if everything works correct. *) Migration of yacy seedUpload functionality See: http://www.yacy-forum.de/viewtopic.php?t=256 - new uploaders can now be easily introduced because of a new modulare uploader system - default uploaders are: none, file, ftp - adding optional uploader for scp - each uploader provides its own configuration file that will be included into the settings page using the new template include feature - Each uploader can define its libx dependencies. If not all needed libs are available, the uploader is deactivated automatically. *) Migration of optional parsers See: http://www.yacy-forum.de/viewtopic.php?t=198 - Parsers can now also define there libx dependencies - adding parser for bzip compressed content - adding parser for gzip compressed content - adding parser for zip files - adding parser for tar files - adding parser to detect the mime-type of a file this is needed by the bzip/gzip Parser.java - adding parser for rtf files - removing extra configuration file yacy.parser the list of enabled parsers is now stored in the main config file *) Adding configuration option in the performance dialog to configure See: http://www.yacy-forum.de/viewtopic.php?t=267 - maxActive / maxIdle / minIdle values for httpd-session-threadpool - maxActive / maxIdle / minIdle values for crawler-threadpool *) Changing Crawling Filter behaviour See: http://www.yacy-forum.de/viewtopic.php?p=2631 *) Replacing some hardcoded strings with the proper constants of the httpHeader class *) Adding new libs to libx directory. This libs are - needed by new content parsers - needed by new optional seed uploader - needed by SOAP API (which will be committed later) git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@126 6c8d7289-2bf4-0310-a012-ef5d649a1542
2005-05-17 10:25:04 +02:00
# The upload method that should be used to upload the seed-list file to
# a public accessible webserver where it can be loaded by other peers.
#
# You can set the seedUploadMethod-Property to
# - None
# - Ftp
# - File
# - Scp (only if you have installed the optional addon)
#
seedUploadMethod=
# The URL to the seed list file
seedURL=
# This is the most common method to upload the seed-list
#
# This is an ftp account with all relevant information.
# The update is only made if there had been changes in between.
seedFTPServer=
seedFTPAccount=
seedFTPPassword=
seedFTPPath=
# alternatively to an FTP account, a peer can also become a principal peer
# if the seed-list can be generated as a file and that file is also accessible from
# the internet. In this case, omit any ftp settings and set this path here.
# if this path stays empty, an ftp account is considered
# however, you must always set a seedURL because it is used to check if the
# file is actually accessible from the internet
seedFilePath=
Multiple updates regarding the yacy seedUpload facility, optional content parsers, thread pool configuration ... Please help me testing if everything works correct. *) Migration of yacy seedUpload functionality See: http://www.yacy-forum.de/viewtopic.php?t=256 - new uploaders can now be easily introduced because of a new modulare uploader system - default uploaders are: none, file, ftp - adding optional uploader for scp - each uploader provides its own configuration file that will be included into the settings page using the new template include feature - Each uploader can define its libx dependencies. If not all needed libs are available, the uploader is deactivated automatically. *) Migration of optional parsers See: http://www.yacy-forum.de/viewtopic.php?t=198 - Parsers can now also define there libx dependencies - adding parser for bzip compressed content - adding parser for gzip compressed content - adding parser for zip files - adding parser for tar files - adding parser to detect the mime-type of a file this is needed by the bzip/gzip Parser.java - adding parser for rtf files - removing extra configuration file yacy.parser the list of enabled parsers is now stored in the main config file *) Adding configuration option in the performance dialog to configure See: http://www.yacy-forum.de/viewtopic.php?t=267 - maxActive / maxIdle / minIdle values for httpd-session-threadpool - maxActive / maxIdle / minIdle values for crawler-threadpool *) Changing Crawling Filter behaviour See: http://www.yacy-forum.de/viewtopic.php?p=2631 *) Replacing some hardcoded strings with the proper constants of the httpHeader class *) Adding new libs to libx directory. This libs are - needed by new content parsers - needed by new optional seed uploader - needed by SOAP API (which will be committed later) git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@126 6c8d7289-2bf4-0310-a012-ef5d649a1542
2005-05-17 10:25:04 +02:00
# Settings needed to upload the seed-list file via scp
#
# Please note that this upload method can only be used if you have installed
# this optional upload method.
seedScpServer=
seedScpAccount=
seedScpPassword=
seedScpPath=
# every peer should have a name. inded, we try to give every peer an unique ID,
# which is necessary for internal organization of the index sharing, but the
# peer's name is purely informal. No function but information is applied.
# please change this at your pleasure
peerName=anomic
# every peer periodically scans for other peers. you can set the time
# of the period here (minutes)
peerCycle=2
# The p2p maintenance can run in either of two online modes:
# - don't process jobs and only access available in cache -> mode 0
# - process any job only if we are online, which is technically only the case
# if the proxy is used -> mode 1
# - process jobs periodically, with periodes according to peerCycle -> mode 2
#onlineMode=1
onlineMode=2
# Debug mode for YACY network: this will trigger that also local ip's are
# accepted as peer addresses
yacyDebugMode=false
#staticIP if you have a static IP, you can use this setting
staticIP=
# if the process is running behind a NAT or ROUTER, we cannot easily identify
# the public IP of the process. We can ask a public IP responder, but cannot
# rely on it. Therefore, AnomicHTTPProxy includes it's own responder.
# But for the first running peer this is not an option.
# The author uses a DI-604 router, which can be
# asked for the public IP. If you own a DI-604 as well, please set the
# DI604use to true and put in your router password, it will not be used for any
# other purpose of asking for the IP
#DI604use=true
DI604use=false
DI604pw=
# each time the proxy starts up, it can trigger the local browser to show the
# status page. This is active by default, to make it easier for first-time
# users to understand what this application does. You can disable browser
# pop-up here or set a different start page, like the search page
# the browser type is optional and works only under certain conditions
#browserPopUpTrigger=false
browserPopUpTrigger=true
#browserPopUpPage=index.html
browserPopUpPage=Status.html
browserPopUpApplication=netscape
# the proxy saves it's own seed information. It is positive for the network if
# the seed does not change it's configuration often (or not at all).
# The reason for that is that the seed hash is the target for the
# distributed hash table distribution function.
# The following file will contain the saved seed:
yacyOwnSeedFile=DATA/YACYDB/mySeed.txt
yacyDB=DATA/YACYDB
# index sharing attributes
# by default, sharing is on. If you want to use the proxy only for
# local indexing, you may switch this off
allowDistributeIndex=true
allowDistributeIndexWhileCrawling=false
allowReceiveIndex=true
indexReceiveBlockBlacklist=false
# the frequency is the number of links per minute, that the peer allowes
# _every_ other peer to send to this peer
defaultWordReceiveFrequency=100
defaultLinkReceiveFrequency=30
# the default may be overridden for each peer individually, these
# settings are only available through the online interface
# prefetch parameters
# the prefetch depth assigns a specific depth to the prefetch mechanism
# prefetch of 0 means no prefetch; a prefetch of 1 means to prefetch all
# embedded URLs, but since embedded image links are loaded by the browser
# this means that only embedded anchors are prefetched additionally
# a prefetch of 2 would result in loading of all images and anchor pages
# of all embedded anchors. Be careful with this value, since even a prefetch
# of 2 would result in hundreds of prefetched URLs for each single proxy fill.
proxyPrefetchDepth=0
proxyStoreHTCache=true
proxyCrawlOrder=false
# From the 'IndexCreate' menu point you can also define a crawling start point.
# The crawling works the same way as the prefetch, but it is possible to
# assign a different crawling depth.
# Be careful with this number. Consider a branching factor of average 20;
# A prefect-depth of 8 would index 25.600.000.000 pages, maybe the whole WWW.
crawlingDepth=2
localIndexing=true
# Filter for crawlinig; may be used to restrict a crawl to a specific domain
# URLs are only indexed and further crawled if they match this filter
crawlingFilter=.*
crawlingQ=false
storeHTCache=false
storeTXCache=true
# default crawl profile entries
# if these entries are empty, then a new entry will be generated
defaultProxyProfile=
defaultRemoteProfile=
# peers may initiate remote crawling tasks.
# every peer may allow or disallow to be used as crawling-peer;
# you can also set a maximum crawl depth that can be requested or accepted
# order=parameters for requester; response=parameters for responder
# these values apply only for senior-senior - communication
# The delay value is number of seconds bewteen two separate orders
crawlOrder=true
crawlOrderDepth=0
crawlOrderDelay=8
crawlResponse=true
crawlResponseDepth=0
# indexing-exclusion - rules
# There rules are important to reduce the number of words that are indexed
# We distinguish three different sets of stop-words:
# static - excludes all words given in the file yacy.stopwords from indexing,
# dynamic - excludes all words from indexing which are listed by statistic rules,
# parental - excludes all words from indexing which had been indexed in the parent web page.
xsstopw=true
xdstopw=true
xpstopw=true
# performance-settings
# delay-times for permanent loops (milliseconds)
# the idlesleep is the pause that an proces sleeps if the last call to the
# process job was without execution of anything;
# the busysleep is the pause after a full job execution
# the prereq-value is a memory pre-requisite: that much bytes must
# be available/free in the heap; othervise the loop is not executed
# and another idlesleep is performed
20_dhtdistribution_idlesleep=50000
20_dhtdistribution_busysleep=2000
20_dhtdistribution_memprereq=8388608
20_dhtdistribution_threads=1
30_peerping_idlesleep=120000
30_peerping_busysleep=120000
30_peerping_memprereq=1048576
40_peerseedcycle_idlesleep=1800000
40_peerseedcycle_busysleep=1200000
40_peerseedcycle_memprereq=4194304
50_localcrawl_idlesleep=10000
50_localcrawl_busysleep=100
50_localcrawl_memprereq=1048576
50_localcrawl_isPaused=false
61_globalcrawltrigger_idlesleep=10000
61_globalcrawltrigger_busysleep=200
61_globalcrawltrigger_memprereq=1048576
61_globalcrawltrigger_isPaused=false
62_remotetriggeredcrawl_idlesleep=10000
62_remotetriggeredcrawl_busysleep=200
62_remotetriggeredcrawl_memprereq=1048576
62_remotetriggeredcrawl_isPaused=false
70_cachemanager_idlesleep=5000
70_cachemanager_busysleep=0
70_cachemanager_memprereq=1048576
80_indexing_idlesleep=2000
80_indexing_busysleep=100
80_indexing_memprereq=2097152
82_crawlstack_idlesleep=5000
82_crawlstack_busysleep=0
82_crawlstack_memprereq=1048576
90_cleanup_idlesleep=300000
90_cleanup_busysleep=300000
90_cleanup_memprereq=0
# multiprocessor-settings
# you may want to run time-consuming processes on several processors
# the most time-consuming process is the indexing-Process
# We implemented an option to run several of these processes here
# setting the number of processes to Zero is not allowed
# If you have a double-processor system,
# a cluster value of '2' would be appropriate
80_indexing_cluster=1
# ram cache for database files
# ram cache for assortment cache cluster (for all 64 files)
ramCacheRWI = 8388608
# ram cache for responseHeader.db
ramCacheHTTP = 4194304
# ram cache for urlHash.db
ramCacheLURL = 4194304
# ram cache for urlNotice.db
ramCacheNURL = 4194304
# ram cache for urlErr.db
ramCacheEURL = 8192
# ram cache for seedDBs
ramCacheDHT = 8192
# ram cache for message.db
ramCacheMessage = 8192
# ram cache for wiki.db
ramCacheWiki = 8192
# ram cache for news1.db
ramCacheNews = 8192
# ram cache for robotsTxt.db
ramCacheRobots = 2097152
# ram cache for crawlProfile.db
ramCacheProfiles = 8192
*) Asynchronous queuing of crawl job URLs (stackCrawl) various checks like the blacklist check or the robots.txt disallow check are now done by a separate thread to unburden the indexer thread(s) TODO: maybe we have to introduce a threadpool here if it turn out that this single thread is a bottleneck because of the time consuming robots.txt downloads *) improved index transfer The index selection and transmission is done in parallel now to improve index transfer performance. TODO: maybe we could speed up performance by unsing multiple transmission threads in parallel instead of only a single one. *) gzip encoded post requests it is now configureable if a gzip encoded post request should be send on intex transfer/distribution *) storage Peer (very experimentell and not optimized yet) Now it's possible to send the result of the yacy indexer thread to a remote peer istead of storing the indexed words locally. This could be done by setting the property "storagePeerHash" in the yacy config file - Please note that if the index transfer fails, the index ist stored locally. - TODO: currently this index transfer is done by the indexer thread. To seedup the indexer a) this transmission should be done in parallel and b) multiple chunks should be bundled and transfered together *) general performance improvements - better memory cleanup after http request processing has finished - replacing some string concatenations with stringBuffers - replacing BufferedInputStreams with serverByteBuffer - replacing vectors with arraylists wherever possible - replacing hashtables with hashmaps wherever possible This was done because function calls to verctor or hashtable functions take 3 time longer than calls to functions of arraylists or hashmaps. TODO: we should take a look on the class serverObject which is inherited from hashmap Do we realy need a synchronization for this class? TODO: replace arraylists with linkedLists if random access to the list elements is not needed *) Robots Parser supports if-modified-since downloads now If the downloaded robots.txt file is older than 7 days the robots parser tries to download the robots.txt with the if-modified-since header to avoid unnecessary downloads if the file was not changed. Additionally the ETag header is used to detect changes. *) Crawler: better handling of unsupported mimeTypes + FileExtension *) Bugfix: plasmaWordIndexEntity was not closed correctly in - query.java - plasmaswitchboard.java *) function minimizeUrlDB added to yacy.java this function tests the current urlHashDB for unused urls ATTENTION: please don't use this function at the moment because it causes the wordIndexDB to flush all words into the word directory! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@853 6c8d7289-2bf4-0310-a012-ef5d649a1542
2005-10-05 12:45:33 +02:00
# ram cache for stack crawl thread db
ramCachePreNURL = 4194304
*) Asynchronous queuing of crawl job URLs (stackCrawl) various checks like the blacklist check or the robots.txt disallow check are now done by a separate thread to unburden the indexer thread(s) TODO: maybe we have to introduce a threadpool here if it turn out that this single thread is a bottleneck because of the time consuming robots.txt downloads *) improved index transfer The index selection and transmission is done in parallel now to improve index transfer performance. TODO: maybe we could speed up performance by unsing multiple transmission threads in parallel instead of only a single one. *) gzip encoded post requests it is now configureable if a gzip encoded post request should be send on intex transfer/distribution *) storage Peer (very experimentell and not optimized yet) Now it's possible to send the result of the yacy indexer thread to a remote peer istead of storing the indexed words locally. This could be done by setting the property "storagePeerHash" in the yacy config file - Please note that if the index transfer fails, the index ist stored locally. - TODO: currently this index transfer is done by the indexer thread. To seedup the indexer a) this transmission should be done in parallel and b) multiple chunks should be bundled and transfered together *) general performance improvements - better memory cleanup after http request processing has finished - replacing some string concatenations with stringBuffers - replacing BufferedInputStreams with serverByteBuffer - replacing vectors with arraylists wherever possible - replacing hashtables with hashmaps wherever possible This was done because function calls to verctor or hashtable functions take 3 time longer than calls to functions of arraylists or hashmaps. TODO: we should take a look on the class serverObject which is inherited from hashmap Do we realy need a synchronization for this class? TODO: replace arraylists with linkedLists if random access to the list elements is not needed *) Robots Parser supports if-modified-since downloads now If the downloaded robots.txt file is older than 7 days the robots parser tries to download the robots.txt with the if-modified-since header to avoid unnecessary downloads if the file was not changed. Additionally the ETag header is used to detect changes. *) Crawler: better handling of unsupported mimeTypes + FileExtension *) Bugfix: plasmaWordIndexEntity was not closed correctly in - query.java - plasmaswitchboard.java *) function minimizeUrlDB added to yacy.java this function tests the current urlHashDB for unused urls ATTENTION: please don't use this function at the moment because it causes the wordIndexDB to flush all words into the word directory! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@853 6c8d7289-2bf4-0310-a012-ef5d649a1542
2005-10-05 12:45:33 +02:00
# default memory settings for startup of yacy
# is only valid in unix/shell environments and
# not for first startup of YaCy
# -Xmx<size> set maximum Java heap size
javastart_Xmx=Xmx64m
# -Xms<size> set initial Java heap size
javastart_Xms=Xms10m
# performance properties for the word index cache
# wordCacheMaxLow/High is the number of word indexes that shall be held in the
# ram cache during indexing. When YaCy is shut down, this cache must be
# flushed to disc; this may last some minutes.
# The low value is valid for crawling tasks, the high value is valid for
# remote index transmissions and search requests
# maxWaitingWordFlush gives the number of seconds that the shutdown
# may last for the word flush
wordCacheMaxLow = 12000
wordCacheMaxHigh = 16000
maxWaitingWordFlush = 180
# Specifies if yacy can be used as transparent http proxy.
#
# Please note that you also have to reconfigure your firewall
# before you can use yacy as transparent proxy. On linux this
# can be done like this:
# iptables -t nat -A PREROUTING -p tcp -s 192.168.0.0/16 \
# --dport 80 -j DNAT --to 192.168.0.1:8080
#
# With this iptables filter listed above all http traffic that
# comes from your private network (in this case 192.168.0.0)
# and goes to any webserver listening on port 80 will be forwarded
# by the firewall to yacy running on port 8080 (192.168.0.1:8080)
isTransparentProxy=false
# Specifies if yacy should use the http connection keep-alive feature
connectionKeepAliveSupport=true
# Specifies if the proxy should send the via header according to RFC
proxy.sendViaHeader=true
# Configuration options needed to configure server port forwarding
portForwardingEnabled=false
portForwardingUseProxy=false
portForwardingPort=
portForwardingHost=
portForwardingHostPort=22
portForwardingHostUser=
portForwardingHostPwd=
# msgForwarding: Specifies if yacy should forward received messages via
# email to the configured email address
msgForwardingEnabled=false
msgForwardingCmd=/usr/sbin/sendmail
msgForwardingTo=root@localhost
#onlineCautionDelay: delay time after proxy usage before crawling is resumed
onlineCautionDelay=30000
# Some configuration values for the crawler
crawler.acceptLanguage=en-us,en;q=0.5
crawler.acceptCharset=ISO-8859-1,utf-8;q=0.7,*;q=0.7
crawler.clientTimeout=9000
# maximum number of crawler threads
crawler.MaxActiveThreads = 10
crawler.MaxIdleThreads = 7
crawler.MinIdleThreads = 5
# maximum number of crawl-stacker threads
stacker.MaxActiveThreads = 50
stacker.MaxIdleThreads = 10
stacker.MinIdleThreads = 5
# maximum size of indexing queue
indexer.slots = 100
# specifies if yacy should set it's own referer if no referer URL
# was set by the client.
useYacyReferer = true
# allow only 443(https-port) for https-proxy?
# if you want to tunnel other protokols, set to false
secureHttps = true
*) Asynchronous queuing of crawl job URLs (stackCrawl) various checks like the blacklist check or the robots.txt disallow check are now done by a separate thread to unburden the indexer thread(s) TODO: maybe we have to introduce a threadpool here if it turn out that this single thread is a bottleneck because of the time consuming robots.txt downloads *) improved index transfer The index selection and transmission is done in parallel now to improve index transfer performance. TODO: maybe we could speed up performance by unsing multiple transmission threads in parallel instead of only a single one. *) gzip encoded post requests it is now configureable if a gzip encoded post request should be send on intex transfer/distribution *) storage Peer (very experimentell and not optimized yet) Now it's possible to send the result of the yacy indexer thread to a remote peer istead of storing the indexed words locally. This could be done by setting the property "storagePeerHash" in the yacy config file - Please note that if the index transfer fails, the index ist stored locally. - TODO: currently this index transfer is done by the indexer thread. To seedup the indexer a) this transmission should be done in parallel and b) multiple chunks should be bundled and transfered together *) general performance improvements - better memory cleanup after http request processing has finished - replacing some string concatenations with stringBuffers - replacing BufferedInputStreams with serverByteBuffer - replacing vectors with arraylists wherever possible - replacing hashtables with hashmaps wherever possible This was done because function calls to verctor or hashtable functions take 3 time longer than calls to functions of arraylists or hashmaps. TODO: we should take a look on the class serverObject which is inherited from hashmap Do we realy need a synchronization for this class? TODO: replace arraylists with linkedLists if random access to the list elements is not needed *) Robots Parser supports if-modified-since downloads now If the downloaded robots.txt file is older than 7 days the robots parser tries to download the robots.txt with the if-modified-since header to avoid unnecessary downloads if the file was not changed. Additionally the ETag header is used to detect changes. *) Crawler: better handling of unsupported mimeTypes + FileExtension *) Bugfix: plasmaWordIndexEntity was not closed correctly in - query.java - plasmaswitchboard.java *) function minimizeUrlDB added to yacy.java this function tests the current urlHashDB for unused urls ATTENTION: please don't use this function at the moment because it causes the wordIndexDB to flush all words into the word directory! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@853 6c8d7289-2bf4-0310-a012-ef5d649a1542
2005-10-05 12:45:33 +02:00
# specifies if the httpdFileHandler should cache
# the template-files from the htroot directory
enableTemplateCache = true
# specifies if the http post body should be transfered
# using content-encoding gzip during index transfer
# a) indexDistribution: which is done periodically if you have enabled
# Index Distribution via IndexControl_p.html
# b) indexTransfer: which can be used to transfer the whole index of a peer
# this can be started via IndexTransfer_p.html
# c) indexControl: which can be triggered manually via IndexControl_p.html to
# transfer a chosen subset of the peer index
indexDistribution.gzipBody = true
indexTransfer.gzipBody = true
indexControl.gzipBody = true
# defining timeouts for index- transfer/distribution/control
indexControl.timeout = 60000
indexDistribution.timeout = 60000
indexTransfer.timeout = 120000
# defining max. allowed amount of open files during index- transfer/distribution
indexDistribution.maxOpenFiles = 800
indexTransfer.maxOpenFiles = 800
# Distribution of Citation-Reference (CR-) files
# The distribution is done in two steps:
# first step to anonymize the records
# second step to forward to collecting peer
# to anonymize the data even against the intermediate peer
# a specific precentage is also sent again to other peers.
# for key-numbers please see de.anomic.plasma.plasmaRankingDistribution
CRDistOn = true
CRDist0Path = GLOBAL/010_owncr
CRDist0Method = 1
CRDist0Percent = 0
CRDist0Target =
CRDist1Path = GLOBAL/014_othercr
CRDist1Method = 9
CRDist1Percent = 30
CRDist1Target = kaskelix.de:8080,yacy.dyndns.org:8000,suma-lab.de:8080
#
*) Asynchronous queuing of crawl job URLs (stackCrawl) various checks like the blacklist check or the robots.txt disallow check are now done by a separate thread to unburden the indexer thread(s) TODO: maybe we have to introduce a threadpool here if it turn out that this single thread is a bottleneck because of the time consuming robots.txt downloads *) improved index transfer The index selection and transmission is done in parallel now to improve index transfer performance. TODO: maybe we could speed up performance by unsing multiple transmission threads in parallel instead of only a single one. *) gzip encoded post requests it is now configureable if a gzip encoded post request should be send on intex transfer/distribution *) storage Peer (very experimentell and not optimized yet) Now it's possible to send the result of the yacy indexer thread to a remote peer istead of storing the indexed words locally. This could be done by setting the property "storagePeerHash" in the yacy config file - Please note that if the index transfer fails, the index ist stored locally. - TODO: currently this index transfer is done by the indexer thread. To seedup the indexer a) this transmission should be done in parallel and b) multiple chunks should be bundled and transfered together *) general performance improvements - better memory cleanup after http request processing has finished - replacing some string concatenations with stringBuffers - replacing BufferedInputStreams with serverByteBuffer - replacing vectors with arraylists wherever possible - replacing hashtables with hashmaps wherever possible This was done because function calls to verctor or hashtable functions take 3 time longer than calls to functions of arraylists or hashmaps. TODO: we should take a look on the class serverObject which is inherited from hashmap Do we realy need a synchronization for this class? TODO: replace arraylists with linkedLists if random access to the list elements is not needed *) Robots Parser supports if-modified-since downloads now If the downloaded robots.txt file is older than 7 days the robots parser tries to download the robots.txt with the if-modified-since header to avoid unnecessary downloads if the file was not changed. Additionally the ETag header is used to detect changes. *) Crawler: better handling of unsupported mimeTypes + FileExtension *) Bugfix: plasmaWordIndexEntity was not closed correctly in - query.java - plasmaswitchboard.java *) function minimizeUrlDB added to yacy.java this function tests the current urlHashDB for unused urls ATTENTION: please don't use this function at the moment because it causes the wordIndexDB to flush all words into the word directory! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@853 6c8d7289-2bf4-0310-a012-ef5d649a1542
2005-10-05 12:45:33 +02:00
storagePeerHash =
# Search sequence settings
# collection:
# time = time to get a RWI out of RAM cache, assortments and WORDS files
# count = maximum number of RWI-entries that shall be collected
#
# join:
# time = time to perform the join between all collected RWIs
# count = maximum number of entries that shall be joined
#
# presort:
# time = time to do a sort of the joined URL-records
# count = maximum number of entries that shall be pre-sorted
#
# urlfetch:
# time = time to fetch the real URLs from the LURL database
# count = maximum number of urls that shall be fetched
#
# postsort:
# time = time for final sort of URLs
# count = maximum number oof URLs that shall be retrieved during sort
#
# filter:
# time = time to filter out unwanted urls (like redundant urls)
# count = maximum number of urls that shall be filtered
#
# snippetfetch:
# time = time to fetch snippets for selected URLs
# count = maximum number of snipptes to be fetched
#
# all values are percent
# time-percent is the percent of total search time
# count-percent is the percent of total wanted urls in result
# we distinguish local and remote search times
searchProcessLocalTime_c = 25
searchProcessLocalCount_c = 10000000
searchProcessLocalTime_j = 10
searchProcessLocalCount_j = 1000000
searchProcessLocalTime_r = 10
searchProcessLocalCount_r =100000
searchProcessLocalTime_u = 30
searchProcessLocalCount_u = 10000
searchProcessLocalTime_o = 10
searchProcessLocalCount_o = 100
searchProcessLocalTime_f = 5
searchProcessLocalCount_f = 100
searchProcessLocalTime_s = 10
searchProcessLocalCount_s = 30
searchProcessRemoteTime_c = 25
searchProcessRemoteCount_c = 1000000
searchProcessRemoteTime_j = 10
searchProcessRemoteCount_j = 1000000
searchProcessRemoteTime_r = 10
searchProcessRemoteCount_r = 1000
searchProcessRemoteTime_u = 30
searchProcessRemoteCount_u = 1000
searchProcessRemoteTime_o = 10
searchProcessRemoteCount_o = 1000
searchProcessRemoteTime_f = 5
searchProcessRemoteCount_f = 100
searchProcessRemoteTime_s = 10
searchProcessRemoteCount_s = 10
# path to ranking directory containing ranking reference files
rankingPath = DATA/RANKING
# a list of domain name patterns that should not be cached by the httpc dns cache
httpc.nameCacheNoCachingPatterns = .*.dyndns.org, .*.dynalias.org
#externalRedirectors
#squid Redirector compatible
externalRedirector=
svnRevision=0