2005-09-20 17:36:22 +02:00
// yacy.java
// -----------------------
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.yacy.net
// Frankfurt, Germany, 2004, 2005
2005-04-07 21:19:42 +02:00
//
2005-09-20 17:36:22 +02:00
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
2005-04-07 21:19:42 +02:00
//
2005-09-20 17:36:22 +02:00
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
2005-04-07 21:19:42 +02:00
//
2005-09-20 17:36:22 +02:00
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
2005-04-07 21:19:42 +02:00
//
2005-09-20 17:36:22 +02:00
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
2005-04-07 21:19:42 +02:00
//
2005-09-20 17:36:22 +02:00
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this softare or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follows this copyright notice here, but changes must not be
// done inside the copyright notive above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
2006-06-01 01:31:46 +02:00
2005-12-07 02:40:52 +01:00
import java.io.BufferedOutputStream ;
2005-05-05 07:32:19 +02:00
import java.io.BufferedReader ;
import java.io.BufferedWriter ;
import java.io.ByteArrayOutputStream ;
import java.io.File ;
import java.io.FileInputStream ;
import java.io.FileNotFoundException ;
2005-12-07 02:40:52 +01:00
import java.io.FileOutputStream ;
2005-05-05 07:32:19 +02:00
import java.io.FileWriter ;
import java.io.IOException ;
import java.io.InputStreamReader ;
import java.io.PrintWriter ;
import java.util.HashMap ;
import java.util.Iterator ;
2006-01-30 09:28:22 +01:00
import java.util.Map ;
2005-05-05 07:32:19 +02:00
import java.util.Properties ;
import java.util.TreeSet ;
2005-06-07 10:31:49 +02:00
import java.util.regex.Matcher ;
import java.util.regex.Pattern ;
2005-12-17 16:43:13 +01:00
import java.util.zip.ZipEntry ;
import java.util.zip.ZipOutputStream ;
2005-09-27 18:28:55 +02:00
2005-07-06 16:48:41 +02:00
import de.anomic.data.translator ;
2005-05-05 07:32:19 +02:00
import de.anomic.http.httpHeader ;
import de.anomic.http.httpc ;
import de.anomic.http.httpd ;
2006-05-28 03:09:31 +02:00
import de.anomic.index.indexContainer ;
2006-11-08 17:17:47 +01:00
import de.anomic.index.indexRWIEntry ;
2008-01-23 22:23:17 +01:00
import de.anomic.index.indexRWIRowEntry ;
2006-11-05 03:10:40 +01:00
import de.anomic.index.indexURLEntry ;
2007-05-09 19:59:36 +02:00
import de.anomic.kelondro.kelondroBase64Order ;
2006-01-30 09:28:22 +01:00
import de.anomic.kelondro.kelondroDyn ;
2005-05-05 07:32:19 +02:00
import de.anomic.kelondro.kelondroMScoreCluster ;
2007-01-30 00:51:10 +01:00
import de.anomic.kelondro.kelondroMapObjects ;
2008-02-27 16:16:47 +01:00
import de.anomic.kelondro.kelondroRowCollection ;
2006-11-23 03:16:30 +01:00
import de.anomic.plasma.plasmaCondenser ;
2006-09-30 00:27:20 +02:00
import de.anomic.plasma.plasmaCrawlLURL ;
2005-05-05 07:32:19 +02:00
import de.anomic.plasma.plasmaSwitchboard ;
2005-10-05 12:45:33 +02:00
import de.anomic.plasma.plasmaWordIndex ;
2005-05-05 07:32:19 +02:00
import de.anomic.server.serverCore ;
2005-09-27 18:28:55 +02:00
import de.anomic.server.serverDate ;
2005-05-05 07:32:19 +02:00
import de.anomic.server.serverFileUtils ;
2008-03-02 16:42:50 +01:00
import de.anomic.server.serverMemory ;
2007-05-05 17:41:05 +02:00
import de.anomic.server.serverSemaphore ;
2005-05-05 07:32:19 +02:00
import de.anomic.server.serverSystem ;
2005-06-09 11:46:43 +02:00
import de.anomic.server.logging.serverLog ;
2005-05-05 07:32:19 +02:00
import de.anomic.tools.enumerateFiles ;
2007-10-16 04:12:31 +02:00
import de.anomic.tools.yFormatter ;
2005-11-11 00:48:20 +01:00
import de.anomic.yacy.yacyClient ;
2006-01-30 09:28:22 +01:00
import de.anomic.yacy.yacySeedDB ;
2007-09-05 11:01:35 +02:00
import de.anomic.yacy.yacyURL ;
2007-04-27 11:23:44 +02:00
import de.anomic.yacy.yacyVersion ;
2005-04-07 21:19:42 +02:00
2005-08-02 21:40:29 +02:00
/**
 * This is the main class of YaCy. Several threads are started from here:
 * <ul>
 * <li>one single instance of the plasmaSwitchboard is generated, which itself
 * starts a thread with a plasmaHTMLCache object. This object simply counts
 * file sizes in the cache and terminates them. It also generates a
 * plasmaCrawlerLoader object, which may itself start some more httpc-calling
 * threads to load web pages. They terminate automatically when a page has
 * loaded.
 * <li>one serverCore-thread is started, which implements a multi-threaded
 * server. The process may start itself many more processes that handle
 * connections.
 * <li>finally, all idle-dependent processes are written in a queue in
 * plasmaSwitchboard which are worked off inside an idle-sensitive loop of the
 * main process. (here)
 * </ul>
 *
 * On termination, the following must be done:
 * <ul>
 * <li>stop feeding of the crawling process because it otherwise fills the
 * indexing queue.
 * <li>say goodbye to connected peers and disable new connections. Don't wait for
 * success.
 * <li>first terminate the serverCore thread. This prevents new cache
 * objects from being queued.
 * <li>wait until the plasmaHTMLCache terminates (it should be normal that this
 * process already has terminated).
 * <li>then wait for termination of all loader processes of the
 * plasmaCrawlerLoader.
 * <li>work off the indexing and cache storage queue. These values are inside a
 * RAM cache and would be lost otherwise.
 * <li>write all settings.
 * <li>terminate.
 * </ul>
 */
2005-09-20 17:36:22 +02:00
public final class yacy {
2007-07-11 01:56:25 +02:00
2005-04-07 21:19:42 +02:00
// static objects
2007-07-11 01:56:25 +02:00
public static final String vString = " @REPL_VERSION@ " ;
public static double version = 0 . 1 ;
public static boolean pro ;
2007-06-28 16:52:26 +02:00
2007-07-11 01:56:25 +02:00
public static final String vDATE = " @REPL_DATE@ " ;
public static final String copyright = " [ YaCy v " + vString + " , build " + vDATE + " by Michael Christen / www.yacy.net ] " ;
public static final String hline = " ------------------------------------------------------------------------------- " ;
2005-10-05 18:35:05 +02:00
2007-05-06 10:22:18 +02:00
/ * *
* a reference to the { @link plasmaSwitchboard } created by the
* { @link yacy # startup ( String , long , long ) } method .
* /
private static plasmaSwitchboard sb = null ;
/ * *
* Semaphore needed by { @link yacy # setUpdaterCallback ( serverUpdaterCallback ) } to block
* until the { @link plasmaSwitchboard } object was created .
* /
private static serverSemaphore sbSync = new serverSemaphore ( 0 ) ;
/ * *
* Semaphore needed by { @link yacy # waitForFinishedStartup ( ) } to block
* until startup has finished
* /
private static serverSemaphore startupFinishedSync = new serverSemaphore ( 0 ) ;
2005-08-02 21:40:29 +02:00
/ * *
* Starts up the whole application . Sets up all datastructures and starts
* the main threads .
*
* @param homePath Root - path where all information is to be found .
2005-09-20 12:10:34 +02:00
* @param startupFree free memory at startup time , to be used later for statistics
2005-08-02 21:40:29 +02:00
* /
2008-02-01 00:40:47 +01:00
private static void startup ( File homePath , long startupMemFree , long startupMemTotal ) {
2006-02-12 17:46:43 +01:00
int oldRev = 0 ;
int newRev = 0 ;
2007-01-04 23:03:32 +01:00
2005-05-11 11:44:36 +02:00
try {
// start up
System . out . println ( copyright ) ;
System . out . println ( hline ) ;
2005-08-02 21:40:29 +02:00
2005-04-07 21:19:42 +02:00
// check java version
try {
2005-12-05 10:13:13 +01:00
/*String[] check =*/ " a,b " . split ( " , " ) ; // split needs java 1.4
2005-04-07 21:19:42 +02:00
} catch ( NoSuchMethodError e ) {
2006-03-13 21:12:31 +01:00
System . err . println ( " STARTUP: Java Version too low. You need at least Java 1.4.2 to run YaCy " ) ;
2005-12-05 01:17:12 +01:00
Thread . sleep ( 3000 ) ;
2005-04-07 21:19:42 +02:00
System . exit ( - 1 ) ;
}
2005-09-27 18:28:55 +02:00
2006-10-11 20:27:38 +02:00
// ensure that there is a DATA directory, if not, create one and if that fails warn and die
2008-02-01 00:40:47 +01:00
File f = homePath ; if ( ! ( f . exists ( ) ) ) f . mkdirs ( ) ;
2005-09-27 18:28:55 +02:00
f = new File ( homePath , " DATA/ " ) ; if ( ! ( f . exists ( ) ) ) f . mkdirs ( ) ;
2006-10-11 20:27:38 +02:00
if ( ! ( f . exists ( ) ) ) {
2006-12-17 19:06:39 +01:00
System . err . println ( " Error creating DATA-directory in " + homePath . toString ( ) + " . Please check your write-permission for this folder. YaCy will now terminate. " ) ;
System . exit ( - 1 ) ;
2006-10-11 20:27:38 +02:00
}
2005-09-27 18:28:55 +02:00
2005-06-09 11:46:43 +02:00
// setting up logging
2005-09-27 18:28:55 +02:00
f = new File ( homePath , " DATA/LOG/ " ) ; if ( ! ( f . exists ( ) ) ) f . mkdirs ( ) ;
if ( ! ( ( new File ( homePath , " DATA/LOG/yacy.logging " ) ) . exists ( ) ) ) try {
serverFileUtils . copy ( new File ( homePath , " yacy.logging " ) , new File ( homePath , " DATA/LOG/yacy.logging " ) ) ;
} catch ( IOException e ) {
System . out . println ( " could not copy yacy.logging " ) ;
}
try {
2008-02-01 00:40:47 +01:00
serverLog . configureLogging ( homePath , new File ( homePath , " DATA/LOG/yacy.logging " ) ) ;
2005-06-13 14:01:58 +02:00
} catch ( IOException e ) {
System . out . println ( " could not find logging properties in homePath= " + homePath ) ;
e . printStackTrace ( ) ;
}
2007-06-29 14:46:08 +02:00
serverLog . logConfig ( " STARTUP " , " Java version: " + System . getProperty ( " java.version " , " no-java-version " ) ) ;
serverLog . logConfig ( " STARTUP " , " Operation system: " + System . getProperty ( " os.name " , " unknown " ) ) ;
serverLog . logConfig ( " STARTUP " , " Application root-ath: " + homePath ) ;
serverLog . logConfig ( " STARTUP " , " Time zone: UTC " + serverDate . UTCDiffString ( ) + " ; UTC+0000 is " + System . currentTimeMillis ( ) ) ;
2006-03-23 21:12:23 +01:00
serverLog . logConfig ( " STARTUP " , " Maximum file system path length: " + serverSystem . maxPathLength ) ;
2007-06-08 14:45:03 +02:00
f = new File ( homePath , " DATA/yacy.running " ) ;
if ( f . exists ( ) ) { // another instance running? VM crash? User will have to care about this
2007-06-22 16:29:14 +02:00
serverLog . logSevere ( " STARTUP " , " WARNING: the file " + f + " exists, this usually means that a YaCy instance is still running " ) ;
2007-06-13 22:55:48 +02:00
f . delete ( ) ;
2007-06-08 14:45:03 +02:00
}
2007-06-13 22:55:48 +02:00
f . createNewFile ( ) ;
f . deleteOnExit ( ) ;
2007-06-28 16:52:26 +02:00
pro = new File ( homePath , " libx " ) . exists ( ) ;
2007-06-16 16:11:52 +02:00
sb = new plasmaSwitchboard ( homePath , " yacy.init " , " DATA/SETTINGS/httpProxy.conf " , pro ) ;
2007-05-05 17:41:05 +02:00
sbSync . V ( ) ; // signal that the sb reference was set
2005-10-05 18:35:05 +02:00
2005-09-20 12:10:34 +02:00
// save information about available memory at startup time
2005-09-21 02:12:37 +02:00
sb . setConfig ( " memoryFreeAfterStartup " , startupMemFree ) ;
sb . setConfig ( " memoryTotalAfterStartup " , startupMemTotal ) ;
2005-09-20 12:10:34 +02:00
2005-06-30 00:55:37 +02:00
// hardcoded, forced, temporary value-migration
2005-09-21 02:12:37 +02:00
sb . setConfig ( " htTemplatePath " , " htroot/env/templates " ) ;
sb . setConfig ( " parseableExt " , " html,htm,txt,php,shtml,asp " ) ;
2005-08-02 21:40:29 +02:00
2005-06-07 10:31:49 +02:00
// if we are running an SVN version, we try to detect the used svn revision now ...
2005-09-20 17:36:22 +02:00
final Properties buildProp = new Properties ( ) ;
2005-07-03 14:40:36 +02:00
File buildPropFile = null ;
try {
buildPropFile = new File ( homePath , " build.properties " ) ;
buildProp . load ( new FileInputStream ( buildPropFile ) ) ;
} catch ( Exception e ) {
2005-09-20 17:36:22 +02:00
serverLog . logWarning ( " STARTUP " , buildPropFile . toString ( ) + " not found in settings path " ) ;
2005-07-03 14:40:36 +02:00
}
2006-02-12 17:46:43 +01:00
oldRev = Integer . parseInt ( sb . getConfig ( " svnRevision " , " 0 " ) ) ;
2005-06-30 21:39:19 +02:00
try {
if ( buildProp . containsKey ( " releaseNr " ) ) {
2005-09-20 17:36:22 +02:00
// this normally looks like this: $Revision$
final String svnReleaseNrStr = buildProp . getProperty ( " releaseNr " ) ;
final Pattern pattern = Pattern . compile ( " \\ $Revision: \\ s(.*) \\ s \\ $ " , Pattern . DOTALL + Pattern . CASE_INSENSITIVE ) ;
final Matcher matcher = pattern . matcher ( svnReleaseNrStr ) ;
2005-06-30 21:39:19 +02:00
if ( matcher . find ( ) ) {
2005-09-20 17:36:22 +02:00
final String svrReleaseNr = matcher . group ( 1 ) ;
2005-07-03 14:40:36 +02:00
try {
2006-08-28 19:54:07 +02:00
try { version = Double . parseDouble ( vString ) ; } catch ( NumberFormatException e ) { version = ( float ) 0 . 1 ; }
2007-07-12 18:23:33 +02:00
version = yacyVersion . versvn2combinedVersion ( version , Integer . parseInt ( svrReleaseNr ) ) ;
2005-07-03 14:40:36 +02:00
} catch ( NumberFormatException e ) { }
2005-09-21 02:12:37 +02:00
sb . setConfig ( " svnRevision " , svrReleaseNr ) ;
2005-06-07 10:31:49 +02:00
}
}
2006-02-12 17:46:43 +01:00
newRev = Integer . parseInt ( sb . getConfig ( " svnRevision " , " 0 " ) ) ;
2005-06-30 21:39:19 +02:00
} catch ( Exception e ) {
System . err . println ( " Unable to determine the currently used SVN revision number. " ) ;
2005-06-07 10:31:49 +02:00
}
2005-08-02 21:40:29 +02:00
2006-08-28 19:54:07 +02:00
sb . setConfig ( " version " , Double . toString ( version ) ) ;
2007-07-12 18:23:33 +02:00
sb . setConfig ( " vString " , yacyVersion . combined2prettyVersion ( Double . toString ( version ) ) ) ;
2007-12-19 20:39:19 +01:00
sb . setConfig ( " vdate " , ( vDATE . startsWith ( " @ " ) ) ? serverDate . formatShortDay ( ) : vDATE ) ;
2008-02-01 00:40:47 +01:00
sb . setConfig ( " applicationRoot " , homePath . toString ( ) ) ;
2007-07-17 01:47:21 +02:00
serverLog . logConfig ( " STARTUP " , " YACY Version: " + version + " , Built " + sb . getConfig ( " vdate " , " 00000000 " ) ) ;
2007-04-27 11:23:44 +02:00
yacyVersion . latestRelease = version ;
2005-08-02 21:40:29 +02:00
2005-05-11 11:44:36 +02:00
// read environment
2007-05-09 16:30:01 +02:00
int timeout = Math . max ( 20000 , Integer . parseInt ( sb . getConfig ( " httpdTimeout " , " 20000 " ) ) ) ;
2005-08-02 21:40:29 +02:00
2005-04-07 21:19:42 +02:00
// create some directories
2005-09-22 22:25:56 +02:00
final File htRootPath = new File ( homePath , sb . getConfig ( " htRootPath " , " htroot " ) ) ;
2007-11-04 11:36:25 +01:00
final File htDocsPath = sb . getConfigPath ( plasmaSwitchboard . HTDOCS_PATH , plasmaSwitchboard . HTDOCS_PATH_DEFAULT ) ;
2007-03-21 13:23:27 +01:00
if ( ! ( htDocsPath . exists ( ) ) ) htDocsPath . mkdir ( ) ;
2005-12-05 10:13:13 +01:00
//final File htTemplatePath = new File(homePath, sb.getConfig("htTemplatePath","htdocs"));
2005-08-02 21:40:29 +02:00
2005-08-02 02:16:19 +02:00
// create default notifier picture
2006-02-03 22:21:42 +01:00
//TODO: Use templates instead of copying images ...
2006-02-04 11:50:22 +01:00
if ( ! ( ( new File ( htDocsPath , " notifier.gif " ) ) . exists ( ) ) ) try {
2005-09-01 17:27:41 +02:00
serverFileUtils . copy ( new File ( htRootPath , " env/grafics/empty.gif " ) ,
2006-02-03 22:21:42 +01:00
new File ( htDocsPath , " notifier.gif " ) ) ;
2005-08-02 02:16:19 +02:00
} catch ( IOException e ) { }
2005-08-02 21:40:29 +02:00
2008-02-18 17:38:06 +01:00
final File htdocsReadme = new File ( htDocsPath , " readme.txt " ) ;
if ( ! ( htdocsReadme . exists ( ) ) ) try { serverFileUtils . write ( (
2005-05-11 11:44:36 +02:00
" This is your root directory for individual Web Content \ r \ n " +
" \ r \ n " +
" Please place your html files into the www subdirectory. \ r \ n " +
" The URL of that path is either \ r \ n " +
" http://www.<your-peer-name>.yacy or \ r \ n " +
" http://<your-ip>:<your-port>/www \ r \ n " +
" \ r \ n " +
" Other subdirectories may be created; they map to corresponding sub-domains. \ r \ n " +
" This directory shares it's content with the applications htroot path, so you \ r \ n " +
" may access your yacy search page with \ r \ n " +
" http://<your-peer-name>.yacy/ \ r \ n " +
2008-02-18 17:38:06 +01:00
" \ r \ n " ) . getBytes ( ) , htdocsReadme ) ; } catch ( IOException e ) {
2005-07-03 14:40:36 +02:00
System . out . println ( " Error creating htdocs readme: " + e . getMessage ( ) ) ;
}
2005-08-02 21:40:29 +02:00
2005-09-20 17:36:22 +02:00
final File wwwDefaultPath = new File ( htDocsPath , " www " ) ;
2005-04-07 21:19:42 +02:00
if ( ! ( wwwDefaultPath . exists ( ) ) ) wwwDefaultPath . mkdir ( ) ;
2005-08-02 21:40:29 +02:00
2005-09-20 17:36:22 +02:00
final File shareDefaultPath = new File ( htDocsPath , " share " ) ;
2005-04-07 21:19:42 +02:00
if ( ! ( shareDefaultPath . exists ( ) ) ) shareDefaultPath . mkdir ( ) ;
2005-08-02 21:40:29 +02:00
2006-02-12 17:46:43 +01:00
migration . migrate ( sb , oldRev , newRev ) ;
2005-12-06 23:30:15 +01:00
2005-05-11 11:44:36 +02:00
// start main threads
2006-03-04 12:07:01 +01:00
final String port = sb . getConfig ( " port " , " 8080 " ) ;
2005-05-11 11:44:36 +02:00
try {
2007-08-09 23:58:38 +02:00
final httpd protocolHandler = new httpd ( sb ) ;
2005-11-15 16:03:15 +01:00
final serverCore server = new serverCore (
2005-05-11 11:44:36 +02:00
timeout /*control socket timeout in milliseconds*/ ,
true /* block attacks (wrong protocol) */ ,
protocolHandler /*command class*/ ,
2005-09-21 02:12:37 +02:00
sb ,
2005-06-10 11:19:24 +02:00
30000 /*command max length incl. GET args*/ ) ;
2005-05-11 11:44:36 +02:00
server . setName ( " httpd: " + port ) ;
2005-07-03 14:40:36 +02:00
server . setPriority ( Thread . MAX_PRIORITY ) ;
2005-10-31 11:46:13 +01:00
server . setObeyIntermission ( false ) ;
2005-05-11 11:44:36 +02:00
if ( server = = null ) {
2005-08-30 23:32:59 +02:00
serverLog . logSevere ( " STARTUP " , " Failed to start server. Probably port " + port + " already in use. " ) ;
2005-05-11 11:44:36 +02:00
} else {
// first start the server
2005-09-21 02:12:37 +02:00
sb . deployThread ( " 10_httpd " , " HTTPD Server/Proxy " , " the HTTPD, used as web server and proxy " , null , server , 0 , 0 , 0 , 0 ) ;
2005-05-11 11:44:36 +02:00
//server.start();
2005-08-02 21:40:29 +02:00
2005-05-11 11:44:36 +02:00
// open the browser window
2005-09-21 02:12:37 +02:00
final boolean browserPopUpTrigger = sb . getConfig ( " browserPopUpTrigger " , " true " ) . equals ( " true " ) ;
2005-05-11 11:44:36 +02:00
if ( browserPopUpTrigger ) {
2006-03-01 23:27:20 +01:00
String browserPopUpPage = sb . getConfig ( " browserPopUpPage " , " ConfigBasic.html " ) ;
2007-02-05 20:46:50 +01:00
boolean properPW = ( sb . getConfig ( " adminAccount " , " " ) . length ( ) = = 0 ) & & ( sb . getConfig ( httpd . ADMIN_ACCOUNT_B64MD5 , " " ) . length ( ) > 0 ) ;
2006-03-01 23:27:20 +01:00
if ( ! properPW ) browserPopUpPage = " ConfigBasic.html " ;
2005-09-21 02:12:37 +02:00
final String browserPopUpApplication = sb . getConfig ( " browserPopUpApplication " , " netscape " ) ;
2006-05-20 16:05:49 +02:00
serverSystem . openBrowser ( ( server . withSSL ( ) ? " https " : " http " ) + " ://localhost: " + serverCore . getPortNr ( port ) + " / " + browserPopUpPage , browserPopUpApplication ) ;
2005-05-11 11:44:36 +02:00
}
2005-08-02 21:40:29 +02:00
2007-07-04 12:32:30 +02:00
// Copy the shipped locales into DATA, existing files are overwritten
2007-11-04 11:36:25 +01:00
final File locale_work = sb . getConfigPath ( " locale.work " , " DATA/LOCALE/locales " ) ;
final File locale_source = sb . getConfigPath ( " locale.source " , " locales " ) ;
2005-05-27 10:36:07 +02:00
try {
2007-07-04 12:32:30 +02:00
final File [ ] locale_source_files = locale_source . listFiles ( ) ;
locale_work . mkdirs ( ) ;
File target ;
for ( int i = 0 ; i < locale_source_files . length ; i + + ) {
target = new File ( locale_work , locale_source_files [ i ] . getName ( ) ) ;
if ( locale_source_files [ i ] . getName ( ) . endsWith ( " .lng " ) ) {
if ( target . exists ( ) ) target . delete ( ) ;
serverFileUtils . copy ( locale_source_files [ i ] , target ) ;
}
2005-05-27 10:36:07 +02:00
}
2007-07-04 12:32:30 +02:00
serverLog . logInfo ( " STARTUP " , " Copied the default locales to " + locale_work . toString ( ) ) ;
2005-05-27 10:36:07 +02:00
} catch ( NullPointerException e ) {
2005-08-30 23:32:59 +02:00
serverLog . logSevere ( " STARTUP " , " Nullpointer Exception while copying the default Locales " ) ;
2005-05-27 10:36:07 +02:00
}
2005-08-02 21:40:29 +02:00
2005-07-03 14:40:36 +02:00
//regenerate Locales from Translationlist, if needed
2007-07-04 12:32:30 +02:00
final String lang = sb . getConfig ( " locale.language " , " " ) ;
if ( ! lang . equals ( " " ) & & ! lang . equals ( " default " ) ) { //locale is used
2005-07-03 14:40:36 +02:00
String currentRev = " " ;
try {
2007-11-04 11:36:25 +01:00
final BufferedReader br = new BufferedReader ( new InputStreamReader ( new FileInputStream ( new File ( sb . getConfigPath ( " locale.translated_html " , " DATA/LOCALE/htroot " ) , lang + " /version " ) ) ) ) ;
2005-07-03 14:40:36 +02:00
currentRev = br . readLine ( ) ;
br . close ( ) ;
} catch ( IOException e ) {
//Error
}
2005-08-02 21:40:29 +02:00
2007-07-04 12:32:30 +02:00
if ( ! currentRev . equals ( sb . getConfig ( " svnRevision " , " " ) ) ) try { //is this another version?!
final File sourceDir = new File ( sb . getConfig ( " htRootPath " , " htroot " ) ) ;
2007-11-04 11:36:25 +01:00
final File destDir = new File ( sb . getConfigPath ( " locale.translated_html " , " DATA/LOCALE/htroot " ) , lang ) ;
2007-07-04 12:32:30 +02:00
if ( translator . translateFilesRecursive ( sourceDir , destDir , new File ( locale_work , lang + " .lng " ) , " html,template,inc " , " locale " ) ) { //translate it
//write the new Versionnumber
final BufferedWriter bw = new BufferedWriter ( new PrintWriter ( new FileWriter ( new File ( destDir , " version " ) ) ) ) ;
bw . write ( sb . getConfig ( " svnRevision " , " Error getting Version " ) ) ;
bw . close ( ) ;
2005-07-03 14:40:36 +02:00
}
2007-07-04 12:32:30 +02:00
} catch ( IOException e ) { }
2005-07-03 14:40:36 +02:00
}
2007-10-16 04:12:31 +02:00
// initialize number formatter with this locale
yFormatter . setLocale ( lang ) ;
2005-05-11 11:44:36 +02:00
// registering shutdown hook
2005-08-30 23:10:39 +02:00
serverLog . logConfig ( " STARTUP " , " Registering Shutdown Hook " ) ;
2005-09-20 17:36:22 +02:00
final Runtime run = Runtime . getRuntime ( ) ;
2005-09-21 02:12:37 +02:00
run . addShutdownHook ( new shutdownHookThread ( Thread . currentThread ( ) , sb ) ) ;
2005-08-02 21:40:29 +02:00
2005-09-20 12:10:34 +02:00
// save information about available memory after all initializations
2006-01-31 00:07:20 +01:00
//try {
2008-03-02 16:42:50 +01:00
sb . setConfig ( " memoryFreeAfterInitBGC " , serverMemory . free ( ) ) ;
sb . setConfig ( " memoryTotalAfterInitBGC " , serverMemory . total ( ) ) ;
2006-01-30 13:42:06 +01:00
System . gc ( ) ;
2008-03-02 16:42:50 +01:00
sb . setConfig ( " memoryFreeAfterInitAGC " , serverMemory . free ( ) ) ;
sb . setConfig ( " memoryTotalAfterInitAGC " , serverMemory . total ( ) ) ;
2006-01-31 00:07:20 +01:00
//} catch (ConcurrentModificationException e) {}
2005-09-21 14:21:01 +02:00
2007-05-06 10:22:18 +02:00
// signal finished startup
startupFinishedSync . V ( ) ;
2005-05-11 11:44:36 +02:00
// wait for server shutdown
try {
2005-09-21 02:12:37 +02:00
sb . waitForShutdown ( ) ;
2005-05-11 11:44:36 +02:00
} catch ( Exception e ) {
2006-03-13 21:12:31 +01:00
serverLog . logSevere ( " MAIN CONTROL LOOP " , " PANIC: " + e . getMessage ( ) , e ) ;
2005-04-07 21:19:42 +02:00
}
// shut down
2008-02-27 16:16:47 +01:00
if ( kelondroRowCollection . sortingthread ! = null ) kelondroRowCollection . sortingthread . terminate ( ) ;
2005-08-30 23:10:39 +02:00
serverLog . logConfig ( " SHUTDOWN " , " caught termination signal " ) ;
2005-04-07 21:19:42 +02:00
server . terminate ( false ) ;
2005-05-11 11:44:36 +02:00
server . interrupt ( ) ;
if ( server . isAlive ( ) ) try {
2007-09-05 11:01:35 +02:00
yacyURL u = new yacyURL ( ( server . withSSL ( ) ? " https " : " http " ) + " ://localhost: " + serverCore . getPortNr ( port ) , null ) ;
2007-07-04 00:55:47 +02:00
httpc . wget ( u , u . getHost ( ) , 1000 , null , null , null , null , null ) ; // kick server
2005-08-30 23:10:39 +02:00
serverLog . logConfig ( " SHUTDOWN " , " sent termination signal to server socket " ) ;
2005-04-07 21:19:42 +02:00
} catch ( IOException ee ) {
2005-08-30 23:10:39 +02:00
serverLog . logConfig ( " SHUTDOWN " , " termination signal to server socket missed (server shutdown, ok) " ) ;
2005-04-07 21:19:42 +02:00
}
2005-08-02 21:40:29 +02:00
2005-04-07 21:19:42 +02:00
// idle until the processes are down
while ( server . isAlive ( ) ) {
2005-12-05 10:13:13 +01:00
Thread . sleep ( 2000 ) ; // wait a while
2005-04-07 21:19:42 +02:00
}
2005-08-30 23:10:39 +02:00
serverLog . logConfig ( " SHUTDOWN " , " server has terminated " ) ;
2005-09-21 02:12:37 +02:00
sb . close ( ) ;
2005-04-07 21:19:42 +02:00
}
2005-05-11 11:44:36 +02:00
} catch ( Exception e ) {
2005-08-30 23:32:59 +02:00
serverLog . logSevere ( " STARTUP " , " Unexpected Error: " + e . getClass ( ) . getName ( ) , e ) ;
2005-05-11 11:44:36 +02:00
//System.exit(1);
}
} catch ( Exception ee ) {
2005-08-30 23:32:59 +02:00
serverLog . logSevere ( " STARTUP " , " FATAL ERROR: " + ee . getMessage ( ) , ee ) ;
2007-05-06 10:22:18 +02:00
} finally {
2007-05-06 10:54:02 +02:00
startupFinishedSync . V ( ) ;
2005-05-11 11:44:36 +02:00
}
2007-01-04 23:03:32 +01:00
serverLog . logConfig ( " SHUTDOWN " , " goodbye. (this is the last line) " ) ;
2007-05-03 21:16:43 +02:00
//try {
// System.exit(0);
//} catch (Exception e) {} // was once stopped by de.anomic.net.ftpc$sm.checkExit(ftpc.java:1790)
2005-04-07 21:19:42 +02:00
}
2005-08-02 21:40:29 +02:00
/ * *
* Loads the configuration from the data - folder .
* FIXME : Why is this called over and over again from every method , instead
* of setting the configurationdata once for this class in main ?
*
* @param mes Where are we called from , so that the errormessages can be
* more descriptive .
* @param homePath Root - path where all the information is to be found .
* @return Properties read from the configurationfile .
* /
2008-02-01 00:40:47 +01:00
private static Properties configuration ( String mes , File homePath ) {
2005-08-30 23:10:39 +02:00
serverLog . logConfig ( mes , " Application Root Path: " + homePath . toString ( ) ) ;
2005-08-02 21:40:29 +02:00
2005-04-07 21:19:42 +02:00
// read data folder
File dataFolder = new File ( homePath , " DATA " ) ;
if ( ! ( dataFolder . exists ( ) ) ) {
2005-08-30 23:32:59 +02:00
serverLog . logSevere ( mes , " Application was never started or root path wrong. " ) ;
2005-04-07 21:19:42 +02:00
System . exit ( - 1 ) ;
}
2005-08-02 21:40:29 +02:00
2005-04-07 21:19:42 +02:00
Properties config = new Properties ( ) ;
try {
config . load ( new FileInputStream ( new File ( homePath , " DATA/SETTINGS/httpProxy.conf " ) ) ) ;
} catch ( FileNotFoundException e ) {
2005-08-30 23:32:59 +02:00
serverLog . logSevere ( mes , " could not find configuration file. " ) ;
2005-04-07 21:19:42 +02:00
System . exit ( - 1 ) ;
} catch ( IOException e ) {
2005-08-30 23:32:59 +02:00
serverLog . logSevere ( mes , " could not read configuration file. " ) ;
2005-04-07 21:19:42 +02:00
System . exit ( - 1 ) ;
}
2005-08-02 21:40:29 +02:00
2005-04-07 21:19:42 +02:00
return config ;
}
2007-04-29 17:56:45 +02:00
public static void shutdown ( ) {
if ( sb ! = null ) {
// YaCy is running in the same runtime. we can shutdown via interrupt
sb . terminate ( ) ;
} else {
2008-02-01 00:40:47 +01:00
File applicationRoot = new File ( System . getProperty ( " user.dir " ) . replace ( '\\' , '/' ) ) ;
2007-04-29 17:56:45 +02:00
shutdown ( applicationRoot ) ;
}
2006-02-01 12:03:37 +01:00
}
2005-08-02 21:40:29 +02:00
/ * *
2006-05-26 14:18:12 +02:00
* Call the shutdown - page of YaCy to tell it to shut down . This method is
2005-08-02 21:40:29 +02:00
* called if you start yacy with the argument - shutdown .
*
* @param homePath Root - path where all the information is to be found .
* /
2008-02-01 00:40:47 +01:00
static void shutdown ( File homePath ) {
2005-04-07 21:19:42 +02:00
// start up
System . out . println ( copyright ) ;
System . out . println ( hline ) ;
2005-08-02 21:40:29 +02:00
2005-04-07 21:19:42 +02:00
Properties config = configuration ( " REMOTE-SHUTDOWN " , homePath ) ;
2005-08-02 21:40:29 +02:00
2005-04-07 21:19:42 +02:00
// read port
2006-03-04 12:07:01 +01:00
int port = serverCore . getPortNr ( config . getProperty ( " port " , " 8080 " ) ) ;
2005-08-02 21:40:29 +02:00
2005-04-07 21:19:42 +02:00
// read password
2007-02-05 20:46:50 +01:00
String encodedPassword = ( String ) config . get ( httpd . ADMIN_ACCOUNT_B64MD5 ) ;
2005-04-07 21:19:42 +02:00
if ( encodedPassword = = null ) encodedPassword = " " ; // not defined
2005-08-02 21:40:29 +02:00
2005-04-07 21:19:42 +02:00
// send 'wget' to web interface
2005-05-11 11:44:36 +02:00
httpHeader requestHeader = new httpHeader ( ) ;
2005-05-12 12:05:17 +02:00
requestHeader . put ( " Authorization " , " realm= " + encodedPassword ) ; // for http-authentify
2005-04-07 21:19:42 +02:00
try {
2007-09-23 22:49:52 +02:00
httpc con = new httpc ( " localhost " , " localhost " , port , 10000 , false , null , null , null ) ;
2005-04-07 21:19:42 +02:00
httpc . response res = con . GET ( " Steering.html?shutdown= " , requestHeader ) ;
2005-08-02 21:40:29 +02:00
2005-04-07 21:19:42 +02:00
// read response
if ( res . status . startsWith ( " 2 " ) ) {
2005-08-30 23:10:39 +02:00
serverLog . logConfig ( " REMOTE-SHUTDOWN " , " YACY accepted shutdown command. " ) ;
serverLog . logConfig ( " REMOTE-SHUTDOWN " , " Stand by for termination, which may last some seconds. " ) ;
2005-04-07 21:19:42 +02:00
ByteArrayOutputStream bos = new ByteArrayOutputStream ( ) ;
res . writeContent ( bos , null ) ;
con . close ( ) ;
} else {
2005-08-30 23:32:59 +02:00
serverLog . logSevere ( " REMOTE-SHUTDOWN " , " error response from YACY socket: " + res . status ) ;
2007-09-25 23:36:08 +02:00
con . close ( ) ;
2005-04-07 21:19:42 +02:00
System . exit ( - 1 ) ;
}
} catch ( IOException e ) {
2005-08-30 23:32:59 +02:00
serverLog . logSevere ( " REMOTE-SHUTDOWN " , " could not establish connection to YACY socket: " + e . getMessage ( ) ) ;
2005-04-07 21:19:42 +02:00
System . exit ( - 1 ) ;
}
2005-08-02 21:40:29 +02:00
2005-04-07 21:19:42 +02:00
// finished
2005-08-30 23:10:39 +02:00
serverLog . logConfig ( " REMOTE-SHUTDOWN " , " SUCCESSFULLY FINISHED remote-shutdown: " ) ;
serverLog . logConfig ( " REMOTE-SHUTDOWN " , " YACY will terminate after working off all enqueued tasks. " ) ;
2005-04-07 21:19:42 +02:00
}
2005-08-02 21:40:29 +02:00
/ * *
* This method gets all found words and outputs a statistic about the score
* of the words . The output of this method can be used to create stop - word
* lists . This method will be called if you start yacy with the argument
* - genwordstat .
* FIXME : How can stop - word list be created from this output ? What type of
* score is output ?
*
* @param homePath Root - Path where all the information is to be found .
* /
2008-02-01 00:40:47 +01:00
private static void genWordstat ( File homePath ) {
2005-04-07 21:19:42 +02:00
// start up
System . out . println ( copyright ) ;
System . out . println ( hline ) ;
2005-08-02 21:40:29 +02:00
2005-04-07 21:19:42 +02:00
Properties config = configuration ( " GEN-WORDSTAT " , homePath ) ;
2005-08-02 21:40:29 +02:00
2005-05-11 11:44:36 +02:00
// load words
2005-04-07 21:19:42 +02:00
serverLog . logInfo ( " GEN-WORDSTAT " , " loading words... " ) ;
2008-01-23 22:23:17 +01:00
HashMap < String , String > words = loadWordMap ( new File ( homePath , " yacy.words " ) ) ;
2005-08-02 21:40:29 +02:00
2005-04-07 21:19:42 +02:00
// find all hashes
serverLog . logInfo ( " GEN-WORDSTAT " , " searching all word-hash databases... " ) ;
File dbRoot = new File ( homePath , config . getProperty ( " dbPath " ) ) ;
2005-06-07 03:05:55 +02:00
enumerateFiles ef = new enumerateFiles ( new File ( dbRoot , " WORDS " ) , true , false , true , true ) ;
2005-04-07 21:19:42 +02:00
File f ;
String h ;
2007-12-28 19:47:45 +01:00
kelondroMScoreCluster < String > hs = new kelondroMScoreCluster < String > ( ) ;
2005-04-07 21:19:42 +02:00
while ( ef . hasMoreElements ( ) ) {
f = ( File ) ef . nextElement ( ) ;
2006-11-08 17:17:47 +01:00
h = f . getName ( ) . substring ( 0 , yacySeedDB . commonHashLength ) ;
2005-04-07 21:19:42 +02:00
hs . addScore ( h , ( int ) f . length ( ) ) ;
}
2005-08-02 21:40:29 +02:00
2005-04-07 21:19:42 +02:00
// list the hashes in reverse order
serverLog . logInfo ( " GEN-WORDSTAT " , " listing words in reverse size order... " ) ;
String w ;
2008-01-23 22:23:17 +01:00
Iterator < String > i = hs . scores ( false ) ;
2005-04-07 21:19:42 +02:00
while ( i . hasNext ( ) ) {
2008-01-23 22:23:17 +01:00
h = i . next ( ) ;
w = words . get ( h ) ;
2005-04-07 21:19:42 +02:00
if ( w = = null ) System . out . print ( " # " + h ) ; else System . out . print ( w ) ;
System . out . println ( " - " + hs . getScore ( h ) ) ;
}
2005-08-02 21:40:29 +02:00
2005-04-07 21:19:42 +02:00
// finished
2005-08-30 23:10:39 +02:00
serverLog . logConfig ( " GEN-WORDSTAT " , " FINISHED " ) ;
2005-04-07 21:19:42 +02:00
}
2005-10-13 14:31:32 +02:00
2006-06-09 07:38:59 +02:00
/ * *
* @param homePath path to the YaCy directory
* @param dbcache cache size in MB
* /
2008-02-01 00:40:47 +01:00
public static void minimizeUrlDB ( File homePath ) {
2006-02-21 15:10:00 +01:00
// run with "java -classpath classes yacy -minimizeUrlDB"
2008-02-01 00:40:47 +01:00
try { serverLog . configureLogging ( homePath , new File ( homePath , " DATA/LOG/yacy.logging " ) ) ; } catch ( Exception e ) { }
File indexPrimaryRoot = new File ( homePath , " DATA/INDEX " ) ;
File indexSecondaryRoot = new File ( homePath , " DATA/INDEX " ) ;
File indexRoot2 = new File ( homePath , " DATA/INDEX2 " ) ;
2006-04-20 10:20:12 +02:00
serverLog log = new serverLog ( " URL-CLEANUP " ) ;
2005-10-05 12:45:33 +02:00
try {
log . logInfo ( " STARTING URL CLEANUP " ) ;
// db containing all currently loades urls
2008-02-19 10:14:07 +01:00
plasmaCrawlLURL currentUrlDB = new plasmaCrawlLURL ( indexSecondaryRoot ) ;
2005-10-05 12:45:33 +02:00
// db used to hold all neede urls
2008-02-19 10:14:07 +01:00
plasmaCrawlLURL minimizedUrlDB = new plasmaCrawlLURL ( indexRoot2 ) ;
2005-10-05 12:45:33 +02:00
2008-03-02 16:42:50 +01:00
int cacheMem = ( int ) ( serverMemory . max ( ) - serverMemory . total ( ) ) ;
2006-12-05 03:47:51 +01:00
if ( cacheMem < 2048000 ) throw new OutOfMemoryError ( " Not enough memory available to start clean up. " ) ;
2006-06-09 07:38:59 +02:00
2008-02-19 10:14:07 +01:00
plasmaWordIndex wordIndex = new plasmaWordIndex ( indexPrimaryRoot , indexSecondaryRoot , log ) ;
2008-01-23 22:23:17 +01:00
Iterator < indexContainer > indexContainerIterator = wordIndex . wordContainers ( " AAAAAAAAAAAA " , false , false ) ;
2005-10-05 12:45:33 +02:00
long urlCounter = 0 , wordCounter = 0 ;
long wordChunkStart = System . currentTimeMillis ( ) , wordChunkEnd = 0 ;
2007-01-31 10:22:22 +01:00
String wordChunkStartHash = " AAAAAAAAAAAA " , wordChunkEndHash ;
2005-10-05 12:45:33 +02:00
2006-07-26 13:21:51 +02:00
while ( indexContainerIterator . hasNext ( ) ) {
2006-05-28 03:09:31 +02:00
indexContainer wordIdxContainer = null ;
2005-10-05 12:45:33 +02:00
try {
wordCounter + + ;
2008-01-23 22:23:17 +01:00
wordIdxContainer = indexContainerIterator . next ( ) ;
2005-10-05 12:45:33 +02:00
// the combined container will fit, read the container
2008-01-23 22:23:17 +01:00
Iterator < indexRWIRowEntry > wordIdxEntries = wordIdxContainer . entries ( ) ;
2006-11-08 17:17:47 +01:00
indexRWIEntry iEntry ;
2005-10-13 15:57:15 +02:00
while ( wordIdxEntries . hasNext ( ) ) {
2006-11-08 17:17:47 +01:00
iEntry = ( indexRWIEntry ) wordIdxEntries . next ( ) ;
2006-08-02 21:59:28 +02:00
String urlHash = iEntry . urlHash ( ) ;
2005-12-15 11:31:00 +01:00
if ( ( currentUrlDB . exists ( urlHash ) ) & & ( ! minimizedUrlDB . exists ( urlHash ) ) ) try {
2007-11-16 15:48:09 +01:00
indexURLEntry urlEntry = currentUrlDB . load ( urlHash , null , 0 ) ;
2005-12-15 11:31:00 +01:00
urlCounter + + ;
2006-10-16 17:04:16 +02:00
minimizedUrlDB . store ( urlEntry ) ;
2005-10-05 12:45:33 +02:00
if ( urlCounter % 500 = = 0 ) {
log . logInfo ( urlCounter + " URLs found so far. " ) ;
}
2005-12-15 11:31:00 +01:00
} catch ( IOException e ) { }
2005-10-05 12:45:33 +02:00
}
if ( wordCounter % 500 = = 0 ) {
2006-07-26 13:21:51 +02:00
wordChunkEndHash = wordIdxContainer . getWordHash ( ) ;
2005-10-05 12:45:33 +02:00
wordChunkEnd = System . currentTimeMillis ( ) ;
long duration = wordChunkEnd - wordChunkStart ;
log . logInfo ( wordCounter + " words scanned " +
" [ " + wordChunkStartHash + " .. " + wordChunkEndHash + " ] \ n " +
" Duration: " + 500 * 1000 / duration + " words/s " +
2008-03-02 16:42:50 +01:00
" | Free memory: " + serverMemory . free ( ) +
" | Total memory: " + serverMemory . total ( ) ) ;
2005-10-05 12:45:33 +02:00
wordChunkStart = wordChunkEnd ;
wordChunkStartHash = wordChunkEndHash ;
}
2006-07-26 13:21:51 +02:00
// we have read all elements, now we can close it
wordIdxContainer = null ;
2005-10-05 12:45:33 +02:00
} catch ( Exception e ) {
2006-04-20 10:20:12 +02:00
log . logSevere ( " Exception " , e ) ;
2005-10-05 12:45:33 +02:00
} finally {
2006-02-25 09:42:45 +01:00
if ( wordIdxContainer ! = null ) try { wordIdxContainer = null ; } catch ( Exception e ) { }
2005-10-05 12:45:33 +02:00
}
}
2006-10-22 09:09:45 +02:00
log . logInfo ( " current LURL DB contains " + currentUrlDB . size ( ) + " entries. " ) ;
log . logInfo ( " mimimized LURL DB contains " + minimizedUrlDB . size ( ) + " entries. " ) ;
2005-10-05 12:45:33 +02:00
currentUrlDB . close ( ) ;
minimizedUrlDB . close ( ) ;
2006-12-05 03:47:51 +01:00
wordIndex . close ( ) ;
2005-10-05 12:45:33 +02:00
2006-06-09 07:38:59 +02:00
// TODO: rename the mimimized UrlDB to the name of the previous UrlDB
2005-10-05 12:45:33 +02:00
log . logInfo ( " FINISHED URL CLEANUP, WAIT FOR DUMP " ) ;
2006-09-14 12:12:41 +02:00
log . logInfo ( " You can now backup your old URL DB and rename minimized/urlHash.db to urlHash.db " ) ;
2006-06-09 07:38:59 +02:00
2005-10-05 12:45:33 +02:00
log . logInfo ( " TERMINATED URL CLEANUP " ) ;
2006-06-09 07:38:59 +02:00
} catch ( Exception e ) {
log . logSevere ( " Exception: " + e . getMessage ( ) , e ) ;
} catch ( Error e ) {
log . logSevere ( " Error: " + e . getMessage ( ) , e ) ;
2005-10-05 12:45:33 +02:00
}
}
2005-08-02 21:40:29 +02:00
/ * *
* Reads all words from the given file and creates a hashmap , where key is
* the plasma word hash and value is the word itself .
*
* @param wordlist File where the words are stored .
* @return HashMap with the hash - word - relation .
* /
2008-01-23 22:23:17 +01:00
private static HashMap < String , String > loadWordMap ( File wordlist ) {
2005-05-11 11:44:36 +02:00
// returns a hash-word - Relation
2008-01-23 22:23:17 +01:00
HashMap < String , String > wordmap = new HashMap < String , String > ( ) ;
2005-04-07 21:19:42 +02:00
try {
String word ;
BufferedReader br = new BufferedReader ( new InputStreamReader ( new FileInputStream ( wordlist ) ) ) ;
2008-01-23 22:23:17 +01:00
while ( ( word = br . readLine ( ) ) ! = null ) wordmap . put ( plasmaCondenser . word2hash ( word ) , word ) ;
2005-04-07 21:19:42 +02:00
br . close ( ) ;
} catch ( IOException e ) { }
return wordmap ;
}
2005-08-02 21:40:29 +02:00
/ * *
* Cleans a wordlist in a file according to the length of the words . The
* file with the given filename is read and then only the words in the given
* length - range are written back to the file .
*
* @param wordlist Name of the file the words are stored in .
* @param minlength Minimal needed length for each word to be stored .
* @param maxlength Maximal allowed length for each word to be stored .
* /
2005-04-07 21:19:42 +02:00
private static void cleanwordlist ( String wordlist , int minlength , int maxlength ) {
// start up
System . out . println ( copyright ) ;
System . out . println ( hline ) ;
2005-08-30 23:10:39 +02:00
serverLog . logConfig ( " CLEAN-WORDLIST " , " START " ) ;
2005-08-02 21:40:29 +02:00
2005-04-07 21:19:42 +02:00
String word ;
2008-01-23 22:23:17 +01:00
TreeSet < String > wordset = new TreeSet < String > ( ) ;
2005-04-07 21:19:42 +02:00
int count = 0 ;
try {
BufferedReader br = new BufferedReader ( new InputStreamReader ( new FileInputStream ( wordlist ) ) ) ;
2005-05-11 11:44:36 +02:00
String seps = " ' .,:/-& " ;
2005-04-07 21:19:42 +02:00
while ( ( word = br . readLine ( ) ) ! = null ) {
word = word . toLowerCase ( ) . trim ( ) ;
2005-05-11 11:44:36 +02:00
for ( int i = 0 ; i < seps . length ( ) ; i + + ) {
2005-04-07 21:19:42 +02:00
if ( word . indexOf ( seps . charAt ( i ) ) > = 0 ) word = word . substring ( 0 , word . indexOf ( seps . charAt ( i ) ) ) ;
2005-05-11 11:44:36 +02:00
}
2005-04-07 21:19:42 +02:00
if ( ( word . length ( ) > = minlength ) & & ( word . length ( ) < = maxlength ) ) wordset . add ( word ) ;
count + + ;
}
br . close ( ) ;
2005-08-02 21:40:29 +02:00
2005-04-07 21:19:42 +02:00
if ( wordset . size ( ) ! = count ) {
count = count - wordset . size ( ) ;
BufferedWriter bw = new BufferedWriter ( new PrintWriter ( new FileWriter ( wordlist ) ) ) ;
while ( wordset . size ( ) > 0 ) {
word = ( String ) wordset . first ( ) ;
bw . write ( word + " \ n " ) ;
wordset . remove ( word ) ;
}
bw . close ( ) ;
serverLog . logInfo ( " CLEAN-WORDLIST " , " shrinked wordlist by " + count + " words. " ) ;
} else {
serverLog . logInfo ( " CLEAN-WORDLIST " , " not necessary to change wordlist " ) ;
}
} catch ( IOException e ) {
2005-08-30 23:32:59 +02:00
serverLog . logSevere ( " CLEAN-WORDLIST " , " ERROR: " + e . getMessage ( ) ) ;
2005-04-07 21:19:42 +02:00
System . exit ( - 1 ) ;
}
2005-08-02 21:40:29 +02:00
2005-04-07 21:19:42 +02:00
// finished
2005-08-30 23:10:39 +02:00
serverLog . logConfig ( " CLEAN-WORDLIST " , " FINISHED " ) ;
2005-04-07 21:19:42 +02:00
}
2005-08-02 21:40:29 +02:00
2005-11-11 00:48:20 +01:00
private static void transferCR ( String targetaddress , String crfile ) {
File f = new File ( crfile ) ;
try {
byte [ ] b = serverFileUtils . read ( f ) ;
String result = yacyClient . transfer ( targetaddress , f . getName ( ) , b ) ;
if ( result = = null )
serverLog . logInfo ( " TRANSFER-CR " , " transmitted file " + crfile + " to " + targetaddress + " successfully " ) ;
else
serverLog . logInfo ( " TRANSFER-CR " , " error transmitting file " + crfile + " to " + targetaddress + " : " + result ) ;
} catch ( IOException e ) {
serverLog . logInfo ( " TRANSFER-CR " , " could not read file " + crfile ) ;
}
}
2005-12-07 02:40:52 +01:00
/**
 * Returns a copy of the given array with <code>count</code> elements removed,
 * starting at index <code>pos</code>. The input array is not modified.
 *
 * @param args the source array
 * @param pos index of the first element to drop
 * @param count number of elements to drop
 * @return a new array of length <code>args.length - count</code>
 */
private static String[] shift(String[] args, int pos, int count) {
    final int tailStart = pos + count;
    final int tailLength = args.length - tailStart;
    final String[] remaining = new String[args.length - count];
    // head: everything in front of the removed range
    System.arraycopy(args, 0, remaining, 0, pos);
    // tail: everything behind the removed range, moved up to pos
    System.arraycopy(args, tailStart, remaining, pos, tailLength);
    return remaining;
}
2005-12-07 12:10:08 +01:00
/ * *
* Uses an Iteration over urlHash . db to detect malformed URL - Entries .
* Damaged URL - Entries will be marked in a HashSet and removed at the end of the function .
*
* @param homePath Root - Path where all information is to be found .
* /
2008-02-01 00:40:47 +01:00
private static void urldbcleanup ( File homePath ) {
File root = homePath ;
2006-10-19 23:14:37 +02:00
File indexroot = new File ( root , " DATA/INDEX " ) ;
2008-02-01 00:40:47 +01:00
try { serverLog . configureLogging ( homePath , new File ( homePath , " DATA/LOG/yacy.logging " ) ) ; } catch ( Exception e ) { }
2008-02-19 10:14:07 +01:00
plasmaCrawlLURL currentUrlDB = new plasmaCrawlLURL ( indexroot ) ;
2007-03-09 09:48:47 +01:00
currentUrlDB . urldbcleanup ( ) ;
currentUrlDB . close ( ) ;
2005-12-07 12:10:08 +01:00
}
2008-02-01 00:40:47 +01:00
private static void RWIHashList ( File homePath , String targetName , String resource , String format ) {
2006-01-14 00:59:04 +01:00
plasmaWordIndex WordIndex = null ;
2006-01-04 14:55:45 +01:00
serverLog log = new serverLog ( " HASHLIST " ) ;
2008-02-01 00:40:47 +01:00
File indexPrimaryRoot = new File ( homePath , " DATA/INDEX " ) ;
File indexSecondaryRoot = new File ( homePath , " DATA/INDEX " ) ;
2006-12-08 13:57:17 +01:00
String wordChunkStartHash = " AAAAAAAAAAAA " ;
2008-02-01 00:40:47 +01:00
try { serverLog . configureLogging ( homePath , new File ( homePath , " DATA/LOG/yacy.logging " ) ) ; } catch ( Exception e ) { }
2006-01-04 14:55:45 +01:00
log . logInfo ( " STARTING CREATION OF RWI-HASHLIST " ) ;
2008-02-01 00:40:47 +01:00
File root = homePath ;
2006-01-04 14:55:45 +01:00
try {
2008-01-23 22:23:17 +01:00
Iterator < indexContainer > indexContainerIterator = null ;
2006-01-10 02:04:22 +01:00
if ( resource . equals ( " all " ) ) {
2008-02-19 10:14:07 +01:00
WordIndex = new plasmaWordIndex ( indexPrimaryRoot , indexSecondaryRoot , log ) ;
2006-12-05 03:47:51 +01:00
indexContainerIterator = WordIndex . wordContainers ( wordChunkStartHash , false , false ) ;
}
2006-01-04 14:55:45 +01:00
int counter = 0 ;
2006-07-26 13:21:51 +02:00
indexContainer container = null ;
2006-01-15 11:29:48 +01:00
if ( format . equals ( " zip " ) ) {
log . logInfo ( " Writing Hashlist to ZIP-file: " + targetName + " .zip " ) ;
ZipEntry zipEntry = new ZipEntry ( targetName + " .txt " ) ;
File file = new File ( root , targetName + " .zip " ) ;
ZipOutputStream bos = new ZipOutputStream ( new FileOutputStream ( file ) ) ;
bos . putNextEntry ( zipEntry ) ;
2006-07-26 13:21:51 +02:00
while ( indexContainerIterator . hasNext ( ) ) {
2006-01-15 11:29:48 +01:00
counter + + ;
2008-01-23 22:23:17 +01:00
container = indexContainerIterator . next ( ) ;
2006-07-26 13:21:51 +02:00
bos . write ( ( container . getWordHash ( ) ) . getBytes ( ) ) ;
2007-12-14 20:17:54 +01:00
bos . write ( serverCore . CRLF ) ;
2006-01-15 11:29:48 +01:00
if ( counter % 500 = = 0 ) {
2006-07-26 13:21:51 +02:00
log . logInfo ( " Found " + counter + " Hashs until now. Last found Hash: " + container . getWordHash ( ) ) ;
2006-01-15 11:29:48 +01:00
}
}
2007-01-31 10:22:22 +01:00
bos . flush ( ) ;
2006-01-15 11:29:48 +01:00
bos . close ( ) ;
2006-07-26 13:21:51 +02:00
} else {
2006-01-15 11:29:48 +01:00
log . logInfo ( " Writing Hashlist to TXT-file: " + targetName + " .txt " ) ;
File file = new File ( root , targetName + " .txt " ) ;
BufferedOutputStream bos = new BufferedOutputStream ( new FileOutputStream ( file ) ) ;
2006-07-26 13:21:51 +02:00
while ( indexContainerIterator . hasNext ( ) ) {
2006-01-15 11:29:48 +01:00
counter + + ;
2008-01-23 22:23:17 +01:00
container = indexContainerIterator . next ( ) ;
2006-07-26 13:21:51 +02:00
bos . write ( ( container . getWordHash ( ) ) . getBytes ( ) ) ;
2007-12-14 20:17:54 +01:00
bos . write ( serverCore . CRLF ) ;
2006-01-15 11:29:48 +01:00
if ( counter % 500 = = 0 ) {
2006-07-26 13:21:51 +02:00
log . logInfo ( " Found " + counter + " Hashs until now. Last found Hash: " + container . getWordHash ( ) ) ;
2006-01-15 11:29:48 +01:00
}
2006-01-04 14:55:45 +01:00
}
2007-01-31 10:22:22 +01:00
bos . flush ( ) ;
2006-01-15 11:29:48 +01:00
bos . close ( ) ;
2006-01-04 14:55:45 +01:00
}
2006-07-26 13:21:51 +02:00
log . logInfo ( " Total number of Hashs: " + counter + " . Last found Hash: " + container . getWordHash ( ) ) ;
2006-01-04 14:55:45 +01:00
} catch ( IOException e ) {
2006-04-20 10:20:12 +02:00
log . logSevere ( " IOException " , e ) ;
2006-01-10 02:04:22 +01:00
}
2006-01-14 00:59:04 +01:00
if ( WordIndex ! = null ) {
2006-12-05 03:47:51 +01:00
WordIndex . close ( ) ;
2006-01-14 00:59:04 +01:00
WordIndex = null ;
}
2006-01-04 14:55:45 +01:00
}
2006-01-30 09:28:22 +01:00
/ * *
2007-07-19 17:32:10 +02:00
* Searching for peers affected by Bug
2006-01-30 09:28:22 +01:00
* @param homePath
* /
2008-02-01 00:40:47 +01:00
public static void testPeerDB ( File homePath ) {
2006-01-30 09:28:22 +01:00
try {
2008-02-01 00:40:47 +01:00
File yacyDBPath = new File ( homePath , " DATA/YACYDB " ) ;
2006-01-30 09:28:22 +01:00
String [ ] dbFileNames = { " seed.new.db " , " seed.old.db " , " seed.pot.db " } ;
for ( int i = 0 ; i < dbFileNames . length ; i + + ) {
File dbFile = new File ( yacyDBPath , dbFileNames [ i ] ) ;
2008-02-19 10:14:07 +01:00
kelondroMapObjects db = new kelondroMapObjects ( new kelondroDyn ( dbFile , true , true , yacySeedDB . commonHashLength , 480 , '#' , kelondroBase64Order . enhancedCoder , true , false , true ) , 500 , yacySeedDB . sortFields , yacySeedDB . longaccFields , yacySeedDB . doubleaccFields , null , null ) ;
2006-01-30 09:28:22 +01:00
2007-01-30 00:51:10 +01:00
kelondroMapObjects . mapIterator it ;
2006-01-30 09:28:22 +01:00
it = db . maps ( true , false ) ;
while ( it . hasNext ( ) ) {
2008-01-23 22:23:17 +01:00
Map < String , String > dna = it . next ( ) ;
2006-01-30 09:28:22 +01:00
String peerHash = ( String ) dna . get ( " key " ) ;
if ( peerHash . length ( ) < yacySeedDB . commonHashLength ) {
String peerName = ( String ) dna . get ( " Name " ) ;
String peerIP = ( String ) dna . get ( " IP " ) ;
String peerPort = ( String ) dna . get ( " Port " ) ;
while ( peerHash . length ( ) < yacySeedDB . commonHashLength ) { peerHash = peerHash + " _ " ; }
2006-01-30 09:31:14 +01:00
System . err . println ( " Invalid Peer-Hash found in ' " + dbFileNames [ i ] + " ': " + peerName + " : " + peerHash + " , http:// " + peerIP + " : " + peerPort ) ;
2006-01-30 09:28:22 +01:00
}
}
db . close ( ) ;
}
} catch ( Exception e ) {
e . printStackTrace ( ) ;
}
}
2006-01-10 02:04:22 +01:00
2005-08-02 21:40:29 +02:00
/ * *
2005-12-11 01:25:02 +01:00
* Main - method which is started by java . Checks for special arguments or
* starts up the application .
*
* @param args
* Given arguments from the command line .
* /
2005-04-07 21:19:42 +02:00
public static void main ( String args [ ] ) {
2005-10-12 14:34:08 +02:00
2006-10-23 02:59:55 +02:00
// check assertion status
//ClassLoader.getSystemClassLoader().setDefaultAssertionStatus(true);
boolean assertionenabled = false ;
assert assertionenabled = true ;
if ( assertionenabled ) System . out . println ( " Asserts are enabled " ) ;
2005-10-12 14:34:08 +02:00
// check memory amount
2005-09-21 02:12:37 +02:00
System . gc ( ) ;
2008-03-02 16:42:50 +01:00
long startupMemFree = serverMemory . free ( ) ;
long startupMemTotal = serverMemory . total ( ) ;
2006-08-18 03:33:54 +02:00
2005-10-12 14:34:08 +02:00
// go into headless awt mode
System . setProperty ( " java.awt.headless " , " true " ) ;
2005-09-21 14:21:01 +02:00
2008-02-01 00:40:47 +01:00
File applicationRoot = new File ( System . getProperty ( " user.dir " ) . replace ( '\\' , '/' ) ) ;
2005-07-01 01:19:08 +02:00
//System.out.println("args.length=" + args.length);
//System.out.print("args=["); for (int i = 0; i < args.length; i++) System.out.print(args[i] + ", "); System.out.println("]");
2006-07-19 13:20:22 +02:00
if ( ( args . length > = 1 ) & & ( ( args [ 0 ] . toLowerCase ( ) . equals ( " -startup " ) ) | | ( args [ 0 ] . equals ( " -start " ) ) ) ) {
2005-04-07 21:19:42 +02:00
// normal start-up of yacy
2008-02-01 00:40:47 +01:00
if ( args . length = = 2 ) applicationRoot = new File ( args [ 1 ] ) ;
2005-09-21 14:21:01 +02:00
startup ( applicationRoot , startupMemFree , startupMemTotal ) ;
2006-07-19 13:20:22 +02:00
} else if ( ( args . length > = 1 ) & & ( ( args [ 0 ] . toLowerCase ( ) . equals ( " -shutdown " ) ) | | ( args [ 0 ] . equals ( " -stop " ) ) ) ) {
2005-04-07 21:19:42 +02:00
// normal shutdown of yacy
2008-02-01 00:40:47 +01:00
if ( args . length = = 2 ) applicationRoot = new File ( args [ 1 ] ) ;
2005-04-07 21:19:42 +02:00
shutdown ( applicationRoot ) ;
2006-07-19 13:20:22 +02:00
} else if ( ( args . length > = 1 ) & & ( args [ 0 ] . toLowerCase ( ) . equals ( " -minimizeurldb " ) ) ) {
2005-10-05 12:45:33 +02:00
// migrate words from DATA/PLASMADB/WORDS path to assortment cache, if possible
// attention: this may run long and should not be interrupted!
2006-07-19 13:20:22 +02:00
if ( args . length > = 3 & & args [ 1 ] . toLowerCase ( ) . equals ( " -cache " ) ) {
2006-04-20 10:20:12 +02:00
args = shift ( args , 1 , 2 ) ;
}
2008-02-01 00:40:47 +01:00
if ( args . length = = 2 ) applicationRoot = new File ( args [ 1 ] ) ;
2007-03-06 23:43:32 +01:00
minimizeUrlDB ( applicationRoot ) ;
2006-07-19 13:20:22 +02:00
} else if ( ( args . length > = 1 ) & & ( args [ 0 ] . toLowerCase ( ) . equals ( " -testpeerdb " ) ) ) {
2006-01-30 09:28:22 +01:00
if ( args . length = = 2 ) {
2008-02-01 00:40:47 +01:00
applicationRoot = new File ( args [ 1 ] ) ;
2006-01-30 09:28:22 +01:00
} else if ( args . length > 2 ) {
System . err . println ( " Usage: -testPeerDB [homeDbRoot] " ) ;
}
testPeerDB ( applicationRoot ) ;
2006-07-19 13:20:22 +02:00
} else if ( ( args . length > = 1 ) & & ( args [ 0 ] . toLowerCase ( ) . equals ( " -genwordstat " ) ) ) {
2005-04-07 21:19:42 +02:00
// this can help to create a stop-word list
2005-05-11 11:44:36 +02:00
// to use this, you need a 'yacy.words' file in the root path
// start this with "java -classpath classes yacy -genwordstat [<rootdir>]"
2008-02-01 00:40:47 +01:00
if ( args . length = = 2 ) applicationRoot = new File ( args [ 1 ] ) ;
2005-04-07 21:19:42 +02:00
genWordstat ( applicationRoot ) ;
2006-07-19 13:20:22 +02:00
} else if ( ( args . length = = 4 ) & & ( args [ 0 ] . toLowerCase ( ) . equals ( " -cleanwordlist " ) ) ) {
2005-04-07 21:19:42 +02:00
// this can be used to organize and clean a word-list
2005-05-11 11:44:36 +02:00
// start this with "java -classpath classes yacy -cleanwordlist <word-file> <minlength> <maxlength>"
2005-04-07 21:19:42 +02:00
int minlength = Integer . parseInt ( args [ 2 ] ) ;
int maxlength = Integer . parseInt ( args [ 3 ] ) ;
cleanwordlist ( args [ 1 ] , minlength , maxlength ) ;
2006-07-19 13:20:22 +02:00
} else if ( ( args . length > = 1 ) & & ( args [ 0 ] . toLowerCase ( ) . equals ( " -transfercr " ) ) ) {
2005-11-11 00:48:20 +01:00
// transfer a single cr file to a remote peer
2005-11-21 02:30:30 +01:00
String targetaddress = args [ 1 ] ;
String crfile = args [ 2 ] ;
transferCR ( targetaddress , crfile ) ;
2006-07-19 13:20:22 +02:00
} else if ( ( args . length > = 1 ) & & ( args [ 0 ] . toLowerCase ( ) . equals ( " -urldbcleanup " ) ) ) {
2005-12-07 12:10:08 +01:00
// generate a url list and save it in a file
2008-02-01 00:40:47 +01:00
if ( args . length = = 2 ) applicationRoot = new File ( args [ 1 ] ) ;
2005-12-07 12:10:08 +01:00
urldbcleanup ( applicationRoot ) ;
2006-07-19 13:20:22 +02:00
} else if ( ( args . length > = 1 ) & & ( args [ 0 ] . toLowerCase ( ) . equals ( " -rwihashlist " ) ) ) {
2006-01-04 14:55:45 +01:00
// generate a url list and save it in a file
2006-01-10 02:04:22 +01:00
String domain = " all " ;
2006-01-15 11:29:48 +01:00
String format = " txt " ;
2006-01-10 02:04:22 +01:00
if ( args . length > = 2 ) domain = args [ 1 ] ;
2006-01-15 11:29:48 +01:00
if ( args . length > = 3 ) format = args [ 2 ] ;
2008-02-01 00:40:47 +01:00
if ( args . length = = 4 ) applicationRoot = new File ( args [ 3 ] ) ;
2006-01-04 14:55:45 +01:00
String outfile = " rwihashlist_ " + System . currentTimeMillis ( ) ;
2006-01-15 11:29:48 +01:00
RWIHashList ( applicationRoot , outfile , domain , format ) ;
2005-04-07 21:19:42 +02:00
} else {
2008-02-01 00:40:47 +01:00
if ( args . length = = 1 ) applicationRoot = new File ( args [ 0 ] ) ;
2005-09-21 14:21:01 +02:00
startup ( applicationRoot , startupMemFree , startupMemTotal ) ;
2005-04-07 21:19:42 +02:00
}
}
}
2005-05-11 11:44:36 +02:00
2005-08-02 21:40:29 +02:00
/ * *
* This class is a helper class whose instance is started , when the java virtual
* machine shuts down . Signals the plasmaSwitchboard to shut down .
* /
2005-07-03 14:40:36 +02:00
class shutdownHookThread extends Thread {
2005-09-21 02:12:37 +02:00
private plasmaSwitchboard sb = null ;
2005-05-11 11:44:36 +02:00
private Thread mainThread = null ;
2005-08-02 21:40:29 +02:00
2005-09-21 02:12:37 +02:00
public shutdownHookThread ( Thread mainThread , plasmaSwitchboard sb ) {
2005-09-20 17:36:22 +02:00
super ( ) ;
2005-09-21 02:12:37 +02:00
this . sb = sb ;
2005-05-11 11:44:36 +02:00
this . mainThread = mainThread ;
}
2005-08-02 21:40:29 +02:00
2005-07-03 14:40:36 +02:00
public void run ( ) {
2005-05-11 11:44:36 +02:00
try {
2005-09-21 02:12:37 +02:00
if ( ! this . sb . isTerminated ( ) ) {
2005-08-30 23:10:39 +02:00
serverLog . logConfig ( " SHUTDOWN " , " Shutdown via shutdown hook. " ) ;
2005-08-02 21:40:29 +02:00
2005-05-11 11:44:36 +02:00
// sending the yacy main thread a shutdown signal
2006-02-01 12:03:37 +01:00
serverLog . logFine ( " SHUTDOWN " , " Signaling shutdown to the switchboard. " ) ;
2005-09-21 02:12:37 +02:00
this . sb . terminate ( ) ;
2005-08-02 21:40:29 +02:00
2005-05-11 11:44:36 +02:00
// waiting for the yacy thread to finish execution
2006-02-01 12:03:37 +01:00
serverLog . logFine ( " SHUTDOWN " , " Waiting for main thread to finish. " ) ;
2006-09-20 12:13:23 +02:00
if ( this . mainThread . isAlive ( ) & & ! this . sb . isTerminated ( ) ) {
this . mainThread . join ( ) ;
}
2005-05-11 11:44:36 +02:00
}
} catch ( Exception e ) {
2005-08-30 23:32:59 +02:00
serverLog . logSevere ( " SHUTDOWN " , " Unexpected error. " + e . getClass ( ) . getName ( ) , e ) ;
2005-05-11 11:44:36 +02:00
}
}
2005-09-22 20:54:36 +02:00
}