2005-09-20 17:36:22 +02:00
// yacy.java
// -----------------------
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.yacy.net
// Frankfurt, Germany, 2004, 2005
2005-04-07 21:19:42 +02:00
//
2005-09-20 17:36:22 +02:00
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
2005-04-07 21:19:42 +02:00
//
2005-09-20 17:36:22 +02:00
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
2005-04-07 21:19:42 +02:00
//
2005-09-20 17:36:22 +02:00
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
2005-04-07 21:19:42 +02:00
//
2005-09-20 17:36:22 +02:00
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
2005-04-07 21:19:42 +02:00
//
2005-09-20 17:36:22 +02:00
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this softare or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follows this copyright notice here, but changes must not be
// done inside the copyright notive above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
2006-06-01 01:31:46 +02:00
2005-12-07 02:40:52 +01:00
import java.io.BufferedOutputStream ;
2005-05-05 07:32:19 +02:00
import java.io.BufferedReader ;
import java.io.BufferedWriter ;
import java.io.ByteArrayOutputStream ;
import java.io.File ;
import java.io.FileInputStream ;
import java.io.FileNotFoundException ;
2005-12-07 02:40:52 +01:00
import java.io.FileOutputStream ;
2005-05-05 07:32:19 +02:00
import java.io.FileWriter ;
import java.io.IOException ;
import java.io.InputStreamReader ;
import java.io.PrintWriter ;
import java.util.HashMap ;
import java.util.HashSet ;
import java.util.Iterator ;
2006-01-30 09:28:22 +01:00
import java.util.Map ;
2005-05-05 07:32:19 +02:00
import java.util.Properties ;
import java.util.TreeSet ;
2005-06-07 10:31:49 +02:00
import java.util.regex.Matcher ;
import java.util.regex.Pattern ;
2005-12-17 16:43:13 +01:00
import java.util.zip.ZipEntry ;
import java.util.zip.ZipOutputStream ;
2005-09-27 18:28:55 +02:00
2005-07-06 16:48:41 +02:00
import de.anomic.data.translator ;
2005-05-05 07:32:19 +02:00
import de.anomic.http.httpHeader ;
import de.anomic.http.httpc ;
import de.anomic.http.httpd ;
import de.anomic.http.httpdFileHandler ;
import de.anomic.http.httpdProxyHandler ;
2006-05-28 03:09:31 +02:00
import de.anomic.index.indexContainer ;
2006-11-08 17:17:47 +01:00
import de.anomic.index.indexRWIEntry ;
2006-12-05 03:47:51 +01:00
import de.anomic.index.indexRWIEntryNew ;
2006-11-05 03:10:40 +01:00
import de.anomic.index.indexURLEntry ;
2006-11-08 17:17:47 +01:00
import de.anomic.index.indexURLEntryOld ;
2006-11-23 03:16:30 +01:00
import de.anomic.kelondro.kelondroBitfield ;
2006-01-30 09:28:22 +01:00
import de.anomic.kelondro.kelondroDyn ;
2006-12-05 03:47:51 +01:00
import de.anomic.kelondro.kelondroException ;
2005-05-05 07:32:19 +02:00
import de.anomic.kelondro.kelondroMScoreCluster ;
2007-01-30 00:51:10 +01:00
import de.anomic.kelondro.kelondroMapObjects ;
2006-10-19 23:14:37 +02:00
import de.anomic.kelondro.kelondroRow ;
import de.anomic.kelondro.kelondroTree ;
2006-09-30 00:27:20 +02:00
import de.anomic.net.URL ;
2006-11-23 03:16:30 +01:00
import de.anomic.plasma.plasmaCondenser ;
2006-07-24 10:08:33 +02:00
import de.anomic.plasma.plasmaCrawlEURL ;
2006-09-30 00:27:20 +02:00
import de.anomic.plasma.plasmaCrawlLURL ;
2006-07-24 18:40:59 +02:00
import de.anomic.plasma.plasmaCrawlNURL ;
2005-05-05 07:32:19 +02:00
import de.anomic.plasma.plasmaSwitchboard ;
2005-10-05 12:45:33 +02:00
import de.anomic.plasma.plasmaWordIndex ;
2006-12-05 03:47:51 +01:00
import de.anomic.plasma.plasmaWordIndexAssortment ;
2006-05-28 13:59:16 +02:00
import de.anomic.plasma.plasmaWordIndexFile ;
2006-12-05 03:47:51 +01:00
import de.anomic.plasma.plasmaWordIndexFileCluster ;
import de.anomic.plasma.dbImport.AssortmentImporter ;
2005-05-05 07:32:19 +02:00
import de.anomic.server.serverCore ;
2005-09-27 18:28:55 +02:00
import de.anomic.server.serverDate ;
2005-05-05 07:32:19 +02:00
import de.anomic.server.serverFileUtils ;
2006-08-18 03:33:54 +02:00
import de.anomic.server.serverMemory ;
2005-05-05 07:32:19 +02:00
import de.anomic.server.serverSystem ;
2005-06-09 11:46:43 +02:00
import de.anomic.server.logging.serverLog ;
2005-05-05 07:32:19 +02:00
import de.anomic.tools.enumerateFiles ;
2005-11-11 00:48:20 +01:00
import de.anomic.yacy.yacyClient ;
2005-11-13 16:12:48 +01:00
import de.anomic.yacy.yacyCore ;
2006-01-30 09:28:22 +01:00
import de.anomic.yacy.yacySeedDB ;
2005-04-07 21:19:42 +02:00
2005-08-02 21:40:29 +02:00
/ * *
2005-12-07 11:31:48 +01:00
* This is the main class of YaCy . Several threads are started from here :
2005-08-02 21:40:29 +02:00
* < ul >
* < li > one single instance of the plasmaSwitchboard is generated , which itself
* starts a thread with a plasmaHTMLCache object . This object simply counts
* files sizes in the cache and terminates them . It also generates a
* plasmaCrawlerLoader object , which may itself start some more httpc - calling
* threads to load web pages . They terminate automatically when a page has
* loaded .
* < li > one serverCore - thread is started , which implements a multi - threaded
* server . The process may start itself many more processes that handle
2006-03-05 11:07:52 +01:00
* connections .
2005-08-02 21:40:29 +02:00
* < li > finally , all idle - dependent processes are written in a queue in
* plasmaSwitchboard which are worked off inside an idle - sensitive loop of the
* main process . ( here )
* < / ul >
*
* On termination , the following must be done :
* < ul >
* < li > stop feeding of the crawling process because it otherwise fills the
* indexing queue .
* < li > say goodbye to connected peers and disable new connections . Don ' t wait for
* success .
* < li > first terminate the serverCore thread . This prevents that new cache
* objects are queued .
* < li > wait that the plasmaHTMLCache terminates ( it should be normal that this
* process already has terminated ) .
* < li > then wait for termination of all loader process of the
* plasmaCrawlerLoader .
* < li > work off the indexing and cache storage queue . These values are inside a
* RAM cache and would be lost otherwise .
* < li > write all settings .
* < li > terminate .
* < / ul >
* /
2005-09-20 17:36:22 +02:00
public final class yacy {
2005-04-07 21:19:42 +02:00
// static objects
2005-06-07 10:31:49 +02:00
private static String vString = " @REPL_VERSION@ " ;
2006-08-28 19:54:07 +02:00
private static double version = 0 . 1 ;
2005-08-02 21:40:29 +02:00
2005-05-22 16:50:15 +02:00
private static final String vDATE = " @REPL_DATE@ " ;
2006-03-24 14:45:01 +01:00
private static final String copyright = " [ YaCy v " + vString + " , build " + vDATE + " by Michael Christen / www.yacy.net ] " ;
2005-04-07 21:19:42 +02:00
private static final String hline = " ------------------------------------------------------------------------------- " ;
2005-10-05 18:35:05 +02:00
2005-08-02 21:40:29 +02:00
/ * *
2006-08-30 18:04:40 +02:00
* Converts combined version - string to a pretty string , e . g . " 0.435/01818 " or " dev/01818 " ( development version ) or " dev/00000 " ( in case of wrong input )
2005-08-02 21:40:29 +02:00
*
2006-09-02 09:29:12 +02:00
* @param ver Combined version string matching regular expression : " \ A( \ d+ \ . \ d{3})( \ d{4}| \ d{5}) \ z " < br >
* ( i . e . : start of input , 1 or more digits in front of decimal point , decimal point followed by 3 digits as major version , 4 or 5 digits for SVN - Version , end of input )
2006-08-30 18:04:40 +02:00
* @return If the major version is & lt ; 0 . 11 - major version is separated from SVN - version by '/' , e . g . " 0.435/01818 " < br >
* If the major version is & gt ; = 0 . 11 - major version is replaced by " dev " and separated SVN - version by '/' , e . g . " dev/01818 " < br >
* " dev/00000 " - If the input does not matcht the regular expression above
2005-08-02 21:40:29 +02:00
* /
2006-10-28 13:33:05 +02:00
public static String combined2prettyVersion ( String ver ) {
return combined2prettyVersion ( ver , " " ) ;
}
public static String combined2prettyVersion ( String ver , String computerName ) {
2007-01-11 16:27:20 +01:00
final Matcher matcher = Pattern . compile ( " \\ A( \\ d+ \\ . \\ d{1,3})( \\ d{0,5}) \\ z " ) . matcher ( ver ) ;
2006-09-11 16:41:06 +02:00
2006-08-30 18:04:40 +02:00
if ( ! matcher . find ( ) ) {
2006-10-28 13:33:05 +02:00
serverLog . logWarning ( " STARTUP " , " Peer ' " + computerName + " ': wrong format of version-string: ' " + ver + " '. Using default string 'dev/00000' instead " ) ;
2006-09-11 16:41:06 +02:00
return " dev/00000 " ;
2007-01-11 16:27:20 +01:00
}
String mainversion = ( Double . parseDouble ( matcher . group ( 1 ) ) < 0 . 11 ? " dev " : matcher . group ( 1 ) ) ;
String revision = matcher . group ( 2 ) ;
for ( int i = revision . length ( ) ; i < 5 ; + + i ) revision + = " 0 " ;
return mainversion + " / " + revision ;
2005-07-26 13:47:50 +02:00
}
2006-08-30 18:04:40 +02:00
2005-08-02 21:40:29 +02:00
/ * *
2006-03-24 14:45:01 +01:00
* Combines the version of YaCy with the versionnumber from SVN to a
2005-10-14 22:36:45 +02:00
* combined version
2005-08-02 21:40:29 +02:00
*
2006-03-24 14:45:01 +01:00
* @param version Current given version .
2006-05-26 14:18:12 +02:00
* @param svn Current version given from SVN .
* @return String with the combined version .
2005-08-02 21:40:29 +02:00
* /
2006-08-28 19:54:07 +02:00
public static double versvn2combinedVersion ( double v , int svn ) {
2006-08-30 18:04:40 +02:00
return ( Math . rint ( ( v * 100000000 . 0 ) + ( ( double ) svn ) ) / 100000000 ) ;
2005-07-26 13:47:50 +02:00
}
2005-08-02 21:40:29 +02:00
/ * *
* Starts up the whole application . Sets up all datastructures and starts
* the main threads .
*
* @param homePath Root - path where all information is to be found .
2005-09-20 12:10:34 +02:00
* @param startupFree free memory at startup time , to be used later for statistics
2005-08-02 21:40:29 +02:00
* /
2005-09-21 14:21:01 +02:00
private static void startup ( String homePath , long startupMemFree , long startupMemTotal ) {
2005-09-27 18:28:55 +02:00
long startup = System . currentTimeMillis ( ) ;
2007-01-04 23:03:32 +01:00
2006-02-12 17:46:43 +01:00
int oldRev = 0 ;
int newRev = 0 ;
2007-01-04 23:03:32 +01:00
2005-05-11 11:44:36 +02:00
try {
// start up
System . out . println ( copyright ) ;
System . out . println ( hline ) ;
2005-08-02 21:40:29 +02:00
2005-04-07 21:19:42 +02:00
// check java version
try {
2005-12-05 10:13:13 +01:00
/*String[] check =*/ " a,b " . split ( " , " ) ; // split needs java 1.4
2005-04-07 21:19:42 +02:00
} catch ( NoSuchMethodError e ) {
2006-03-13 21:12:31 +01:00
System . err . println ( " STARTUP: Java Version too low. You need at least Java 1.4.2 to run YaCy " ) ;
2005-12-05 01:17:12 +01:00
Thread . sleep ( 3000 ) ;
2005-04-07 21:19:42 +02:00
System . exit ( - 1 ) ;
}
2005-09-27 18:28:55 +02:00
2006-10-11 20:27:38 +02:00
// ensure that there is a DATA directory, if not, create one and if that fails warn and die
2005-09-27 18:28:55 +02:00
File f = new File ( homePath ) ; if ( ! ( f . exists ( ) ) ) f . mkdirs ( ) ;
f = new File ( homePath , " DATA/ " ) ; if ( ! ( f . exists ( ) ) ) f . mkdirs ( ) ;
2006-10-11 20:27:38 +02:00
if ( ! ( f . exists ( ) ) ) {
2006-12-17 19:06:39 +01:00
System . err . println ( " Error creating DATA-directory in " + homePath . toString ( ) + " . Please check your write-permission for this folder. YaCy will now terminate. " ) ;
System . exit ( - 1 ) ;
2006-10-11 20:27:38 +02:00
}
2006-12-17 19:06:39 +01:00
f = new File ( homePath , " DATA/yacy.running " ) ;
if ( ! f . exists ( ) ) f . createNewFile ( ) ; f . deleteOnExit ( ) ;
2005-09-27 18:28:55 +02:00
2005-06-09 11:46:43 +02:00
// setting up logging
2005-09-27 18:28:55 +02:00
f = new File ( homePath , " DATA/LOG/ " ) ; if ( ! ( f . exists ( ) ) ) f . mkdirs ( ) ;
if ( ! ( ( new File ( homePath , " DATA/LOG/yacy.logging " ) ) . exists ( ) ) ) try {
serverFileUtils . copy ( new File ( homePath , " yacy.logging " ) , new File ( homePath , " DATA/LOG/yacy.logging " ) ) ;
} catch ( IOException e ) {
System . out . println ( " could not copy yacy.logging " ) ;
}
try {
2005-09-22 20:54:36 +02:00
serverLog . configureLogging ( new File ( homePath , " DATA/LOG/yacy.logging " ) ) ;
2005-06-13 14:01:58 +02:00
} catch ( IOException e ) {
System . out . println ( " could not find logging properties in homePath= " + homePath ) ;
e . printStackTrace ( ) ;
}
2005-08-30 23:10:39 +02:00
serverLog . logConfig ( " STARTUP " , " java version " + System . getProperty ( " java.version " , " no-java-version " ) ) ;
2005-09-01 17:27:41 +02:00
serverLog . logConfig ( " STARTUP " , " Application Root Path: " + homePath ) ;
2005-09-27 18:28:55 +02:00
serverLog . logConfig ( " STARTUP " , " Time Zone: UTC " + serverDate . UTCDiffString ( ) + " ; UTC+0000 is " + System . currentTimeMillis ( ) ) ;
2006-03-23 21:12:23 +01:00
serverLog . logConfig ( " STARTUP " , " Maximum file system path length: " + serverSystem . maxPathLength ) ;
2005-08-02 21:40:29 +02:00
2006-06-14 17:18:41 +02:00
/ *
// Testing if the yacy archive file were unzipped correctly.
// This test is needed because of classfile-names longer than 100 chars
// which could cause problems with incompatible unzip software.
// See:
// - http://www.yacy-forum.de/viewtopic.php?t=1763
// - http://www.yacy-forum.de/viewtopic.php?t=715
// - http://www.yacy-forum.de/viewtopic.php?t=1674
File unzipTest = new File ( homePath , " doc/This_is_a_test_if_the_archive_file_containing_YaCy_was_unpacked_correctly_If_not_please_use_gnu_tar_instead.txt " ) ;
if ( ! unzipTest . exists ( ) ) {
String errorMsg = " The archive file containing YaCy was not unpacked correctly. " +
" Please use 'GNU-Tar' or upgrade to a newer version of your unzip software. \ n " +
" For detailed information on this bug see: " +
" http://www.yacy-forum.de/viewtopic.php?t=715 " ;
System . err . println ( errorMsg ) ;
serverLog . logSevere ( " STARTUP " , errorMsg ) ;
System . exit ( 1 ) ;
}
* /
2006-01-24 09:33:52 +01:00
2005-09-21 02:12:37 +02:00
final plasmaSwitchboard sb = new plasmaSwitchboard ( homePath , " yacy.init " , " DATA/SETTINGS/httpProxy.conf " ) ;
2005-10-05 18:35:05 +02:00
2005-09-20 12:10:34 +02:00
// save information about available memory at startup time
2005-09-21 02:12:37 +02:00
sb . setConfig ( " memoryFreeAfterStartup " , startupMemFree ) ;
sb . setConfig ( " memoryTotalAfterStartup " , startupMemTotal ) ;
2005-09-20 12:10:34 +02:00
2005-06-30 00:55:37 +02:00
// hardcoded, forced, temporary value-migration
2005-09-21 02:12:37 +02:00
sb . setConfig ( " htTemplatePath " , " htroot/env/templates " ) ;
sb . setConfig ( " parseableExt " , " html,htm,txt,php,shtml,asp " ) ;
2005-08-02 21:40:29 +02:00
2005-06-07 10:31:49 +02:00
// if we are running an SVN version, we try to detect the used svn revision now ...
2005-09-20 17:36:22 +02:00
final Properties buildProp = new Properties ( ) ;
2005-07-03 14:40:36 +02:00
File buildPropFile = null ;
try {
buildPropFile = new File ( homePath , " build.properties " ) ;
buildProp . load ( new FileInputStream ( buildPropFile ) ) ;
} catch ( Exception e ) {
2005-09-20 17:36:22 +02:00
serverLog . logWarning ( " STARTUP " , buildPropFile . toString ( ) + " not found in settings path " ) ;
2005-07-03 14:40:36 +02:00
}
2006-02-12 17:46:43 +01:00
oldRev = Integer . parseInt ( sb . getConfig ( " svnRevision " , " 0 " ) ) ;
2005-06-30 21:39:19 +02:00
try {
if ( buildProp . containsKey ( " releaseNr " ) ) {
2005-09-20 17:36:22 +02:00
// this normally looks like this: $Revision$
final String svnReleaseNrStr = buildProp . getProperty ( " releaseNr " ) ;
final Pattern pattern = Pattern . compile ( " \\ $Revision: \\ s(.*) \\ s \\ $ " , Pattern . DOTALL + Pattern . CASE_INSENSITIVE ) ;
final Matcher matcher = pattern . matcher ( svnReleaseNrStr ) ;
2005-06-30 21:39:19 +02:00
if ( matcher . find ( ) ) {
2005-09-20 17:36:22 +02:00
final String svrReleaseNr = matcher . group ( 1 ) ;
2005-07-03 14:40:36 +02:00
try {
2006-08-28 19:54:07 +02:00
try { version = Double . parseDouble ( vString ) ; } catch ( NumberFormatException e ) { version = ( float ) 0 . 1 ; }
2005-07-26 17:17:29 +02:00
version = versvn2combinedVersion ( version , Integer . parseInt ( svrReleaseNr ) ) ;
2005-07-03 14:40:36 +02:00
} catch ( NumberFormatException e ) { }
2005-09-21 02:12:37 +02:00
sb . setConfig ( " svnRevision " , svrReleaseNr ) ;
2005-06-07 10:31:49 +02:00
}
}
2006-02-12 17:46:43 +01:00
newRev = Integer . parseInt ( sb . getConfig ( " svnRevision " , " 0 " ) ) ;
2005-06-30 21:39:19 +02:00
} catch ( Exception e ) {
System . err . println ( " Unable to determine the currently used SVN revision number. " ) ;
2005-06-07 10:31:49 +02:00
}
2005-08-02 21:40:29 +02:00
2006-08-28 19:54:07 +02:00
sb . setConfig ( " version " , Double . toString ( version ) ) ;
2006-10-28 13:33:05 +02:00
sb . setConfig ( " vString " , combined2prettyVersion ( Double . toString ( version ) ) ) ;
2005-09-21 02:12:37 +02:00
sb . setConfig ( " vdate " , vDATE ) ;
sb . setConfig ( " applicationRoot " , homePath ) ;
2007-01-31 18:05:15 +01:00
sb . startupTime = startup ;
2005-08-30 23:10:39 +02:00
serverLog . logConfig ( " STARTUP " , " YACY Version: " + version + " , Built " + vDATE ) ;
2005-12-07 00:51:29 +01:00
yacyCore . latestVersion = version ;
2005-08-02 21:40:29 +02:00
2005-05-11 11:44:36 +02:00
// read environment
2005-09-21 02:12:37 +02:00
int timeout = Integer . parseInt ( sb . getConfig ( " httpdTimeout " , " 60000 " ) ) ;
2005-05-11 11:44:36 +02:00
if ( timeout < 60000 ) timeout = 60000 ;
2005-08-02 21:40:29 +02:00
2005-04-07 21:19:42 +02:00
// create some directories
2005-09-22 22:25:56 +02:00
final File htRootPath = new File ( homePath , sb . getConfig ( " htRootPath " , " htroot " ) ) ;
final File htDocsPath = new File ( homePath , sb . getConfig ( " htDocsPath " , " DATA/HTDOCS " ) ) ;
2005-12-05 10:13:13 +01:00
//final File htTemplatePath = new File(homePath, sb.getConfig("htTemplatePath","htdocs"));
2005-08-02 21:40:29 +02:00
2005-08-02 02:16:19 +02:00
// create default notifier picture
2006-02-03 22:21:42 +01:00
//TODO: Use templates instead of copying images ...
2006-02-04 11:50:22 +01:00
if ( ! ( ( new File ( htDocsPath , " notifier.gif " ) ) . exists ( ) ) ) try {
2005-09-01 17:27:41 +02:00
serverFileUtils . copy ( new File ( htRootPath , " env/grafics/empty.gif " ) ,
2006-02-03 22:21:42 +01:00
new File ( htDocsPath , " notifier.gif " ) ) ;
2005-08-02 02:16:19 +02:00
} catch ( IOException e ) { }
2005-08-02 21:40:29 +02:00
2005-04-07 21:19:42 +02:00
if ( ! ( htDocsPath . exists ( ) ) ) htDocsPath . mkdir ( ) ;
2005-09-20 17:36:22 +02:00
final File htdocsDefaultReadme = new File ( htDocsPath , " readme.txt " ) ;
2005-04-07 21:19:42 +02:00
if ( ! ( htdocsDefaultReadme . exists ( ) ) ) try { serverFileUtils . write ( (
2005-05-11 11:44:36 +02:00
" This is your root directory for individual Web Content \ r \ n " +
" \ r \ n " +
" Please place your html files into the www subdirectory. \ r \ n " +
" The URL of that path is either \ r \ n " +
" http://www.<your-peer-name>.yacy or \ r \ n " +
" http://<your-ip>:<your-port>/www \ r \ n " +
" \ r \ n " +
" Other subdirectories may be created; they map to corresponding sub-domains. \ r \ n " +
" This directory shares it's content with the applications htroot path, so you \ r \ n " +
" may access your yacy search page with \ r \ n " +
" http://<your-peer-name>.yacy/ \ r \ n " +
2005-07-03 14:40:36 +02:00
" \ r \ n " ) . getBytes ( ) , htdocsDefaultReadme ) ; } catch ( IOException e ) {
System . out . println ( " Error creating htdocs readme: " + e . getMessage ( ) ) ;
}
2005-08-02 21:40:29 +02:00
2005-09-20 17:36:22 +02:00
final File wwwDefaultPath = new File ( htDocsPath , " www " ) ;
2005-04-07 21:19:42 +02:00
if ( ! ( wwwDefaultPath . exists ( ) ) ) wwwDefaultPath . mkdir ( ) ;
2005-08-02 21:40:29 +02:00
2005-09-20 17:36:22 +02:00
final File shareDefaultPath = new File ( htDocsPath , " share " ) ;
2005-04-07 21:19:42 +02:00
if ( ! ( shareDefaultPath . exists ( ) ) ) shareDefaultPath . mkdir ( ) ;
2005-08-02 21:40:29 +02:00
2006-02-12 17:46:43 +01:00
migration . migrate ( sb , oldRev , newRev ) ;
2005-12-06 23:30:15 +01:00
2005-05-11 11:44:36 +02:00
// start main threads
2006-03-04 12:07:01 +01:00
final String port = sb . getConfig ( " port " , " 8080 " ) ;
2005-05-11 11:44:36 +02:00
try {
2005-09-21 02:12:37 +02:00
final httpd protocolHandler = new httpd ( sb , new httpdFileHandler ( sb ) , new httpdProxyHandler ( sb ) ) ;
2005-11-15 16:03:15 +01:00
final serverCore server = new serverCore (
2005-05-11 11:44:36 +02:00
timeout /*control socket timeout in milliseconds*/ ,
true /* block attacks (wrong protocol) */ ,
protocolHandler /*command class*/ ,
2005-09-21 02:12:37 +02:00
sb ,
2005-06-10 11:19:24 +02:00
30000 /*command max length incl. GET args*/ ) ;
2005-05-11 11:44:36 +02:00
server . setName ( " httpd: " + port ) ;
2005-07-03 14:40:36 +02:00
server . setPriority ( Thread . MAX_PRIORITY ) ;
2005-10-31 11:46:13 +01:00
server . setObeyIntermission ( false ) ;
2005-05-11 11:44:36 +02:00
if ( server = = null ) {
2005-08-30 23:32:59 +02:00
serverLog . logSevere ( " STARTUP " , " Failed to start server. Probably port " + port + " already in use. " ) ;
2005-05-11 11:44:36 +02:00
} else {
// first start the server
2005-09-21 02:12:37 +02:00
sb . deployThread ( " 10_httpd " , " HTTPD Server/Proxy " , " the HTTPD, used as web server and proxy " , null , server , 0 , 0 , 0 , 0 ) ;
2005-05-11 11:44:36 +02:00
//server.start();
2005-08-02 21:40:29 +02:00
2005-05-11 11:44:36 +02:00
// open the browser window
2005-09-21 02:12:37 +02:00
final boolean browserPopUpTrigger = sb . getConfig ( " browserPopUpTrigger " , " true " ) . equals ( " true " ) ;
2005-05-11 11:44:36 +02:00
if ( browserPopUpTrigger ) {
2006-03-01 23:27:20 +01:00
String browserPopUpPage = sb . getConfig ( " browserPopUpPage " , " ConfigBasic.html " ) ;
2007-02-05 20:46:50 +01:00
boolean properPW = ( sb . getConfig ( " adminAccount " , " " ) . length ( ) = = 0 ) & & ( sb . getConfig ( httpd . ADMIN_ACCOUNT_B64MD5 , " " ) . length ( ) > 0 ) ;
2006-03-01 23:27:20 +01:00
if ( ! properPW ) browserPopUpPage = " ConfigBasic.html " ;
2005-09-21 02:12:37 +02:00
final String browserPopUpApplication = sb . getConfig ( " browserPopUpApplication " , " netscape " ) ;
2006-05-20 16:05:49 +02:00
serverSystem . openBrowser ( ( server . withSSL ( ) ? " https " : " http " ) + " ://localhost: " + serverCore . getPortNr ( port ) + " / " + browserPopUpPage , browserPopUpApplication ) ;
2005-05-11 11:44:36 +02:00
}
2005-08-02 21:40:29 +02:00
2005-05-27 10:36:07 +02:00
//Copy the shipped locales into DATA
2005-09-22 22:25:56 +02:00
final File localesPath = new File ( homePath , sb . getConfig ( " localesPath " , " DATA/LOCALE " ) ) ;
final File defaultLocalesPath = new File ( homePath , " locales " ) ;
2006-03-17 22:09:07 +01:00
2005-08-02 21:40:29 +02:00
2005-05-27 10:36:07 +02:00
try {
2005-09-20 17:36:22 +02:00
final File [ ] defaultLocales = defaultLocalesPath . listFiles ( ) ;
2005-05-27 10:36:07 +02:00
localesPath . mkdirs ( ) ;
for ( int i = 0 ; i < defaultLocales . length ; i + + ) {
if ( defaultLocales [ i ] . getName ( ) . endsWith ( " .lng " ) )
2005-07-03 14:40:36 +02:00
serverFileUtils . copy ( defaultLocales [ i ] , new File ( localesPath , defaultLocales [ i ] . getName ( ) ) ) ;
2005-05-27 10:36:07 +02:00
}
2005-12-19 20:14:18 +01:00
serverLog . logInfo ( " STARTUP " , " Copied the default locales to DATA/LOCALE " ) ;
2005-05-27 10:36:07 +02:00
} catch ( NullPointerException e ) {
2005-08-30 23:32:59 +02:00
serverLog . logSevere ( " STARTUP " , " Nullpointer Exception while copying the default Locales " ) ;
2005-05-27 10:36:07 +02:00
}
2005-08-02 21:40:29 +02:00
2005-07-03 14:40:36 +02:00
//regenerate Locales from Translationlist, if needed
2005-09-21 02:12:37 +02:00
final String lang = sb . getConfig ( " htLocaleSelection " , " " ) ;
2005-07-03 14:40:36 +02:00
if ( ! lang . equals ( " " ) & & ! lang . equals ( " default " ) ) { //locale is used
String currentRev = " " ;
try {
2006-02-07 19:06:00 +01:00
final BufferedReader br = new BufferedReader ( new InputStreamReader ( new FileInputStream ( new File ( sb . getConfig ( " htLocalePath " , " DATA/HTDOCS/locale " ) , lang + " /version " ) ) ) ) ;
2005-07-03 14:40:36 +02:00
currentRev = br . readLine ( ) ;
br . close ( ) ;
} catch ( IOException e ) {
//Error
}
2005-08-02 21:40:29 +02:00
2006-03-13 21:12:31 +01:00
try { //seperate try, because we want this, even if the file "version" does not exist.
2005-09-21 02:12:37 +02:00
if ( ! currentRev . equals ( sb . getConfig ( " svnRevision " , " " ) ) ) { //is this another version?!
final File sourceDir = new File ( sb . getConfig ( " htRootPath " , " htroot " ) ) ;
2006-02-04 11:50:22 +01:00
final File destDir = new File ( sb . getConfig ( " htLocalePath " , " DATA/HTDOCS/locale " ) , lang ) ;
2006-01-14 12:56:20 +01:00
2006-01-14 23:11:45 +01:00
if ( translator . translateFilesRecursive ( sourceDir , destDir , new File ( " DATA/LOCALE/ " + lang + " .lng " ) , " html,template,inc " , " locale " ) ) { //translate it
2005-07-03 14:40:36 +02:00
//write the new Versionnumber
2005-09-20 17:36:22 +02:00
final BufferedWriter bw = new BufferedWriter ( new PrintWriter ( new FileWriter ( new File ( destDir , " version " ) ) ) ) ;
2005-09-21 02:12:37 +02:00
bw . write ( sb . getConfig ( " svnRevision " , " Error getting Version " ) ) ;
2005-07-03 14:40:36 +02:00
bw . close ( ) ;
}
}
} catch ( IOException e ) {
//Error
}
}
2005-08-02 21:40:29 +02:00
2005-05-11 11:44:36 +02:00
// registering shutdown hook
2005-08-30 23:10:39 +02:00
serverLog . logConfig ( " STARTUP " , " Registering Shutdown Hook " ) ;
2005-09-20 17:36:22 +02:00
final Runtime run = Runtime . getRuntime ( ) ;
2005-09-21 02:12:37 +02:00
run . addShutdownHook ( new shutdownHookThread ( Thread . currentThread ( ) , sb ) ) ;
2005-08-02 21:40:29 +02:00
2005-09-20 12:10:34 +02:00
// save information about available memory after all initializations
2006-01-31 00:07:20 +01:00
//try {
2006-01-30 13:42:06 +01:00
sb . setConfig ( " memoryFreeAfterInitBGC " , Runtime . getRuntime ( ) . freeMemory ( ) ) ;
sb . setConfig ( " memoryTotalAfterInitBGC " , Runtime . getRuntime ( ) . totalMemory ( ) ) ;
System . gc ( ) ;
sb . setConfig ( " memoryFreeAfterInitAGC " , Runtime . getRuntime ( ) . freeMemory ( ) ) ;
sb . setConfig ( " memoryTotalAfterInitAGC " , Runtime . getRuntime ( ) . totalMemory ( ) ) ;
2006-01-31 00:07:20 +01:00
//} catch (ConcurrentModificationException e) {}
2005-09-21 14:21:01 +02:00
2005-05-11 11:44:36 +02:00
// wait for server shutdown
try {
2005-09-21 02:12:37 +02:00
sb . waitForShutdown ( ) ;
2005-05-11 11:44:36 +02:00
} catch ( Exception e ) {
2006-03-13 21:12:31 +01:00
serverLog . logSevere ( " MAIN CONTROL LOOP " , " PANIC: " + e . getMessage ( ) , e ) ;
2005-04-07 21:19:42 +02:00
}
// shut down
2005-08-30 23:10:39 +02:00
serverLog . logConfig ( " SHUTDOWN " , " caught termination signal " ) ;
2005-04-07 21:19:42 +02:00
server . terminate ( false ) ;
2005-05-11 11:44:36 +02:00
server . interrupt ( ) ;
if ( server . isAlive ( ) ) try {
2006-05-20 16:05:49 +02:00
URL u = new URL ( ( server . withSSL ( ) ? " https " : " http " ) + " ://localhost: " + serverCore . getPortNr ( port ) ) ;
2006-05-09 15:11:00 +02:00
httpc . wget ( u , u . getHost ( ) , 1000 , null , null , null ) ; // kick server
2005-08-30 23:10:39 +02:00
serverLog . logConfig ( " SHUTDOWN " , " sent termination signal to server socket " ) ;
2005-04-07 21:19:42 +02:00
} catch ( IOException ee ) {
2005-08-30 23:10:39 +02:00
serverLog . logConfig ( " SHUTDOWN " , " termination signal to server socket missed (server shutdown, ok) " ) ;
2005-04-07 21:19:42 +02:00
}
2005-08-02 21:40:29 +02:00
2005-04-07 21:19:42 +02:00
// idle until the processes are down
while ( server . isAlive ( ) ) {
2005-12-05 10:13:13 +01:00
Thread . sleep ( 2000 ) ; // wait a while
2005-04-07 21:19:42 +02:00
}
2005-08-30 23:10:39 +02:00
serverLog . logConfig ( " SHUTDOWN " , " server has terminated " ) ;
2005-09-21 02:12:37 +02:00
sb . close ( ) ;
2005-04-07 21:19:42 +02:00
}
2005-05-11 11:44:36 +02:00
} catch ( Exception e ) {
2005-08-30 23:32:59 +02:00
serverLog . logSevere ( " STARTUP " , " Unexpected Error: " + e . getClass ( ) . getName ( ) , e ) ;
2005-05-11 11:44:36 +02:00
//System.exit(1);
}
} catch ( Exception ee ) {
2005-08-30 23:32:59 +02:00
serverLog . logSevere ( " STARTUP " , " FATAL ERROR: " + ee . getMessage ( ) , ee ) ;
2005-05-11 11:44:36 +02:00
}
2007-01-04 23:03:32 +01:00
serverLog . logConfig ( " SHUTDOWN " , " goodbye. (this is the last line) " ) ;
2005-05-11 11:44:36 +02:00
try {
System . exit ( 0 ) ;
} catch ( Exception e ) { } // was once stopped by de.anomic.net.ftpc$sm.checkExit(ftpc.java:1790)
2005-04-07 21:19:42 +02:00
}
2005-08-02 21:40:29 +02:00
/ * *
* Loads the configuration from the data - folder .
* FIXME : Why is this called over and over again from every method , instead
* of setting the configurationdata once for this class in main ?
*
* @param mes Where are we called from , so that the errormessages can be
* more descriptive .
* @param homePath Root - path where all the information is to be found .
* @return Properties read from the configurationfile .
* /
2005-04-07 21:19:42 +02:00
private static Properties configuration ( String mes , String homePath ) {
2005-08-30 23:10:39 +02:00
serverLog . logConfig ( mes , " Application Root Path: " + homePath . toString ( ) ) ;
2005-08-02 21:40:29 +02:00
2005-04-07 21:19:42 +02:00
// read data folder
File dataFolder = new File ( homePath , " DATA " ) ;
if ( ! ( dataFolder . exists ( ) ) ) {
2005-08-30 23:32:59 +02:00
serverLog . logSevere ( mes , " Application was never started or root path wrong. " ) ;
2005-04-07 21:19:42 +02:00
System . exit ( - 1 ) ;
}
2005-08-02 21:40:29 +02:00
2005-04-07 21:19:42 +02:00
Properties config = new Properties ( ) ;
try {
config . load ( new FileInputStream ( new File ( homePath , " DATA/SETTINGS/httpProxy.conf " ) ) ) ;
} catch ( FileNotFoundException e ) {
2005-08-30 23:32:59 +02:00
serverLog . logSevere ( mes , " could not find configuration file. " ) ;
2005-04-07 21:19:42 +02:00
System . exit ( - 1 ) ;
} catch ( IOException e ) {
2005-08-30 23:32:59 +02:00
serverLog . logSevere ( mes , " could not read configuration file. " ) ;
2005-04-07 21:19:42 +02:00
System . exit ( - 1 ) ;
}
2005-08-02 21:40:29 +02:00
2005-04-07 21:19:42 +02:00
return config ;
}
2005-08-02 21:40:29 +02:00
2006-02-01 12:03:37 +01:00
/**
 * Convenience variant of {@link #shutdown(String)} that uses the current
 * working directory (system property user.dir) as application root.
 */
static void shutdown() {
    // backslashes are turned into forward slashes before the path is handed on
    shutdown(System.getProperty(" user.dir ").replace('\\', '/'));
}
2005-08-02 21:40:29 +02:00
/ * *
2006-05-26 14:18:12 +02:00
* Call the shutdown - page of YaCy to tell it to shut down . This method is
2005-08-02 21:40:29 +02:00
* called if you start yacy with the argument - shutdown .
*
* @param homePath Root - path where all the information is to be found .
* /
2005-05-11 11:44:36 +02:00
static void shutdown ( String homePath ) {
2005-04-07 21:19:42 +02:00
// start up
System . out . println ( copyright ) ;
System . out . println ( hline ) ;
2005-08-02 21:40:29 +02:00
2005-04-07 21:19:42 +02:00
Properties config = configuration ( " REMOTE-SHUTDOWN " , homePath ) ;
2005-08-02 21:40:29 +02:00
2005-04-07 21:19:42 +02:00
// read port
2006-03-04 12:07:01 +01:00
int port = serverCore . getPortNr ( config . getProperty ( " port " , " 8080 " ) ) ;
2005-08-02 21:40:29 +02:00
2005-04-07 21:19:42 +02:00
// read password
2007-02-05 20:46:50 +01:00
String encodedPassword = ( String ) config . get ( httpd . ADMIN_ACCOUNT_B64MD5 ) ;
2005-04-07 21:19:42 +02:00
if ( encodedPassword = = null ) encodedPassword = " " ; // not defined
2005-08-02 21:40:29 +02:00
2005-04-07 21:19:42 +02:00
// send 'wget' to web interface
2005-05-11 11:44:36 +02:00
httpHeader requestHeader = new httpHeader ( ) ;
2005-05-12 12:05:17 +02:00
requestHeader . put ( " Authorization " , " realm= " + encodedPassword ) ; // for http-authentify
2005-04-07 21:19:42 +02:00
try {
2006-05-10 18:01:14 +02:00
httpc con = httpc . getInstance ( " localhost " , " localhost " , port , 10000 , false ) ;
2005-04-07 21:19:42 +02:00
httpc . response res = con . GET ( " Steering.html?shutdown= " , requestHeader ) ;
2005-08-02 21:40:29 +02:00
2005-04-07 21:19:42 +02:00
// read response
if ( res . status . startsWith ( " 2 " ) ) {
2005-08-30 23:10:39 +02:00
serverLog . logConfig ( " REMOTE-SHUTDOWN " , " YACY accepted shutdown command. " ) ;
serverLog . logConfig ( " REMOTE-SHUTDOWN " , " Stand by for termination, which may last some seconds. " ) ;
2005-04-07 21:19:42 +02:00
ByteArrayOutputStream bos = new ByteArrayOutputStream ( ) ;
res . writeContent ( bos , null ) ;
con . close ( ) ;
} else {
2005-08-30 23:32:59 +02:00
serverLog . logSevere ( " REMOTE-SHUTDOWN " , " error response from YACY socket: " + res . status ) ;
2005-04-07 21:19:42 +02:00
System . exit ( - 1 ) ;
}
} catch ( IOException e ) {
2005-08-30 23:32:59 +02:00
serverLog . logSevere ( " REMOTE-SHUTDOWN " , " could not establish connection to YACY socket: " + e . getMessage ( ) ) ;
2005-04-07 21:19:42 +02:00
System . exit ( - 1 ) ;
}
2005-08-02 21:40:29 +02:00
2005-04-07 21:19:42 +02:00
// finished
2005-08-30 23:10:39 +02:00
serverLog . logConfig ( " REMOTE-SHUTDOWN " , " SUCCESSFULLY FINISHED remote-shutdown: " ) ;
serverLog . logConfig ( " REMOTE-SHUTDOWN " , " YACY will terminate after working off all enqueued tasks. " ) ;
2005-04-07 21:19:42 +02:00
}
2005-08-02 21:40:29 +02:00
/ * *
* This method gets all found words and outputs a statistic about the score
* of the words . The output of this method can be used to create stop - word
* lists . This method will be called if you start yacy with the argument
* - genwordstat .
* FIXME : How can stop - word list be created from this output ? What type of
* score is output ?
*
* @param homePath Root - Path where all the information is to be found .
* /
2005-04-07 21:19:42 +02:00
private static void genWordstat ( String homePath ) {
// start up
System . out . println ( copyright ) ;
System . out . println ( hline ) ;
2005-08-02 21:40:29 +02:00
2005-04-07 21:19:42 +02:00
Properties config = configuration ( " GEN-WORDSTAT " , homePath ) ;
2005-08-02 21:40:29 +02:00
2005-05-11 11:44:36 +02:00
// load words
2005-04-07 21:19:42 +02:00
serverLog . logInfo ( " GEN-WORDSTAT " , " loading words... " ) ;
HashMap words = loadWordMap ( new File ( homePath , " yacy.words " ) ) ;
2005-08-02 21:40:29 +02:00
2005-04-07 21:19:42 +02:00
// find all hashes
serverLog . logInfo ( " GEN-WORDSTAT " , " searching all word-hash databases... " ) ;
File dbRoot = new File ( homePath , config . getProperty ( " dbPath " ) ) ;
2005-06-07 03:05:55 +02:00
enumerateFiles ef = new enumerateFiles ( new File ( dbRoot , " WORDS " ) , true , false , true , true ) ;
2005-04-07 21:19:42 +02:00
File f ;
String h ;
kelondroMScoreCluster hs = new kelondroMScoreCluster ( ) ;
while ( ef . hasMoreElements ( ) ) {
f = ( File ) ef . nextElement ( ) ;
2006-11-08 17:17:47 +01:00
h = f . getName ( ) . substring ( 0 , yacySeedDB . commonHashLength ) ;
2005-04-07 21:19:42 +02:00
hs . addScore ( h , ( int ) f . length ( ) ) ;
}
2005-08-02 21:40:29 +02:00
2005-04-07 21:19:42 +02:00
// list the hashes in reverse order
serverLog . logInfo ( " GEN-WORDSTAT " , " listing words in reverse size order... " ) ;
String w ;
Iterator i = hs . scores ( false ) ;
while ( i . hasNext ( ) ) {
h = ( String ) i . next ( ) ;
w = ( String ) words . get ( h ) ;
if ( w = = null ) System . out . print ( " # " + h ) ; else System . out . print ( w ) ;
System . out . println ( " - " + hs . getScore ( h ) ) ;
}
2005-08-02 21:40:29 +02:00
2005-04-07 21:19:42 +02:00
// finished
2005-08-30 23:10:39 +02:00
serverLog . logConfig ( " GEN-WORDSTAT " , " FINISHED " ) ;
2005-04-07 21:19:42 +02:00
}
2005-08-02 21:40:29 +02:00
/ * *
* Migrates the PLASMA WORDS structure to the assortment cache if possible .
* This method will be called if you start yacy with the argument
* - migratewords .
* Caution : This might take a long time to finish . Don ' t interrupt it !
* FIXME : Shouldn ' t this method be private ?
*
* @param homePath Root - path where all the information is to be found .
* /
2005-06-15 03:22:07 +02:00
public static void migrateWords ( String homePath ) {
// run with "java -classpath classes yacy -migratewords"
2006-06-28 11:04:53 +02:00
try { serverLog . configureLogging ( new File ( homePath , " DATA/LOG/yacy.logging " ) ) ; } catch ( Exception e ) { }
2005-06-15 03:22:07 +02:00
File dbroot = new File ( new File ( homePath ) , " DATA/PLASMADB " ) ;
2006-10-19 23:14:37 +02:00
File indexRoot = new File ( new File ( homePath ) , " DATA/INDEX " ) ;
2005-12-07 00:51:29 +01:00
serverLog log = new serverLog ( " WORDMIGRATION " ) ;
log . logInfo ( " STARTING MIGRATION " ) ;
2006-12-05 03:47:51 +01:00
plasmaWordIndex wordIndexCache = new plasmaWordIndex ( indexRoot , 60000000 , 60000000 , 10000 , log ) ;
2005-12-07 00:51:29 +01:00
enumerateFiles words = new enumerateFiles ( new File ( dbroot , " WORDS " ) , true , false , true , true ) ;
String wordhash ;
File wordfile ;
2006-12-05 03:47:51 +01:00
int migrationCount ;
2005-12-07 00:51:29 +01:00
while ( words . hasMoreElements ( ) )
try {
2005-07-20 02:39:06 +02:00
wordfile = ( File ) words . nextElement ( ) ;
wordhash = wordfile . getName ( ) . substring ( 0 , 12 ) ;
2005-12-07 00:51:29 +01:00
// System.out.println("NOW: " + wordhash);
2006-12-05 03:47:51 +01:00
migrationCount = migrateWords2index ( dbroot , wordhash , wordIndexCache ) ;
if ( migrationCount > = 0 ) {
2005-11-15 12:55:09 +01:00
if ( migrationCount = = 0 )
log . logInfo ( " SKIPPED " + wordhash + " : empty " ) ;
else if ( migrationCount > 0 )
log . logInfo ( " MIGRATED " + wordhash + " : " + migrationCount + " entries " ) ;
else
2005-12-07 00:51:29 +01:00
log . logInfo ( " REVERSED " + wordhash + " : " + ( - migrationCount ) + " entries " ) ;
2006-12-05 03:47:51 +01:00
} else {
log . logInfo ( " SKIPPED " + wordhash ) ;
2005-11-15 12:55:09 +01:00
}
2005-06-21 03:17:25 +02:00
} catch ( Exception e ) {
2006-04-20 10:20:12 +02:00
log . logSevere ( " Exception " , e ) ;
2005-06-15 03:22:07 +02:00
}
2005-12-07 00:51:29 +01:00
log . logInfo ( " FINISHED MIGRATION JOB, WAIT FOR DUMP " ) ;
2006-12-05 03:47:51 +01:00
wordIndexCache . close ( ) ;
log . logInfo ( " TERMINATED MIGRATION " ) ;
}
public static int migrateWords2index ( File oldDatabaseRoot , String wordhash , plasmaWordIndex wi ) throws IOException {
// returns the number of entries that had been added to the assortments
// can be negative if some assortments have been moved to the backend
File db = plasmaWordIndexFile . wordHash2path ( oldDatabaseRoot , wordhash ) ;
if ( ! ( db . exists ( ) ) ) {
serverLog . logSevere ( " migrateWordIndex " , " word index file for hash " + wordhash + " not found " ) ;
return - 1 ;
}
plasmaWordIndexFile entity = null ;
try {
2006-12-06 13:51:46 +01:00
entity = new plasmaWordIndexFile ( oldDatabaseRoot , wordhash ) ;
2006-12-05 03:47:51 +01:00
int size = entity . size ( ) ;
indexContainer container = new indexContainer ( wordhash , indexRWIEntryNew . urlEntryRow ) ;
try {
Iterator entries = entity . elements ( true ) ;
indexRWIEntry entry ;
while ( entries . hasNext ( ) ) {
entry = ( indexRWIEntry ) entries . next ( ) ;
// System.out.println("ENTRY = " + entry.getUrlHash());
2006-12-07 03:40:57 +01:00
container . add ( entry , System . currentTimeMillis ( ) ) ;
2006-12-05 03:47:51 +01:00
}
// we have read all elements, now delete the entity
entity . deleteComplete ( ) ;
entity . close ( ) ;
entity = null ;
wi . addEntries ( container , container . updated ( ) , false ) ;
return size ;
} catch ( kelondroException e ) {
// database corrupted, we simply give up the database and delete it
try { entity . close ( ) ; } catch ( Exception ee ) { }
entity = null ;
try { db . delete ( ) ; } catch ( Exception ee ) { }
serverLog . logSevere ( " migrateWordIndex " , " database for hash " + wordhash + " corrupted; deleted " ) ;
return - 1 ;
}
} finally {
if ( entity ! = null ) try { entity . close ( ) ; } catch ( Exception e ) { }
}
}
public static void migrateAssortments ( String homePath ) {
// run with "java -classpath classes yacy -migrateassortments"
try { serverLog . configureLogging ( new File ( homePath , " DATA/LOG/yacy.logging " ) ) ; } catch ( Exception e ) { }
serverLog log = new serverLog ( " ASSORTMENTMIGRATION " ) ;
File aclusterroot = new File ( new File ( homePath ) , " DATA/PLASMADB/ACLUSTER " ) ;
File indexRoot = new File ( new File ( homePath ) , " DATA/INDEX " ) ;
plasmaWordIndex wordIndexCache = new plasmaWordIndex ( indexRoot , 60000000 , 60000000 , 10000 , log ) ;
log . logInfo ( " STARTING MIGRATION " ) ;
String [ ] a = aclusterroot . list ( ) ;
AssortmentImporter importer = new AssortmentImporter ( wordIndexCache ) ;
for ( int i = a . length - 1 ; i > = 0 ; i - - ) {
if ( a [ i ] . startsWith ( " indexAssortment " ) ) {
importer . init ( new File ( aclusterroot , a [ i ] ) , 16000000 , 2000 ) ;
importer . run ( ) ;
}
}
log . logInfo ( " FINISHED MIGRATION JOB, WAIT FOR DUMP " ) ;
wordIndexCache . close ( ) ;
2005-12-07 00:51:29 +01:00
log . logInfo ( " TERMINATED MIGRATION " ) ;
2006-06-09 07:38:59 +02:00
}
2005-10-13 14:31:32 +02:00
2006-06-09 07:38:59 +02:00
/ * *
* @param homePath path to the YaCy directory
* @param dbcache cache size in MB
* /
2006-04-20 10:20:12 +02:00
public static void minimizeUrlDB ( String homePath , int dbcache ) {
2006-02-21 15:10:00 +01:00
// run with "java -classpath classes yacy -minimizeUrlDB"
2006-06-28 11:04:53 +02:00
try { serverLog . configureLogging ( new File ( homePath , " DATA/LOG/yacy.logging " ) ) ; } catch ( Exception e ) { }
2006-10-19 23:14:37 +02:00
File indexRoot = new File ( new File ( homePath ) , " DATA/INDEX " ) ;
2007-01-31 10:22:22 +01:00
File indexRoot2 = new File ( new File ( homePath ) , " DATA/INDEX2 " ) ;
2006-04-20 10:20:12 +02:00
serverLog log = new serverLog ( " URL-CLEANUP " ) ;
2005-10-05 12:45:33 +02:00
try {
log . logInfo ( " STARTING URL CLEANUP " ) ;
// db containing all currently loades urls
2006-06-09 07:38:59 +02:00
int cache = dbcache * 1024 ; // in KB
2006-04-20 10:20:12 +02:00
log . logFine ( " URLDB-Caches: " + cache + " bytes " ) ;
2006-12-05 03:47:51 +01:00
plasmaCrawlLURL currentUrlDB = new plasmaCrawlLURL ( indexRoot , cache , 10000 ) ;
2005-10-05 12:45:33 +02:00
// db used to hold all neede urls
2007-01-31 10:22:22 +01:00
plasmaCrawlLURL minimizedUrlDB = new plasmaCrawlLURL ( indexRoot2 , cache , 10000 ) ;
2005-10-05 12:45:33 +02:00
Runtime rt = Runtime . getRuntime ( ) ;
2006-12-05 03:47:51 +01:00
int cacheMem = ( int ) ( serverMemory . max - rt . totalMemory ( ) ) ;
if ( cacheMem < 2048000 ) throw new OutOfMemoryError ( " Not enough memory available to start clean up. " ) ;
2006-06-09 07:38:59 +02:00
2006-12-05 03:47:51 +01:00
plasmaWordIndex wordIndex = new plasmaWordIndex ( indexRoot , cacheMem , cacheMem , 10000 , log ) ;
2007-01-31 10:22:22 +01:00
Iterator indexContainerIterator = wordIndex . wordContainers ( " AAAAAAAAAAAA " , false , false ) ;
2005-10-05 12:45:33 +02:00
long urlCounter = 0 , wordCounter = 0 ;
long wordChunkStart = System . currentTimeMillis ( ) , wordChunkEnd = 0 ;
2007-01-31 10:22:22 +01:00
String wordChunkStartHash = " AAAAAAAAAAAA " , wordChunkEndHash ;
2005-10-05 12:45:33 +02:00
2006-07-26 13:21:51 +02:00
while ( indexContainerIterator . hasNext ( ) ) {
2006-05-28 03:09:31 +02:00
indexContainer wordIdxContainer = null ;
2005-10-05 12:45:33 +02:00
try {
wordCounter + + ;
2006-07-26 13:21:51 +02:00
wordIdxContainer = ( indexContainer ) indexContainerIterator . next ( ) ;
2005-10-05 12:45:33 +02:00
// the combined container will fit, read the container
2006-02-25 09:42:45 +01:00
Iterator wordIdxEntries = wordIdxContainer . entries ( ) ;
2006-11-08 17:17:47 +01:00
indexRWIEntry iEntry ;
2005-10-13 15:57:15 +02:00
while ( wordIdxEntries . hasNext ( ) ) {
2006-11-08 17:17:47 +01:00
iEntry = ( indexRWIEntry ) wordIdxEntries . next ( ) ;
2006-08-02 21:59:28 +02:00
String urlHash = iEntry . urlHash ( ) ;
2005-12-15 11:31:00 +01:00
if ( ( currentUrlDB . exists ( urlHash ) ) & & ( ! minimizedUrlDB . exists ( urlHash ) ) ) try {
2006-11-08 17:17:47 +01:00
indexURLEntry urlEntry = currentUrlDB . load ( urlHash , null ) ;
2005-12-15 11:31:00 +01:00
urlCounter + + ;
2006-10-16 17:04:16 +02:00
minimizedUrlDB . store ( urlEntry ) ;
2005-10-05 12:45:33 +02:00
if ( urlCounter % 500 = = 0 ) {
log . logInfo ( urlCounter + " URLs found so far. " ) ;
}
2005-12-15 11:31:00 +01:00
} catch ( IOException e ) { }
2005-10-05 12:45:33 +02:00
}
if ( wordCounter % 500 = = 0 ) {
2006-07-26 13:21:51 +02:00
wordChunkEndHash = wordIdxContainer . getWordHash ( ) ;
2005-10-05 12:45:33 +02:00
wordChunkEnd = System . currentTimeMillis ( ) ;
long duration = wordChunkEnd - wordChunkStart ;
log . logInfo ( wordCounter + " words scanned " +
" [ " + wordChunkStartHash + " .. " + wordChunkEndHash + " ] \ n " +
" Duration: " + 500 * 1000 / duration + " words/s " +
" | Free memory: " + rt . freeMemory ( ) +
" | Total memory: " + rt . totalMemory ( ) ) ;
wordChunkStart = wordChunkEnd ;
wordChunkStartHash = wordChunkEndHash ;
}
2006-07-26 13:21:51 +02:00
// we have read all elements, now we can close it
wordIdxContainer = null ;
2005-10-05 12:45:33 +02:00
} catch ( Exception e ) {
2006-04-20 10:20:12 +02:00
log . logSevere ( " Exception " , e ) ;
2005-10-05 12:45:33 +02:00
} finally {
2006-02-25 09:42:45 +01:00
if ( wordIdxContainer ! = null ) try { wordIdxContainer = null ; } catch ( Exception e ) { }
2005-10-05 12:45:33 +02:00
}
}
2006-10-22 09:09:45 +02:00
log . logInfo ( " current LURL DB contains " + currentUrlDB . size ( ) + " entries. " ) ;
log . logInfo ( " mimimized LURL DB contains " + minimizedUrlDB . size ( ) + " entries. " ) ;
2005-10-05 12:45:33 +02:00
currentUrlDB . close ( ) ;
minimizedUrlDB . close ( ) ;
2006-12-05 03:47:51 +01:00
wordIndex . close ( ) ;
2005-10-05 12:45:33 +02:00
2006-06-09 07:38:59 +02:00
// TODO: rename the mimimized UrlDB to the name of the previous UrlDB
2005-10-05 12:45:33 +02:00
log . logInfo ( " FINISHED URL CLEANUP, WAIT FOR DUMP " ) ;
2006-09-14 12:12:41 +02:00
log . logInfo ( " You can now backup your old URL DB and rename minimized/urlHash.db to urlHash.db " ) ;
2006-06-09 07:38:59 +02:00
2005-10-05 12:45:33 +02:00
log . logInfo ( " TERMINATED URL CLEANUP " ) ;
2006-06-09 07:38:59 +02:00
} catch ( Exception e ) {
log . logSevere ( " Exception: " + e . getMessage ( ) , e ) ;
} catch ( Error e ) {
log . logSevere ( " Error: " + e . getMessage ( ) , e ) ;
2005-10-05 12:45:33 +02:00
}
}
2005-08-02 21:40:29 +02:00
/ * *
* Reads all words from the given file and creates a hashmap , where key is
* the plasma word hash and value is the word itself .
*
* @param wordlist File where the words are stored .
* @return HashMap with the hash - word - relation .
* /
2005-04-07 21:19:42 +02:00
private static HashMap loadWordMap ( File wordlist ) {
2005-05-11 11:44:36 +02:00
// returns a hash-word - Relation
2005-04-07 21:19:42 +02:00
HashMap wordmap = new HashMap ( ) ;
try {
String word ;
BufferedReader br = new BufferedReader ( new InputStreamReader ( new FileInputStream ( wordlist ) ) ) ;
2006-11-23 03:16:30 +01:00
while ( ( word = br . readLine ( ) ) ! = null ) wordmap . put ( plasmaCondenser . word2hash ( word ) , word ) ;
2005-04-07 21:19:42 +02:00
br . close ( ) ;
} catch ( IOException e ) { }
return wordmap ;
}
2005-08-02 21:40:29 +02:00
/ * *
* Reads all words from the given file and creats as HashSet , which contains
* all found words .
*
* @param wordlist File where the words are stored .
* @return HashSet with the words
* /
2005-04-07 21:19:42 +02:00
private static HashSet loadWordSet ( File wordlist ) {
2005-05-11 11:44:36 +02:00
// returns a set of words
2005-04-07 21:19:42 +02:00
HashSet wordset = new HashSet ( ) ;
try {
String word ;
BufferedReader br = new BufferedReader ( new InputStreamReader ( new FileInputStream ( wordlist ) ) ) ;
while ( ( word = br . readLine ( ) ) ! = null ) wordset . add ( word ) ;
br . close ( ) ;
} catch ( IOException e ) { }
return wordset ;
}
2005-08-02 21:40:29 +02:00
/ * *
* Cleans a wordlist in a file according to the length of the words . The
* file with the given filename is read and then only the words in the given
* length - range are written back to the file .
*
* @param wordlist Name of the file the words are stored in .
* @param minlength Minimal needed length for each word to be stored .
* @param maxlength Maximal allowed length for each word to be stored .
* /
2005-04-07 21:19:42 +02:00
private static void cleanwordlist ( String wordlist , int minlength , int maxlength ) {
// start up
System . out . println ( copyright ) ;
System . out . println ( hline ) ;
2005-08-30 23:10:39 +02:00
serverLog . logConfig ( " CLEAN-WORDLIST " , " START " ) ;
2005-08-02 21:40:29 +02:00
2005-04-07 21:19:42 +02:00
String word ;
TreeSet wordset = new TreeSet ( ) ;
int count = 0 ;
try {
BufferedReader br = new BufferedReader ( new InputStreamReader ( new FileInputStream ( wordlist ) ) ) ;
2005-05-11 11:44:36 +02:00
String seps = " ' .,:/-& " ;
2005-04-07 21:19:42 +02:00
while ( ( word = br . readLine ( ) ) ! = null ) {
word = word . toLowerCase ( ) . trim ( ) ;
2005-05-11 11:44:36 +02:00
for ( int i = 0 ; i < seps . length ( ) ; i + + ) {
2005-04-07 21:19:42 +02:00
if ( word . indexOf ( seps . charAt ( i ) ) > = 0 ) word = word . substring ( 0 , word . indexOf ( seps . charAt ( i ) ) ) ;
2005-05-11 11:44:36 +02:00
}
2005-04-07 21:19:42 +02:00
if ( ( word . length ( ) > = minlength ) & & ( word . length ( ) < = maxlength ) ) wordset . add ( word ) ;
count + + ;
}
br . close ( ) ;
2005-08-02 21:40:29 +02:00
2005-04-07 21:19:42 +02:00
if ( wordset . size ( ) ! = count ) {
count = count - wordset . size ( ) ;
BufferedWriter bw = new BufferedWriter ( new PrintWriter ( new FileWriter ( wordlist ) ) ) ;
while ( wordset . size ( ) > 0 ) {
word = ( String ) wordset . first ( ) ;
bw . write ( word + " \ n " ) ;
wordset . remove ( word ) ;
}
bw . close ( ) ;
serverLog . logInfo ( " CLEAN-WORDLIST " , " shrinked wordlist by " + count + " words. " ) ;
} else {
serverLog . logInfo ( " CLEAN-WORDLIST " , " not necessary to change wordlist " ) ;
}
} catch ( IOException e ) {
2005-08-30 23:32:59 +02:00
serverLog . logSevere ( " CLEAN-WORDLIST " , " ERROR: " + e . getMessage ( ) ) ;
2005-04-07 21:19:42 +02:00
System . exit ( - 1 ) ;
}
2005-08-02 21:40:29 +02:00
2005-04-07 21:19:42 +02:00
// finished
2005-08-30 23:10:39 +02:00
serverLog . logConfig ( " CLEAN-WORDLIST " , " FINISHED " ) ;
2005-04-07 21:19:42 +02:00
}
2005-08-02 21:40:29 +02:00
/ * *
* Gets all words from the stopword - list and removes them in the databases .
* FIXME : Really ? Don ' t know if I read this correctly .
*
* @param homePath Root - Path where all information is to be found .
* /
2005-04-07 21:19:42 +02:00
private static void deleteStopwords ( String homePath ) {
// start up
System . out . println ( copyright ) ;
System . out . println ( hline ) ;
2005-08-30 23:10:39 +02:00
serverLog . logConfig ( " DELETE-STOPWORDS " , " START " ) ;
2005-08-02 21:40:29 +02:00
2005-04-07 21:19:42 +02:00
Properties config = configuration ( " DELETE-STOPWORDS " , homePath ) ;
File dbRoot = new File ( homePath , config . getProperty ( " dbPath " ) ) ;
2005-08-02 21:40:29 +02:00
2005-04-07 21:19:42 +02:00
// load stopwords
HashSet stopwords = loadWordSet ( new File ( homePath , " yacy.stopwords " ) ) ;
serverLog . logInfo ( " DELETE-STOPWORDS " , " loaded stopwords, " + stopwords . size ( ) + " entries in list, starting scanning " ) ;
2005-08-02 21:40:29 +02:00
2005-04-07 21:19:42 +02:00
// find all hashes
File f ;
String w ;
2005-05-11 11:44:36 +02:00
int count = 0 ;
long thisamount , totalamount = 0 ;
Iterator i = stopwords . iterator ( ) ;
2005-04-07 21:19:42 +02:00
while ( i . hasNext ( ) ) {
2005-05-11 11:44:36 +02:00
w = ( String ) i . next ( ) ;
2006-11-23 03:16:30 +01:00
f = plasmaWordIndexFile . wordHash2path ( dbRoot , plasmaCondenser . word2hash ( w ) ) ;
2005-05-11 11:44:36 +02:00
if ( f . exists ( ) ) {
thisamount = f . length ( ) ;
if ( f . delete ( ) ) {
count + + ;
totalamount + = thisamount ;
serverLog . logInfo ( " DELETE-STOPWORDS " , " deleted index for word ' " + w + " ', " + thisamount + " bytes " ) ;
}
}
2005-04-07 21:19:42 +02:00
}
2005-08-02 21:40:29 +02:00
2005-05-11 11:44:36 +02:00
serverLog . logInfo ( " DELETE-STOPWORDS " , " TOTALS: deleted " + count + " indexes; " + ( totalamount / 1024 ) + " kbytes " ) ;
2005-08-02 21:40:29 +02:00
2005-04-07 21:19:42 +02:00
// finished
2005-08-30 23:10:39 +02:00
serverLog . logConfig ( " DELETE-STOPWORDS " , " FINISHED " ) ;
2005-04-07 21:19:42 +02:00
}
2005-08-02 21:40:29 +02:00
2005-11-11 00:48:20 +01:00
private static void transferCR ( String targetaddress , String crfile ) {
File f = new File ( crfile ) ;
try {
byte [ ] b = serverFileUtils . read ( f ) ;
String result = yacyClient . transfer ( targetaddress , f . getName ( ) , b ) ;
if ( result = = null )
serverLog . logInfo ( " TRANSFER-CR " , " transmitted file " + crfile + " to " + targetaddress + " successfully " ) ;
else
serverLog . logInfo ( " TRANSFER-CR " , " error transmitting file " + crfile + " to " + targetaddress + " : " + result ) ;
} catch ( IOException e ) {
serverLog . logInfo ( " TRANSFER-CR " , " could not read file " + crfile ) ;
}
}
2006-05-26 14:18:12 +02:00
/ * *
2006-03-13 21:12:31 +01:00
* Generates a text file containing all domains in this peer ' s DB .
2006-05-26 14:18:12 +02:00
* This may be useful to calculate the YaCy - Blockrank .
2006-03-13 21:12:31 +01:00
*
2006-05-26 14:18:12 +02:00
* @param format String which determines the format of the file . Possible values : " html " , " zip " , " gzip " or " plain "
* @see urllist
2006-03-13 21:12:31 +01:00
* /
2006-07-24 10:08:33 +02:00
private static void domlist ( String homePath , String source , String format , String targetName ) {
2006-03-13 21:12:31 +01:00
File root = new File ( homePath ) ;
2005-11-21 02:30:30 +01:00
try {
2006-12-05 03:47:51 +01:00
final plasmaSwitchboard sb = new plasmaSwitchboard ( homePath , " yacy.init " , " DATA/SETTINGS/httpProxy.conf " ) ;
2006-07-24 10:08:33 +02:00
HashMap doms = new HashMap ( ) ;
2006-12-05 03:47:51 +01:00
System . out . println ( " Started domain list extraction from " + sb . wordIndex . loadedURL . size ( ) + " url entries. " ) ;
2006-07-22 12:56:40 +02:00
System . out . println ( " a dump will be written after double-check of all extracted domains. " ) ;
System . out . println ( " This process may fail in case of too less memory. To increase memory, start with " ) ;
2006-07-24 18:40:59 +02:00
System . out . println ( " java -Xmx<megabytes>m -classpath classes yacy -domlist [ -source { nurl | lurl | eurl } ] [ -format { text | zip | gzip | html } ] [ <path to DATA folder> ] " ) ;
2006-07-22 12:56:40 +02:00
int c = 0 ;
2006-07-22 13:33:01 +02:00
long start = System . currentTimeMillis ( ) ;
2006-07-24 10:08:33 +02:00
if ( source . equals ( " lurl " ) ) {
2006-12-05 03:47:51 +01:00
Iterator eiter = sb . wordIndex . loadedURL . entries ( true , false , null ) ;
2006-11-08 17:17:47 +01:00
indexURLEntry entry ;
2006-07-24 10:08:33 +02:00
while ( eiter . hasNext ( ) ) {
try {
2006-11-08 17:17:47 +01:00
entry = ( indexURLEntry ) eiter . next ( ) ;
indexURLEntry . Components comp = entry . comp ( ) ;
2006-10-19 00:25:07 +02:00
if ( ( entry ! = null ) & & ( comp . url ( ) ! = null ) ) doms . put ( comp . url ( ) . getHost ( ) , null ) ;
2006-07-24 10:08:33 +02:00
} catch ( Exception e ) {
// here a MalformedURLException may occur
// just ignore
}
c + + ;
if ( c % 10000 = = 0 ) System . out . println (
c + " urls checked, " +
doms . size ( ) + " domains collected, " +
( ( Runtime . getRuntime ( ) . maxMemory ( ) - Runtime . getRuntime ( ) . totalMemory ( ) + Runtime . getRuntime ( ) . freeMemory ( ) ) / 1024 / 1024 ) + " MB available, " +
2006-12-05 03:47:51 +01:00
( ( System . currentTimeMillis ( ) - start ) * ( sb . wordIndex . loadedURL . size ( ) - c ) / c / 60000 ) + " minutes remaining. " ) ;
2006-07-22 12:00:21 +02:00
}
2006-07-24 10:08:33 +02:00
}
if ( source . equals ( " eurl " ) ) {
2006-12-05 03:47:51 +01:00
Iterator eiter = sb . errorURL . entries ( true , false , null ) ;
2006-07-24 10:08:33 +02:00
plasmaCrawlEURL . Entry entry ;
while ( eiter . hasNext ( ) ) {
try {
entry = ( plasmaCrawlEURL . Entry ) eiter . next ( ) ;
if ( ( entry ! = null ) & & ( entry . url ( ) ! = null ) ) doms . put ( entry . url ( ) . getHost ( ) , entry . failreason ( ) ) ;
} catch ( Exception e ) {
// here a MalformedURLException may occur
// just ignore
}
c + + ;
if ( c % 10000 = = 0 ) System . out . println (
2006-07-22 13:33:01 +02:00
c + " urls checked, " +
doms . size ( ) + " domains collected, " +
2006-07-22 13:43:56 +02:00
( ( Runtime . getRuntime ( ) . maxMemory ( ) - Runtime . getRuntime ( ) . totalMemory ( ) + Runtime . getRuntime ( ) . freeMemory ( ) ) / 1024 / 1024 ) + " MB available, " +
2006-12-05 03:47:51 +01:00
( ( System . currentTimeMillis ( ) - start ) * ( sb . wordIndex . loadedURL . size ( ) - c ) / c / 60000 ) + " minutes remaining. " ) ;
2006-07-22 13:33:01 +02:00
}
2005-11-21 02:30:30 +01:00
}
2006-07-24 18:40:59 +02:00
if ( source . equals ( " nurl " ) ) {
2006-12-05 03:47:51 +01:00
Iterator eiter = sb . noticeURL . entries ( true , false , null ) ;
2006-07-24 18:40:59 +02:00
plasmaCrawlNURL . Entry entry ;
while ( eiter . hasNext ( ) ) {
try {
entry = ( plasmaCrawlNURL . Entry ) eiter . next ( ) ;
if ( ( entry ! = null ) & & ( entry . url ( ) ! = null ) ) doms . put ( entry . url ( ) . getHost ( ) , " profile= " + entry . profileHandle ( ) + " , depth= " + entry . depth ( ) ) ;
} catch ( Exception e ) {
// here a MalformedURLException may occur
// just ignore
}
c + + ;
if ( c % 10000 = = 0 ) System . out . println (
c + " urls checked, " +
doms . size ( ) + " domains collected, " +
( ( Runtime . getRuntime ( ) . maxMemory ( ) - Runtime . getRuntime ( ) . totalMemory ( ) + Runtime . getRuntime ( ) . freeMemory ( ) ) / 1024 / 1024 ) + " MB available, " +
2006-12-05 03:47:51 +01:00
( ( System . currentTimeMillis ( ) - start ) * ( sb . wordIndex . loadedURL . size ( ) - c ) / c / 60000 ) + " minutes remaining. " ) ;
2006-07-24 18:40:59 +02:00
}
}
2005-12-07 02:40:52 +01:00
2005-12-17 22:19:51 +01:00
if ( format . equals ( " html " ) ) {
2006-07-22 12:56:40 +02:00
// output file in HTML format
2005-12-17 16:43:13 +01:00
File file = new File ( root , targetName + " .html " ) ;
BufferedOutputStream bos = new BufferedOutputStream ( new FileOutputStream ( file ) ) ;
2006-07-22 12:56:40 +02:00
System . out . println ( " Started domain list dump to file " + file ) ;
2006-07-24 10:08:33 +02:00
Iterator i = doms . entrySet ( ) . iterator ( ) ;
Map . Entry entry ;
2005-12-17 16:43:13 +01:00
String key ;
2006-03-13 21:12:31 +01:00
bos . write ( ( " <!DOCTYPE HTML PUBLIC \" -//W3C//DTD HTML 4.01 Transitional//EN \" \" http://www.w3.org/TR/html4/loose.dtd \" > " ) . getBytes ( ) ) ;
bos . write ( serverCore . crlf ) ;
2006-07-24 10:08:33 +02:00
bos . write ( ( " <html><head><title>YaCy " + source + " domainlist</title></head><body> " ) . getBytes ( ) ) ;
2006-03-13 21:12:31 +01:00
bos . write ( serverCore . crlf ) ;
2005-12-17 16:43:13 +01:00
while ( i . hasNext ( ) ) {
2006-07-24 10:08:33 +02:00
entry = ( Map . Entry ) i . next ( ) ;
key = ( String ) entry . getKey ( ) ;
bos . write ( ( " <a href= \" http:// " + key + " \" > " + key + " </a> " +
2006-07-24 18:40:59 +02:00
( ( entry . getValue ( ) = = null ) ? " " : ( " " + ( ( String ) entry . getValue ( ) ) ) ) + " <br> "
2006-07-24 10:08:33 +02:00
) . getBytes ( ) ) ;
2005-12-17 16:43:13 +01:00
bos . write ( serverCore . crlf ) ;
}
2006-03-13 21:12:31 +01:00
bos . write ( ( " </body></html> " ) . getBytes ( ) ) ;
2005-12-17 16:43:13 +01:00
bos . close ( ) ;
2006-07-22 12:56:40 +02:00
2005-12-17 22:19:51 +01:00
} else if ( format . equals ( " zip " ) ) {
2006-07-22 12:56:40 +02:00
// output file in plain text but compressed with ZIP
2005-12-17 16:43:13 +01:00
File file = new File ( root , targetName + " .zip " ) ;
2006-07-22 12:56:40 +02:00
System . out . println ( " Started domain list dump to file " + file ) ;
2006-07-24 10:08:33 +02:00
serverFileUtils . saveSet ( file , " zip " , doms . keySet ( ) , new String ( serverCore . crlf ) ) ;
2006-07-22 12:56:40 +02:00
2006-03-13 21:12:31 +01:00
} else if ( format . equals ( " gzip " ) ) {
2006-07-22 12:56:40 +02:00
// output file in plain text but compressed with GZIP
2006-03-13 21:12:31 +01:00
File file = new File ( root , targetName + " .txt.gz " ) ;
2006-07-22 12:56:40 +02:00
System . out . println ( " Started domain list dump to file " + file ) ;
2006-07-24 10:08:33 +02:00
serverFileUtils . saveSet ( file , " gzip " , doms . keySet ( ) , new String ( serverCore . crlf ) ) ;
} else {
2005-12-17 22:19:51 +01:00
// plain text list
2006-07-24 00:39:41 +02:00
File file = new File ( root , targetName + " .txt " ) ;
System . out . println ( " Started domain list dump to file " + file ) ;
2006-07-24 10:08:33 +02:00
serverFileUtils . saveSet ( file , " plain " , doms . keySet ( ) , new String ( serverCore . crlf ) ) ;
2005-12-17 22:19:51 +01:00
}
2006-12-05 03:47:51 +01:00
sb . close ( ) ;
2005-12-07 02:40:52 +01:00
} catch ( IOException e ) {
e . printStackTrace ( ) ;
}
}
2006-07-24 10:08:33 +02:00
private static void urllist ( String homePath , String source , boolean html , String targetName ) {
2005-12-07 02:40:52 +01:00
File root = new File ( homePath ) ;
try {
2006-12-05 03:47:51 +01:00
final plasmaSwitchboard sb = new plasmaSwitchboard ( homePath , " yacy.init " , " DATA/SETTINGS/httpProxy.conf " ) ;
2005-12-07 02:40:52 +01:00
File file = new File ( root , targetName ) ;
BufferedOutputStream bos = new BufferedOutputStream ( new FileOutputStream ( file ) ) ;
2006-07-24 10:08:33 +02:00
if ( source . equals ( " lurl " ) ) {
2006-12-05 03:47:51 +01:00
Iterator eiter = sb . wordIndex . loadedURL . entries ( true , false , null ) ;
2006-11-08 17:17:47 +01:00
indexURLEntry entry ;
2006-07-24 10:08:33 +02:00
while ( eiter . hasNext ( ) ) {
2006-11-08 17:17:47 +01:00
entry = ( indexURLEntry ) eiter . next ( ) ;
indexURLEntry . Components comp = entry . comp ( ) ;
2006-10-19 00:25:07 +02:00
if ( ( entry ! = null ) & & ( comp . url ( ) ! = null ) ) {
2006-07-24 10:08:33 +02:00
if ( html ) {
2006-10-19 00:25:07 +02:00
bos . write ( ( " <a href= \" " + comp . url ( ) . toNormalform ( ) + " \" > " + comp . descr ( ) + " </a><br> " ) . getBytes ( " UTF-8 " ) ) ;
2006-07-24 10:08:33 +02:00
bos . write ( serverCore . crlf ) ;
} else {
2006-10-19 00:25:07 +02:00
bos . write ( comp . url ( ) . toNormalform ( ) . getBytes ( ) ) ;
2006-07-24 10:08:33 +02:00
bos . write ( serverCore . crlf ) ;
}
}
}
}
if ( source . equals ( " eurl " ) ) {
2006-12-05 03:47:51 +01:00
Iterator eiter = sb . errorURL . entries ( true , false , null ) ;
2006-07-24 10:08:33 +02:00
plasmaCrawlEURL . Entry entry ;
while ( eiter . hasNext ( ) ) {
entry = ( plasmaCrawlEURL . Entry ) eiter . next ( ) ;
if ( ( entry ! = null ) & & ( entry . url ( ) ! = null ) ) {
if ( html ) {
bos . write ( ( " <a href= \" " + entry . url ( ) + " \" > " + entry . url ( ) + " </a> " + entry . failreason ( ) + " <br> " ) . getBytes ( " UTF-8 " ) ) ;
bos . write ( serverCore . crlf ) ;
} else {
bos . write ( entry . url ( ) . toString ( ) . getBytes ( ) ) ;
bos . write ( serverCore . crlf ) ;
}
2005-12-07 02:40:52 +01:00
}
}
}
2006-07-24 18:40:59 +02:00
if ( source . equals ( " nurl " ) ) {
2006-12-05 03:47:51 +01:00
Iterator eiter = sb . noticeURL . entries ( true , false , null ) ;
2006-07-24 18:40:59 +02:00
plasmaCrawlNURL . Entry entry ;
while ( eiter . hasNext ( ) ) {
entry = ( plasmaCrawlNURL . Entry ) eiter . next ( ) ;
if ( ( entry ! = null ) & & ( entry . url ( ) ! = null ) ) {
if ( html ) {
bos . write ( ( " <a href= \" " + entry . url ( ) + " \" > " + entry . url ( ) + " </a> " + " profile= " + entry . profileHandle ( ) + " , depth= " + entry . depth ( ) + " <br> " ) . getBytes ( " UTF-8 " ) ) ;
bos . write ( serverCore . crlf ) ;
} else {
bos . write ( entry . url ( ) . toString ( ) . getBytes ( ) ) ;
bos . write ( serverCore . crlf ) ;
}
}
}
}
2005-12-07 02:40:52 +01:00
bos . close ( ) ;
2006-12-05 03:47:51 +01:00
sb . close ( ) ;
2005-11-21 02:30:30 +01:00
} catch ( IOException e ) {
e . printStackTrace ( ) ;
}
}
2006-12-05 03:47:51 +01:00
private static void migratelurls ( String homePath , File urlHash ) {
final plasmaSwitchboard sb = new plasmaSwitchboard ( homePath , " yacy.init " , " DATA/SETTINGS/httpProxy.conf " ) ;
2006-10-20 15:50:00 +02:00
kelondroTree oldindex = null ;
2006-10-19 23:14:37 +02:00
try {
2006-11-08 17:17:47 +01:00
oldindex = new kelondroTree ( urlHash , 1000 , - 1 , indexURLEntryOld . rowdef ) ;
2006-10-20 15:50:00 +02:00
} catch ( IOException e ) {
System . out . println ( " ERROR: CANNOT OPEN OLD INDEX: " + e . getMessage ( ) ) ;
}
2006-10-13 01:14:41 +02:00
2006-10-20 15:50:00 +02:00
long start = System . currentTimeMillis ( ) ;
long last = start ;
int tc = oldindex . size ( ) , c = 0 ;
Iterator eiter = oldindex . contentRows ( - 1 ) ;
kelondroRow . Entry oldrow ;
2006-11-08 17:17:47 +01:00
indexURLEntry oldentry ;
indexURLEntry newentry ;
indexURLEntry . Components comp ;
2006-10-20 15:50:00 +02:00
byte [ ] dummymd5 = new byte [ 0 ] ;
while ( eiter . hasNext ( ) ) {
2006-10-21 14:23:06 +02:00
try {
oldrow = ( kelondroRow . Entry ) eiter . next ( ) ;
} catch ( Exception e ) {
// an IOException may occur here
2006-10-24 15:48:16 +02:00
//e.printStackTrace();
2006-10-21 14:23:06 +02:00
oldrow = null ;
}
2006-10-20 15:50:00 +02:00
if ( oldrow ! = null ) try {
2006-11-08 17:17:47 +01:00
oldentry = new indexURLEntryOld ( oldrow , null ) ;
2006-10-20 15:50:00 +02:00
comp = oldentry . comp ( ) ;
2006-12-05 03:47:51 +01:00
newentry = sb . wordIndex . loadedURL . newEntry (
2006-10-19 23:14:37 +02:00
comp . url ( ) ,
comp . descr ( ) ,
" " ,
" " ,
" " ,
oldentry . moddate ( ) ,
oldentry . loaddate ( ) ,
oldentry . freshdate ( ) ,
oldentry . referrerHash ( ) ,
dummymd5 ,
oldentry . size ( ) ,
oldentry . wordCount ( ) ,
oldentry . doctype ( ) ,
2006-11-23 03:16:30 +01:00
new kelondroBitfield ( 4 ) ,
2006-10-19 23:14:37 +02:00
oldentry . language ( ) ,
0 , 0 , 0 , 0 , 0 , 0 ) ;
2006-12-05 03:47:51 +01:00
sb . wordIndex . loadedURL . store ( newentry ) ;
2006-10-19 23:14:37 +02:00
c + + ;
2006-10-20 15:50:00 +02:00
} catch ( IOException e ) {
// ignore
}
if ( System . currentTimeMillis ( ) - last > 60000 ) {
System . out . println ( " Migrated " + c + " from " + tc + " urls. Estimated remaining time: " + ( ( System . currentTimeMillis ( ) - start ) * ( tc - c ) / c / 60000 ) + " minutes " ) ;
last = System . currentTimeMillis ( ) ;
2006-10-19 23:14:37 +02:00
}
2006-10-13 01:14:41 +02:00
}
2006-12-05 03:47:51 +01:00
sb . close ( ) ;
2006-10-20 15:50:00 +02:00
try { oldindex . close ( ) ; } catch ( IOException e ) { }
System . out . println ( " MIGRATION OF " + c + " URLs FINISHED " ) ;
2006-10-13 01:14:41 +02:00
}
2005-12-07 02:40:52 +01:00
/**
 * Returns a copy of <code>args</code> from which <code>count</code>
 * consecutive elements, beginning at index <code>pos</code>, are removed.
 * The given array itself is not modified.
 *
 * @param args  the original argument array
 * @param pos   index of the first element to drop
 * @param count number of elements to drop
 * @return a new array with <code>args.length - count</code> elements
 */
private static String[] shift(String[] args, int pos, int count) {
    final String[] remaining = new String[args.length - count];
    final int tailStart = pos + count;
    // everything in front of the removed range keeps its position ...
    System.arraycopy(args, 0, remaining, 0, pos);
    // ... and everything behind it moves up to close the gap
    System.arraycopy(args, tailStart, remaining, pos, args.length - tailStart);
    return remaining;
}
2005-12-07 12:10:08 +01:00
/ * *
* Uses an Iteration over urlHash . db to detect malformed URL - Entries .
* Damaged URL - Entries will be marked in a HashSet and removed at the end of the function .
*
* @param homePath Root - Path where all information is to be found .
* /
private static void urldbcleanup ( String homePath ) {
File root = new File ( homePath ) ;
2006-10-19 23:14:37 +02:00
File indexroot = new File ( root , " DATA/INDEX " ) ;
2006-04-20 10:20:12 +02:00
serverLog log = new serverLog ( " URLDBCLEANUP " ) ;
2006-08-21 08:42:42 +02:00
try { serverLog . configureLogging ( new File ( homePath , " DATA/LOG/yacy.logging " ) ) ; } catch ( Exception e ) { }
2005-12-07 12:10:08 +01:00
try {
2006-12-05 03:47:51 +01:00
plasmaCrawlLURL currentUrlDB = new plasmaCrawlLURL ( indexroot , 4194304 , 10000 ) ;
2006-05-23 10:59:45 +02:00
currentUrlDB . urldbcleanup ( ) ;
2005-12-07 12:10:08 +01:00
currentUrlDB . close ( ) ;
} catch ( IOException e ) {
2006-04-20 10:20:12 +02:00
log . logSevere ( " IOException " , e ) ;
2005-12-07 12:10:08 +01:00
}
}
2006-01-15 11:29:48 +01:00
private static void RWIHashList ( String homePath , String targetName , String resource , String format ) {
2006-01-14 00:59:04 +01:00
plasmaWordIndex WordIndex = null ;
2006-01-04 14:55:45 +01:00
serverLog log = new serverLog ( " HASHLIST " ) ;
File homeDBroot = new File ( new File ( homePath ) , " DATA/PLASMADB " ) ;
2006-10-19 23:14:37 +02:00
File indexRoot = new File ( new File ( homePath ) , " DATA/INDEX " ) ;
2006-12-08 13:57:17 +01:00
String wordChunkStartHash = " AAAAAAAAAAAA " ;
2006-06-28 11:04:53 +02:00
try { serverLog . configureLogging ( new File ( homePath , " DATA/LOG/yacy.logging " ) ) ; } catch ( Exception e ) { }
2006-01-04 14:55:45 +01:00
log . logInfo ( " STARTING CREATION OF RWI-HASHLIST " ) ;
File root = new File ( homePath ) ;
try {
2006-07-26 13:21:51 +02:00
Iterator indexContainerIterator = null ;
2006-01-10 02:04:22 +01:00
if ( resource . equals ( " all " ) ) {
2006-12-05 03:47:51 +01:00
WordIndex = new plasmaWordIndex ( indexRoot , 8 * 1024 * 1024 , 8 * 1024 * 1024 , 3000 , log ) ;
indexContainerIterator = WordIndex . wordContainers ( wordChunkStartHash , false , false ) ;
} else if ( resource . startsWith ( " assortment " ) ) {
2006-01-11 01:32:44 +01:00
int a = Integer . parseInt ( resource . substring ( 10 ) ) ;
2006-07-04 01:57:33 +02:00
plasmaWordIndexAssortment assortment = new plasmaWordIndexAssortment ( new File ( homeDBroot , " ACLUSTER " ) , a , 8 * 1024 * 1024 , 3000 , null ) ;
2006-12-08 13:57:17 +01:00
indexContainerIterator = assortment . wordContainers ( ) ;
2006-01-11 01:43:00 +01:00
} else if ( resource . equals ( " words " ) ) {
2006-12-05 03:47:51 +01:00
plasmaWordIndexFileCluster fileDB = new plasmaWordIndexFileCluster ( homeDBroot ) ;
indexContainerIterator = fileDB . wordContainers ( wordChunkStartHash , false ) ;
}
2006-01-04 14:55:45 +01:00
int counter = 0 ;
2006-07-26 13:21:51 +02:00
indexContainer container = null ;
2006-01-15 11:29:48 +01:00
if ( format . equals ( " zip " ) ) {
log . logInfo ( " Writing Hashlist to ZIP-file: " + targetName + " .zip " ) ;
ZipEntry zipEntry = new ZipEntry ( targetName + " .txt " ) ;
File file = new File ( root , targetName + " .zip " ) ;
ZipOutputStream bos = new ZipOutputStream ( new FileOutputStream ( file ) ) ;
bos . putNextEntry ( zipEntry ) ;
2006-07-26 13:21:51 +02:00
while ( indexContainerIterator . hasNext ( ) ) {
2006-01-15 11:29:48 +01:00
counter + + ;
2006-07-26 13:21:51 +02:00
container = ( indexContainer ) indexContainerIterator . next ( ) ;
bos . write ( ( container . getWordHash ( ) ) . getBytes ( ) ) ;
2006-01-15 11:29:48 +01:00
bos . write ( serverCore . crlf ) ;
if ( counter % 500 = = 0 ) {
2006-07-26 13:21:51 +02:00
log . logInfo ( " Found " + counter + " Hashs until now. Last found Hash: " + container . getWordHash ( ) ) ;
2006-01-15 11:29:48 +01:00
}
}
2007-01-31 10:22:22 +01:00
bos . flush ( ) ;
2006-01-15 11:29:48 +01:00
bos . close ( ) ;
2006-07-26 13:21:51 +02:00
} else {
2006-01-15 11:29:48 +01:00
log . logInfo ( " Writing Hashlist to TXT-file: " + targetName + " .txt " ) ;
File file = new File ( root , targetName + " .txt " ) ;
BufferedOutputStream bos = new BufferedOutputStream ( new FileOutputStream ( file ) ) ;
2006-07-26 13:21:51 +02:00
while ( indexContainerIterator . hasNext ( ) ) {
2006-01-15 11:29:48 +01:00
counter + + ;
2006-07-26 13:21:51 +02:00
container = ( indexContainer ) indexContainerIterator . next ( ) ;
bos . write ( ( container . getWordHash ( ) ) . getBytes ( ) ) ;
2006-01-15 11:29:48 +01:00
bos . write ( serverCore . crlf ) ;
if ( counter % 500 = = 0 ) {
2006-07-26 13:21:51 +02:00
log . logInfo ( " Found " + counter + " Hashs until now. Last found Hash: " + container . getWordHash ( ) ) ;
2006-01-15 11:29:48 +01:00
}
2006-01-04 14:55:45 +01:00
}
2007-01-31 10:22:22 +01:00
bos . flush ( ) ;
2006-01-15 11:29:48 +01:00
bos . close ( ) ;
2006-01-04 14:55:45 +01:00
}
2006-07-26 13:21:51 +02:00
log . logInfo ( " Total number of Hashs: " + counter + " . Last found Hash: " + container . getWordHash ( ) ) ;
2006-01-04 14:55:45 +01:00
} catch ( IOException e ) {
2006-04-20 10:20:12 +02:00
log . logSevere ( " IOException " , e ) ;
2006-01-10 02:04:22 +01:00
}
2006-01-14 00:59:04 +01:00
if ( WordIndex ! = null ) {
2006-12-05 03:47:51 +01:00
WordIndex . close ( ) ;
2006-01-14 00:59:04 +01:00
WordIndex = null ;
}
2006-01-04 14:55:45 +01:00
}
2006-01-30 09:28:22 +01:00
/ * *
2006-05-26 14:18:12 +02:00
* Searching for peers affected by Bug documented in < a href = " http://www.yacy-forum.de/viewtopic.php?p=16056#16056 " > YaCy - Forum Posting 16056 < / a >
2006-01-30 09:28:22 +01:00
* @param homePath
2006-05-26 14:18:12 +02:00
* @see < a href = " http://www.yacy-forum.de/viewtopic.php?p=16056#16056 " > YaCy - Forum Posting 16056 < / a >
2006-01-30 09:28:22 +01:00
* /
public static void testPeerDB ( String homePath ) {
try {
File yacyDBPath = new File ( new File ( homePath ) , " DATA/YACYDB " ) ;
String [ ] dbFileNames = { " seed.new.db " , " seed.old.db " , " seed.pot.db " } ;
for ( int i = 0 ; i < dbFileNames . length ; i + + ) {
File dbFile = new File ( yacyDBPath , dbFileNames [ i ] ) ;
2007-02-02 14:12:31 +01:00
kelondroMapObjects db = new kelondroMapObjects ( new kelondroDyn ( dbFile , ( 1024 * 0x400 ) / 3 , 3000 , yacySeedDB . commonHashLength , 480 , '#' , true , false ) , 500 , yacySeedDB . sortFields , yacySeedDB . longaccFields , yacySeedDB . doubleaccFields , null , null ) ;
2006-01-30 09:28:22 +01:00
2007-01-30 00:51:10 +01:00
kelondroMapObjects . mapIterator it ;
2006-01-30 09:28:22 +01:00
it = db . maps ( true , false ) ;
while ( it . hasNext ( ) ) {
Map dna = ( Map ) it . next ( ) ;
String peerHash = ( String ) dna . get ( " key " ) ;
if ( peerHash . length ( ) < yacySeedDB . commonHashLength ) {
String peerName = ( String ) dna . get ( " Name " ) ;
String peerIP = ( String ) dna . get ( " IP " ) ;
String peerPort = ( String ) dna . get ( " Port " ) ;
while ( peerHash . length ( ) < yacySeedDB . commonHashLength ) { peerHash = peerHash + " _ " ; }
2006-01-30 09:31:14 +01:00
System . err . println ( " Invalid Peer-Hash found in ' " + dbFileNames [ i ] + " ': " + peerName + " : " + peerHash + " , http:// " + peerIP + " : " + peerPort ) ;
2006-01-30 09:28:22 +01:00
}
}
db . close ( ) ;
}
} catch ( Exception e ) {
e . printStackTrace ( ) ;
}
}
2006-01-10 02:04:22 +01:00
2005-08-02 21:40:29 +02:00
/ * *
2005-12-11 01:25:02 +01:00
* Main - method which is started by java . Checks for special arguments or
* starts up the application .
*
* @param args
* Given arguments from the command line .
* /
2005-04-07 21:19:42 +02:00
public static void main ( String args [ ] ) {
2005-10-12 14:34:08 +02:00
2006-10-23 02:59:55 +02:00
// check assertion status
//ClassLoader.getSystemClassLoader().setDefaultAssertionStatus(true);
boolean assertionenabled = false ;
assert assertionenabled = true ;
if ( assertionenabled ) System . out . println ( " Asserts are enabled " ) ;
2005-10-12 14:34:08 +02:00
// check memory amount
2005-09-21 02:12:37 +02:00
System . gc ( ) ;
2006-08-18 03:33:54 +02:00
long startupMemFree = Runtime . getRuntime ( ) . freeMemory ( ) ; // the amount of free memory in the Java Virtual Machine
2005-09-21 02:12:37 +02:00
long startupMemTotal = Runtime . getRuntime ( ) . totalMemory ( ) ; // the total amount of memory in the Java virtual machine; may vary over time
2006-08-18 03:33:54 +02:00
serverMemory . available ( ) ; // force initialization of class serverMemory
2005-10-12 14:34:08 +02:00
// go into headless awt mode
System . setProperty ( " java.awt.headless " , " true " ) ;
2006-02-18 13:00:13 +01:00
//which XML Parser?
2006-07-26 16:26:45 +02:00
// if(System.getProperty("javax.xml.parsers.DocumentBuilderFactory")==null){
// System.setProperty("javax.xml.parsers.DocumentBuilderFactory", "org.apache.crimson.jaxp.DocumentBuilderFactoryImpl");
// }
// if(System.getProperty("javax.xml.parsers.SAXParserFactory")==null){
// System.setProperty("javax.xml.parsers.SAXParserFactory", "org.apache.crimson.jaxp.SAXParserFactoryImpl");
// }
2005-09-21 14:21:01 +02:00
2005-09-01 17:27:41 +02:00
String applicationRoot = System . getProperty ( " user.dir " ) . replace ( '\\' , '/' ) ;
2005-07-01 01:19:08 +02:00
//System.out.println("args.length=" + args.length);
//System.out.print("args=["); for (int i = 0; i < args.length; i++) System.out.print(args[i] + ", "); System.out.println("]");
2006-07-19 13:20:22 +02:00
if ( ( args . length > = 1 ) & & ( ( args [ 0 ] . toLowerCase ( ) . equals ( " -startup " ) ) | | ( args [ 0 ] . equals ( " -start " ) ) ) ) {
2005-04-07 21:19:42 +02:00
// normal start-up of yacy
if ( args . length = = 2 ) applicationRoot = args [ 1 ] ;
2005-09-21 14:21:01 +02:00
startup ( applicationRoot , startupMemFree , startupMemTotal ) ;
2006-07-19 13:20:22 +02:00
} else if ( ( args . length > = 1 ) & & ( ( args [ 0 ] . toLowerCase ( ) . equals ( " -shutdown " ) ) | | ( args [ 0 ] . equals ( " -stop " ) ) ) ) {
2005-04-07 21:19:42 +02:00
// normal shutdown of yacy
if ( args . length = = 2 ) applicationRoot = args [ 1 ] ;
shutdown ( applicationRoot ) ;
2006-07-19 13:20:22 +02:00
} else if ( ( args . length > = 1 ) & & ( args [ 0 ] . toLowerCase ( ) . equals ( " -migratewords " ) ) ) {
2006-12-05 03:47:51 +01:00
// migrate words from DATA/PLASMADB/WORDS path to collection index
2005-06-15 03:22:07 +02:00
// attention: this may run long and should not be interrupted!
if ( args . length = = 2 ) applicationRoot = args [ 1 ] ;
migrateWords ( applicationRoot ) ;
2006-12-05 03:47:51 +01:00
} else if ( ( args . length > = 1 ) & & ( args [ 0 ] . toLowerCase ( ) . equals ( " -migrateassortments " ) ) ) {
// migrate assortments from DATA/PLASMADB/ACLUSTER path to collection index
// attention: this may run long and should not be interrupted!
if ( args . length = = 2 ) applicationRoot = args [ 1 ] ;
migrateAssortments ( applicationRoot ) ;
2006-07-19 13:20:22 +02:00
} else if ( ( args . length > = 1 ) & & ( args [ 0 ] . toLowerCase ( ) . equals ( " -minimizeurldb " ) ) ) {
2005-10-05 12:45:33 +02:00
// migrate words from DATA/PLASMADB/WORDS path to assortment cache, if possible
// attention: this may run long and should not be interrupted!
2006-04-20 10:20:12 +02:00
int dbcache = 4 ;
2006-07-19 13:20:22 +02:00
if ( args . length > = 3 & & args [ 1 ] . toLowerCase ( ) . equals ( " -cache " ) ) {
2006-04-20 10:20:12 +02:00
dbcache = Integer . parseInt ( args [ 2 ] ) ;
args = shift ( args , 1 , 2 ) ;
}
2005-10-05 12:45:33 +02:00
if ( args . length = = 2 ) applicationRoot = args [ 1 ] ;
2006-04-20 10:20:12 +02:00
minimizeUrlDB ( applicationRoot , dbcache ) ;
2006-07-19 13:20:22 +02:00
} else if ( ( args . length > = 1 ) & & ( args [ 0 ] . toLowerCase ( ) . equals ( " -testpeerdb " ) ) ) {
2006-01-30 09:28:22 +01:00
if ( args . length = = 2 ) {
applicationRoot = args [ 1 ] ;
} else if ( args . length > 2 ) {
System . err . println ( " Usage: -testPeerDB [homeDbRoot] " ) ;
}
testPeerDB ( applicationRoot ) ;
2006-07-19 13:20:22 +02:00
} else if ( ( args . length > = 1 ) & & ( args [ 0 ] . toLowerCase ( ) . equals ( " -deletestopwords " ) ) ) {
2005-04-07 21:19:42 +02:00
// delete those words in the index that are listed in the stopwords file
if ( args . length = = 2 ) applicationRoot = args [ 1 ] ;
deleteStopwords ( applicationRoot ) ;
2006-07-19 13:20:22 +02:00
} else if ( ( args . length > = 1 ) & & ( args [ 0 ] . toLowerCase ( ) . equals ( " -genwordstat " ) ) ) {
2005-04-07 21:19:42 +02:00
// this can help to create a stop-word list
2005-05-11 11:44:36 +02:00
// to use this, you need a 'yacy.words' file in the root path
// start this with "java -classpath classes yacy -genwordstat [<rootdir>]"
2005-04-07 21:19:42 +02:00
if ( args . length = = 2 ) applicationRoot = args [ 1 ] ;
genWordstat ( applicationRoot ) ;
2006-07-19 13:20:22 +02:00
} else if ( ( args . length = = 4 ) & & ( args [ 0 ] . toLowerCase ( ) . equals ( " -cleanwordlist " ) ) ) {
2005-04-07 21:19:42 +02:00
// this can be used to organize and clean a word-list
2005-05-11 11:44:36 +02:00
// start this with "java -classpath classes yacy -cleanwordlist <word-file> <minlength> <maxlength>"
2005-04-07 21:19:42 +02:00
int minlength = Integer . parseInt ( args [ 2 ] ) ;
int maxlength = Integer . parseInt ( args [ 3 ] ) ;
cleanwordlist ( args [ 1 ] , minlength , maxlength ) ;
2006-07-19 13:20:22 +02:00
} else if ( ( args . length > = 1 ) & & ( args [ 0 ] . toLowerCase ( ) . equals ( " -transfercr " ) ) ) {
2005-11-11 00:48:20 +01:00
// transfer a single cr file to a remote peer
2005-11-21 02:30:30 +01:00
String targetaddress = args [ 1 ] ;
String crfile = args [ 2 ] ;
transferCR ( targetaddress , crfile ) ;
2006-07-19 13:20:22 +02:00
} else if ( ( args . length > = 1 ) & & ( args [ 0 ] . toLowerCase ( ) . equals ( " -domlist " ) ) ) {
2005-11-21 02:30:30 +01:00
// generate a url list and save it in a file
2006-07-24 10:08:33 +02:00
String source = " lurl " ;
if ( args . length > = 3 & & args [ 1 ] . toLowerCase ( ) . equals ( " -source " ) ) {
2006-07-24 18:40:59 +02:00
if ( ( args [ 2 ] . equals ( " nurl " ) ) | |
( args [ 2 ] . equals ( " lurl " ) ) | |
2006-07-24 10:08:33 +02:00
( args [ 2 ] . equals ( " eurl " ) ) )
source = args [ 2 ] ;
args = shift ( args , 1 , 2 ) ;
}
2005-12-17 22:19:51 +01:00
String format = " txt " ;
2006-07-19 13:20:22 +02:00
if ( args . length > = 3 & & args [ 1 ] . toLowerCase ( ) . equals ( " -format " ) ) {
2006-07-24 10:08:33 +02:00
if ( ( args [ 2 ] . equals ( " html " ) ) | |
( args [ 2 ] . equals ( " zip " ) ) | |
( args [ 2 ] . equals ( " gzip " ) ) )
format = args [ 2 ] ;
2005-12-07 02:40:52 +01:00
args = shift ( args , 1 , 2 ) ;
}
if ( args . length = = 2 ) applicationRoot = args [ 1 ] ;
2006-07-24 18:40:59 +02:00
String outfile = " domlist_ " + source + " _ " + System . currentTimeMillis ( ) ;
2006-07-24 10:08:33 +02:00
domlist ( applicationRoot , source , format , outfile ) ;
2006-07-19 13:20:22 +02:00
} else if ( ( args . length > = 1 ) & & ( args [ 0 ] . toLowerCase ( ) . equals ( " -urllist " ) ) ) {
2005-12-07 02:40:52 +01:00
// generate a url list and save it in a file
2006-07-24 10:08:33 +02:00
String source = " lurl " ;
if ( args . length > = 3 & & args [ 1 ] . toLowerCase ( ) . equals ( " -source " ) ) {
2006-07-24 18:40:59 +02:00
if ( ( args [ 2 ] . equals ( " nurl " ) ) | |
( args [ 2 ] . equals ( " lurl " ) ) | |
2006-07-24 10:08:33 +02:00
( args [ 2 ] . equals ( " eurl " ) ) )
source = args [ 2 ] ;
args = shift ( args , 1 , 2 ) ;
}
2005-12-07 02:40:52 +01:00
boolean html = false ;
2006-07-19 13:20:22 +02:00
if ( args . length > = 3 & & args [ 1 ] . toLowerCase ( ) . equals ( " -format " ) ) {
2005-12-07 02:40:52 +01:00
if ( args [ 2 ] . equals ( " html " ) ) html = true ;
args = shift ( args , 1 , 2 ) ;
}
2005-11-21 02:30:30 +01:00
if ( args . length = = 2 ) applicationRoot = args [ 1 ] ;
2006-07-24 18:40:59 +02:00
String outfile = " urllist_ " + source + " _ " + System . currentTimeMillis ( ) + ( ( html ) ? " .html " : " .txt " ) ;
2006-07-24 10:08:33 +02:00
urllist ( applicationRoot , source , html , outfile ) ;
2006-10-13 01:14:41 +02:00
} else if ( ( args . length > = 1 ) & & ( args [ 0 ] . toLowerCase ( ) . equals ( " -migratelurls " ) ) ) {
2006-10-19 23:14:37 +02:00
File root = new File ( applicationRoot ) ;
2006-12-05 03:47:51 +01:00
migratelurls ( applicationRoot , new File ( root , " DATA/PLASMADB/urlHash.db " ) ) ;
2006-07-19 13:20:22 +02:00
} else if ( ( args . length > = 1 ) & & ( args [ 0 ] . toLowerCase ( ) . equals ( " -urldbcleanup " ) ) ) {
2005-12-07 12:10:08 +01:00
// generate a url list and save it in a file
if ( args . length = = 2 ) applicationRoot = args [ 1 ] ;
urldbcleanup ( applicationRoot ) ;
2006-07-19 13:20:22 +02:00
} else if ( ( args . length > = 1 ) & & ( args [ 0 ] . toLowerCase ( ) . equals ( " -rwihashlist " ) ) ) {
2006-01-04 14:55:45 +01:00
// generate a url list and save it in a file
2006-01-10 02:04:22 +01:00
String domain = " all " ;
2006-01-15 11:29:48 +01:00
String format = " txt " ;
2006-01-10 02:04:22 +01:00
if ( args . length > = 2 ) domain = args [ 1 ] ;
2006-01-15 11:29:48 +01:00
if ( args . length > = 3 ) format = args [ 2 ] ;
if ( args . length = = 4 ) applicationRoot = args [ 3 ] ;
2006-01-04 14:55:45 +01:00
String outfile = " rwihashlist_ " + System . currentTimeMillis ( ) ;
2006-01-15 11:29:48 +01:00
RWIHashList ( applicationRoot , outfile , domain , format ) ;
2005-04-07 21:19:42 +02:00
} else {
if ( args . length = = 1 ) applicationRoot = args [ 0 ] ;
2005-09-21 14:21:01 +02:00
startup ( applicationRoot , startupMemFree , startupMemTotal ) ;
2005-04-07 21:19:42 +02:00
}
}
}
2005-05-11 11:44:36 +02:00
2005-08-02 21:40:29 +02:00
/ * *
* This class is a helper class whose instance is started , when the java virtual
* machine shuts down . Signals the plasmaSwitchboard to shut down .
* /
2005-07-03 14:40:36 +02:00
class shutdownHookThread extends Thread {
2005-09-21 02:12:37 +02:00
private plasmaSwitchboard sb = null ;
2005-05-11 11:44:36 +02:00
private Thread mainThread = null ;
2005-08-02 21:40:29 +02:00
2005-09-21 02:12:37 +02:00
public shutdownHookThread ( Thread mainThread , plasmaSwitchboard sb ) {
2005-09-20 17:36:22 +02:00
super ( ) ;
2005-09-21 02:12:37 +02:00
this . sb = sb ;
2005-05-11 11:44:36 +02:00
this . mainThread = mainThread ;
}
2005-08-02 21:40:29 +02:00
2005-07-03 14:40:36 +02:00
public void run ( ) {
2005-05-11 11:44:36 +02:00
try {
2005-09-21 02:12:37 +02:00
if ( ! this . sb . isTerminated ( ) ) {
2005-08-30 23:10:39 +02:00
serverLog . logConfig ( " SHUTDOWN " , " Shutdown via shutdown hook. " ) ;
2005-08-02 21:40:29 +02:00
2005-05-11 11:44:36 +02:00
// sending the yacy main thread a shutdown signal
2006-02-01 12:03:37 +01:00
serverLog . logFine ( " SHUTDOWN " , " Signaling shutdown to the switchboard. " ) ;
2005-09-21 02:12:37 +02:00
this . sb . terminate ( ) ;
2005-08-02 21:40:29 +02:00
2005-05-11 11:44:36 +02:00
// waiting for the yacy thread to finish execution
2006-02-01 12:03:37 +01:00
serverLog . logFine ( " SHUTDOWN " , " Waiting for main thread to finish. " ) ;
2006-09-20 12:13:23 +02:00
if ( this . mainThread . isAlive ( ) & & ! this . sb . isTerminated ( ) ) {
this . mainThread . join ( ) ;
}
2005-05-11 11:44:36 +02:00
}
} catch ( Exception e ) {
2005-08-30 23:32:59 +02:00
serverLog . logSevere ( " SHUTDOWN " , " Unexpected error. " + e . getClass ( ) . getName ( ) , e ) ;
2005-05-11 11:44:36 +02:00
}
}
2005-09-22 20:54:36 +02:00
}