yacy_search_server/source/net/yacy/yacy.java

1083 lines
51 KiB
Java
Raw Normal View History

package net.yacy;
// yacy.java
// -----------------------
// (C) by Michael Peter Christen; mc@yacy.net
// first published on http://www.yacy.net
// Frankfurt, Germany, 2004, 2005
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.io.RandomAccessFile;
import java.nio.channels.FileChannel;
import java.nio.channels.FileLock;
import java.util.Iterator;
import java.util.Map;
import java.util.Properties;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.Semaphore;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.ASCII;
2011-12-15 15:15:53 +01:00
import net.yacy.cora.document.UTF8;
import net.yacy.cora.lod.JenaTripleStore;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.cora.sorting.Array;
2011-12-16 23:59:29 +01:00
import net.yacy.cora.sorting.OrderedScoreMap;
import net.yacy.cora.sorting.ScoreMap;
import net.yacy.gui.YaCyApp;
import net.yacy.gui.framework.Browser;
import net.yacy.kelondro.blob.MapDataMining;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.rwi.Reference;
import net.yacy.kelondro.rwi.ReferenceContainer;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.Formatter;
import net.yacy.kelondro.util.MemoryControl;
import net.yacy.kelondro.util.OS;
import net.yacy.peers.SeedDB;
import net.yacy.peers.operation.yacyBuildProperties;
import net.yacy.peers.operation.yacyRelease;
import net.yacy.peers.operation.yacyVersion;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
import net.yacy.search.index.MetadataRepository;
import net.yacy.search.index.Segment;
import com.google.common.io.Files;
import de.anomic.data.Translator;
import de.anomic.http.server.HTTPDemon;
import de.anomic.server.serverCore;
import de.anomic.tools.enumerateFiles;
/**
* This is the main class of YaCy. Several threads are started from here:
* <ul>
* <li>one single instance of the plasmaSwitchboard is generated, which itself
* starts a thread with a plasmaHTMLCache object. This object simply counts
* files sizes in the cache and terminates them. It also generates a
* plasmaCrawlerLoader object, which may itself start some more httpc-calling
* threads to load web pages. They terminate automatically when a page has
* loaded.
* <li>one serverCore - thread is started, which implements a multi-threaded
* server. The process may start itself many more processes that handle
* connections.lo
* <li>finally, all idle-dependent processes are written in a queue in
* plasmaSwitchboard which are worked off inside an idle-sensitive loop of the
* main process. (here)
* </ul>
*
* On termination, the following must be done:
* <ul>
* <li>stop feeding of the crawling process because it otherwise fills the
* indexing queue.
* <li>say goodbye to connected peers and disable new connections. Don't wait for
* success.
* <li>first terminate the serverCore thread. This prevents that new cache
* objects are queued.
* <li>wait that the plasmaHTMLCache terminates (it should be normal that this
* process already has terminated).
* <li>then wait for termination of all loader process of the
* plasmaCrawlerLoader.
* <li>work off the indexing and cache storage queue. These values are inside a
* RAM cache and would be lost otherwise.
* <li>write all settings.
* <li>terminate.
* </ul>
*/
public final class yacy {
// static objects
public static final String vString = yacyBuildProperties.getVersion();
public static float version = 0.1f;
public static final String vDATE = yacyBuildProperties.getBuildDate();
public static final String copyright = "[ YaCy v" + vString + ", build " + vDATE + " by Michael Christen / www.yacy.net ]";
public static final String hline = "-------------------------------------------------------------------------------";
public static final Semaphore shutdownSemaphore = new Semaphore(0);
/**
* a reference to the {@link Switchboard} created by the
* {@link yacy#startup(String, long, long)} method.
*/
private static Switchboard sb = null;
public static String homedir;
public static File dataHome_g;
/**
* Starts up the whole application. Sets up all datastructures and starts
* the main threads.
*
* @param homePath Root-path where all information is to be found.
* @param startupFree free memory at startup time, to be used later for statistics
*/
private static void startup(final File dataHome, final File appHome, final long startupMemFree, final long startupMemTotal, final boolean gui) {
try {
// start up
System.out.println(copyright);
System.out.println(hline);
// check java version
try {
"a".codePointAt(0); // needs at least Java 1.5
} catch (final NoSuchMethodError e) {
System.err.println("STARTUP: Java Version too low. You need at least Java 1.5 to run YaCy"); // TODO: is 1.6 now
Thread.sleep(3000);
System.exit(-1);
}
// ensure that there is a DATA directory, if not, create one and if that fails warn and die
mkdirsIfNeseccary(dataHome);
dataHome_g = dataHome;
mkdirsIfNeseccary(appHome);
File f = new File(dataHome, "DATA/");
mkdirsIfNeseccary(f);
if (!(f.exists())) {
System.err.println("Error creating DATA-directory in " + dataHome.toString() + " . Please check your write-permission for this folder. YaCy will now terminate.");
System.exit(-1);
}
homedir = appHome.toString();
// setting up logging
f = new File(dataHome, "DATA/LOG/");
mkdirsIfNeseccary(f);
f = new File(dataHome, "DATA/LOG/yacy.logging");
final File f0 = new File(appHome, "defaults/yacy.logging");
if (!f.exists() || f0.lastModified() > f.lastModified()) try {
Files.copy(f0, f);
} catch (final IOException e){
System.out.println("could not copy yacy.logging");
}
try{
Log.configureLogging(dataHome, appHome, new File(dataHome, "DATA/LOG/yacy.logging"));
} catch (final IOException e) {
System.out.println("could not find logging properties in homePath=" + dataHome);
Log.logException(e);
}
Log.logConfig("STARTUP", "YaCy version: " + yacyBuildProperties.getVersion() + "/" + yacyBuildProperties.getSVNRevision());
Log.logConfig("STARTUP", "Java version: " + System.getProperty("java.version", "no-java-version"));
Log.logConfig("STARTUP", "Operation system: " + System.getProperty("os.name","unknown"));
Log.logConfig("STARTUP", "Application root-path: " + appHome);
Log.logConfig("STARTUP", "Data root-path: " + dataHome);
Log.logConfig("STARTUP", "Time zone: UTC" + GenericFormatter.UTCDiffString() + "; UTC+0000 is " + System.currentTimeMillis());
Log.logConfig("STARTUP", "Maximum file system path length: " + OS.maxPathLength);
f = new File(dataHome, "DATA/yacy.running");
if (f.exists()) { // another instance running? VM crash? User will have to care about this
Log.logSevere("STARTUP", "WARNING: the file " + f + " exists, this usually means that a YaCy instance is still running");
delete(f);
}
if(!f.createNewFile())
Log.logSevere("STARTUP", "WARNING: the file " + f + " can not be created!");
try { new FileOutputStream(f).write(Integer.toString(OS.getPID()).getBytes()); } catch (final Exception e) { } // write PID
f.deleteOnExit();
FileChannel channel = null;
FileLock lock = null;
try {
channel = new RandomAccessFile(f,"rw").getChannel();
lock = channel.tryLock(); // lock yacy.running
} catch (final Exception e) { }
final String oldconf = "DATA/SETTINGS/httpProxy.conf".replace("/", File.separator);
final String newconf = "DATA/SETTINGS/yacy.conf".replace("/", File.separator);
final File oldconffile = new File(dataHome, oldconf);
if (oldconffile.exists()) {
final File newconfFile = new File(dataHome, newconf);
if(!oldconffile.renameTo(newconfFile))
Log.logSevere("STARTUP", "WARNING: the file " + oldconffile + " can not be renamed to "+ newconfFile +"!");
}
try {
sb = new Switchboard(dataHome, appHome, "defaults/yacy.init".replace("/", File.separator), newconf);
} catch (final RuntimeException e) {
Log.logSevere("STARTUP", "YaCy cannot start: " + e.getMessage(), e);
System.exit(-1);
}
//sbSync.V(); // signal that the sb reference was set
// switch the memory strategy
MemoryControl.setStandardStrategy(sb.getConfigBool("memory.standardStrategy", true));
// save information about available memory at startup time
sb.setConfig("memoryFreeAfterStartup", startupMemFree);
sb.setConfig("memoryTotalAfterStartup", startupMemTotal);
// start gui if wanted
if (gui) YaCyApp.start("localhost", (int) sb.getConfigLong("port", 8090));
// hardcoded, forced, temporary value-migration
sb.setConfig("htTemplatePath", "htroot/env/templates");
int oldRev;
try {
oldRev = Integer.parseInt(sb.getConfig("svnRevision", "0"));
} catch (final NumberFormatException e) {
oldRev = 0;
}
final int newRev = Integer.parseInt(yacyBuildProperties.getSVNRevision());
sb.setConfig("svnRevision", yacyBuildProperties.getSVNRevision());
sb.setConfig("applicationRoot", appHome.toString());
sb.setConfig("dataRoot", dataHome.toString());
yacyVersion.latestRelease = version;
// read environment
final int timeout = Math.max(5000, Integer.parseInt(sb.getConfig("httpdTimeout", "5000")));
// create some directories
final File htRootPath = new File(appHome, sb.getConfig("htRootPath", "htroot"));
final File htDocsPath = sb.getDataPath(SwitchboardConstants.HTDOCS_PATH, SwitchboardConstants.HTDOCS_PATH_DEFAULT);
mkdirIfNeseccary(htDocsPath);
//final File htTemplatePath = new File(homePath, sb.getConfig("htTemplatePath","htdocs"));
// create default notifier picture
//TODO: Use templates instead of copying images ...
if (!((new File(htDocsPath, "notifier.gif")).exists())) try {
Files.copy(new File(htRootPath, "env/grafics/empty.gif"),
new File(htDocsPath, "notifier.gif"));
} catch (final IOException e) {}
final File htdocsReadme = new File(htDocsPath, "readme.txt");
if (!(htdocsReadme.exists())) try {FileUtils.copy((
"This is your root directory for individual Web Content\r\n" +
"\r\n" +
"Please place your html files into the www subdirectory.\r\n" +
"The URL of that path is either\r\n" +
"http://www.<your-peer-name>.yacy or\r\n" +
"http://<your-ip>:<your-port>/www\r\n" +
"\r\n" +
"Other subdirectories may be created; they map to corresponding sub-domains.\r\n" +
"This directory shares it's content with the applications htroot path, so you\r\n" +
"may access your yacy search page with\r\n" +
"http://<your-peer-name>.yacy/\r\n" +
"\r\n").getBytes(), htdocsReadme);} catch (final IOException e) {
System.out.println("Error creating htdocs readme: " + e.getMessage());
}
final File wwwDefaultPath = new File(htDocsPath, "www");
mkdirIfNeseccary(wwwDefaultPath);
final File shareDefaultPath = new File(htDocsPath, "share");
mkdirIfNeseccary(shareDefaultPath);
migration.migrate(sb, oldRev, newRev);
// delete old release files
final int deleteOldDownloadsAfterDays = (int) sb.getConfigLong("update.deleteOld", 30);
yacyRelease.deleteOldDownloads(sb.releasePath, deleteOldDownloadsAfterDays );
// set user-agent
HTTPClient.setDefaultUserAgent(ClientIdentification.getUserAgent());
// initial fill of the triplestore
File triplestore = new File(sb.getConfig("triplestore", new File(dataHome, "DATA/TRIPLESTORE").getAbsolutePath()));
mkdirIfNeseccary(triplestore);
for (String s: triplestore.list()) {
if ((s.endsWith(".rdf") || s.endsWith(".nt")) && !s.equals("local.rdf") && !s.endsWith("_triplestore.rdf") && !s.startsWith("private_store_")) {
try {
JenaTripleStore.load(new File(triplestore, s).getAbsolutePath());
} catch (IOException e) {
Log.logException(e);
}
}
}
if (sb.getConfigBool("triplestore.persistent", false)) {
File local = new File(triplestore, "local.rdf");
if (local.exists()) {
try {
JenaTripleStore.load(local.getAbsolutePath());
} catch (IOException e) {
Log.logException(e);
}
}
}
// start main threads
final String port = sb.getConfig("port", "8090");
try {
final HTTPDemon protocolHandler = new HTTPDemon(sb);
final serverCore server = new serverCore(
timeout /*control socket timeout in milliseconds*/,
true /* block attacks (wrong protocol) */,
protocolHandler /*command class*/,
sb,
30000 /*command max length incl. GET args*/);
server.setName("httpd:"+port);
server.setPriority(Thread.MAX_PRIORITY);
server.setObeyIntermission(false);
// start the server
sb.deployThread("10_httpd", "HTTPD Server/Proxy", "the HTTPD, used as web server and proxy", null, server, 0, 0, 0, 0);
//server.start();
// open the browser window
final boolean browserPopUpTrigger = sb.getConfig(SwitchboardConstants.BROWSER_POP_UP_TRIGGER, "true").equals("true");
if (browserPopUpTrigger) try {
final String browserPopUpPage = sb.getConfig(SwitchboardConstants.BROWSER_POP_UP_PAGE, "ConfigBasic.html");
//boolean properPW = (sb.getConfig("adminAccount", "").isEmpty()) && (sb.getConfig(httpd.ADMIN_ACCOUNT_B64MD5, "").length() > 0);
//if (!properPW) browserPopUpPage = "ConfigBasic.html";
Browser.openBrowser((server.withSSL()?"https":"http") + "://localhost:" + serverCore.getPortNr(port) + "/" + browserPopUpPage);
} catch (final Throwable e) {
// cannot open browser. This may be normal in headless environments
//Log.logException(e);
}
// enable browser popup, http server is ready now
sb.tray.setReady();
//regenerate Locales from Translationlist, if needed
final File locale_source = sb.getAppPath("locale.source", "locales");
final String lang = sb.getConfig("locale.language", "");
if (!lang.equals("") && !lang.equals("default")) { //locale is used
String currentRev = "";
try{
final BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(new File(sb.getDataPath("locale.translated_html", "DATA/LOCALE/htroot"), lang+"/version" ))));
currentRev = br.readLine();
br.close();
}catch(final IOException e){
//Error
}
if (!currentRev.equals(sb.getConfig("svnRevision", ""))) try { //is this another version?!
final File sourceDir = new File(sb.getConfig("htRootPath", "htroot"));
final File destDir = new File(sb.getDataPath("locale.translated_html", "DATA/LOCALE/htroot"), lang);
if (Translator.translateFilesRecursive(sourceDir, destDir, new File(locale_source, lang + ".lng"), "html,template,inc", "locale")){ //translate it
//write the new Versionnumber
final BufferedWriter bw = new BufferedWriter(new PrintWriter(new FileWriter(new File(destDir, "version"))));
bw.write(sb.getConfig("svnRevision", "Error getting Version"));
bw.close();
}
} catch (final IOException e) {}
}
// initialize number formatter with this locale
Formatter.setLocale(lang);
// registering shutdown hook
Log.logConfig("STARTUP", "Registering Shutdown Hook");
final Runtime run = Runtime.getRuntime();
run.addShutdownHook(new shutdownHookThread(Thread.currentThread(), sb));
// save information about available memory after all initializations
//try {
sb.setConfig("memoryFreeAfterInitBGC", MemoryControl.free());
sb.setConfig("memoryTotalAfterInitBGC", MemoryControl.total());
System.gc();
sb.setConfig("memoryFreeAfterInitAGC", MemoryControl.free());
sb.setConfig("memoryTotalAfterInitAGC", MemoryControl.total());
//} catch (ConcurrentModificationException e) {}
// wait for server shutdown
try {
sb.waitForShutdown();
} catch (final Exception e) {
Log.logSevere("MAIN CONTROL LOOP", "PANIC: " + e.getMessage(),e);
}
// shut down
Array.terminate();
Log.logConfig("SHUTDOWN", "caught termination signal");
server.terminate(false);
server.interrupt();
server.close();
// idle until the processes are down
if (server.isAlive()) {
server.interrupt();
}
Log.logConfig("SHUTDOWN", "server has terminated");
sb.close();
} catch (final Exception e) {
Log.logSevere("STARTUP", "Unexpected Error: " + e.getClass().getName(),e);
//System.exit(1);
}
if(lock != null) lock.release();
if(channel != null) channel.close();
} catch (final Exception ee) {
Log.logSevere("STARTUP", "FATAL ERROR: " + ee.getMessage(),ee);
} finally {
}
// save the triple store
if (sb.getConfigBool("triplestore.persistent", false)) {
JenaTripleStore.saveAll();
}
Log.logConfig("SHUTDOWN", "goodbye. (this is the last line)");
Log.shutdown();
shutdownSemaphore.release(1000);
try {
System.exit(0);
} catch (final Exception e) {} // was once stopped by de.anomic.net.ftpc$sm.checkExit(ftpc.java:1790)
}
/**
* @param f
*/
private static void delete(final File f) {
if(!f.delete())
Log.logSevere("STARTUP", "WARNING: the file " + f + " can not be deleted!");
}
/**
* @see File#mkdir()
* @param path
*/
private static void mkdirIfNeseccary(final File path) {
if (!(path.exists()))
if(!path.mkdir())
Log.logWarning("STARTUP", "could not create directory "+ path.toString());
}
/**
* @see File#mkdirs()
* @param path
*/
public static void mkdirsIfNeseccary(final File path) {
if (!(path.exists()))
if(!path.mkdirs())
Log.logWarning("STARTUP", "could not create directories "+ path.toString());
}
/**
* Loads the configuration from the data-folder.
* FIXME: Why is this called over and over again from every method, instead
* of setting the configurationdata once for this class in main?
*
* @param mes Where are we called from, so that the errormessages can be
* more descriptive.
* @param homePath Root-path where all the information is to be found.
* @return Properties read from the configurationfile.
*/
private static Properties configuration(final String mes, final File homePath) {
Log.logConfig(mes, "Application Root Path: " + homePath.toString());
// read data folder
final File dataFolder = new File(homePath, "DATA");
if (!(dataFolder.exists())) {
Log.logSevere(mes, "Application was never started or root path wrong.");
System.exit(-1);
}
final Properties config = new Properties();
FileInputStream fis = null;
try {
fis = new FileInputStream(new File(homePath, "DATA/SETTINGS/yacy.conf"));
config.load(fis);
} catch (final FileNotFoundException e) {
Log.logSevere(mes, "could not find configuration file.");
System.exit(-1);
} catch (final IOException e) {
Log.logSevere(mes, "could not read configuration file.");
System.exit(-1);
} finally {
if(fis != null) {
try {
fis.close();
} catch (final IOException e) {
Log.logException(e);
}
}
}
return config;
}
/**
* Call the shutdown-page of YaCy to tell it to shut down. This method is
* called if you start yacy with the argument -shutdown.
*
* @param homePath Root-path where all the information is to be found.
*/
public static void shutdown(final File homePath) {
// start up
System.out.println(copyright);
System.out.println(hline);
submitURL(homePath, "Steering.html?shutdown=", "Terminate YaCy");
}
public static void update(final File homePath) {
// start up
System.out.println(copyright);
System.out.println(hline);
submitURL(homePath, "ConfigUpdate_p.html?autoUpdate=", "Update YaCy to most recent version");
}
private static void submitURL(final File homePath, final String path, final String processdescription) {
final Properties config = configuration("COMMAND-STEERING", homePath);
// read port
final int port = serverCore.getPortNr(config.getProperty("port", "8090"));
// read password
String encodedPassword = (String) config.get(SwitchboardConstants.ADMIN_ACCOUNT_B64MD5);
if (encodedPassword == null) encodedPassword = ""; // not defined
// send 'wget' to web interface
final RequestHeader requestHeader = new RequestHeader();
requestHeader.put(RequestHeader.AUTHORIZATION, "realm=" + encodedPassword); // for http-authentify
// final Client con = new Client(10000, requestHeader);
final HTTPClient con = new HTTPClient();
con.setHeader(requestHeader.entrySet());
// ResponseContainer res = null;
try {
// res = con.GET("http://localhost:"+ port +"/" + path);
con.GETbytes("http://localhost:"+ port +"/" + path);
// read response
// if (res.getStatusLine().startsWith("2")) {
if (con.getStatusCode() > 199 && con.getStatusCode() < 300) {
Log.logConfig("COMMAND-STEERING", "YACY accepted steering command: " + processdescription);
// final ByteArrayOutputStream bos = new ByteArrayOutputStream(); //This is stream is not used???
// try {
// FileUtils.copyToStream(new BufferedInputStream(res.getDataAsStream()), new BufferedOutputStream(bos));
// } finally {
// res.closeStream();
// }
} else {
// Log.logSevere("COMMAND-STEERING", "error response from YACY socket: " + res.getStatusLine());
Log.logSevere("COMMAND-STEERING", "error response from YACY socket: " + con.getHttpResponse().getStatusLine());
System.exit(-1);
}
} catch (final IOException e) {
Log.logSevere("COMMAND-STEERING", "could not establish connection to YACY socket: " + e.getMessage());
System.exit(-1);
// } finally {
// // release connection
// if(res != null) {
// res.closeStream();
// }
}
try {
HTTPClient.closeConnectionManager();
} catch (final InterruptedException e) {
e.printStackTrace();
}
// finished
Log.logConfig("COMMAND-STEERING", "SUCCESSFULLY FINISHED COMMAND: " + processdescription);
}
/**
* This method gets all found words and outputs a statistic about the score
* of the words. The output of this method can be used to create stop-word
* lists. This method will be called if you start yacy with the argument
* -genwordstat.
* FIXME: How can stop-word list be created from this output? What type of
* score is output?
*
* @param homePath Root-Path where all the information is to be found.
*/
private static void genWordstat(final File homePath) {
// start up
System.out.println(copyright);
System.out.println(hline);
// load words
Log.logInfo("GEN-WORDSTAT", "loading words...");
final TreeMap<byte[], String> words = loadWordMap(new File(homePath, "yacy.words"));
// find all hashes
Log.logInfo("GEN-WORDSTAT", "searching all word-hash databases...");
final File dbRoot = new File(homePath, "DATA/INDEX/freeworld/");
final enumerateFiles ef = new enumerateFiles(new File(dbRoot, "WORDS"), true, false, true, true);
File f;
byte[] h;
final ScoreMap<byte[]> hs = new OrderedScoreMap<byte[]>(Base64Order.standardCoder);
while (ef.hasMoreElements()) {
f = ef.nextElement();
h = f.getName().substring(0, Word.commonHashLength).getBytes();
hs.inc(h, (int) f.length());
}
// list the hashes in reverse order
Log.logInfo("GEN-WORDSTAT", "listing words in reverse size order...");
String w;
final Iterator<byte[]> i = hs.keys(false);
while (i.hasNext()) {
h = i.next();
w = words.get(h);
if (w == null) System.out.print("# " + h); else System.out.print(w);
System.out.println(" - " + hs.get(h));
}
// finished
Log.logConfig("GEN-WORDSTAT", "FINISHED");
}
/**
* @param homePath path to the YaCy directory
* @param networkName
*/
public static void minimizeUrlDB(final File dataHome, final File appHome, final String networkName) {
// run with "java -classpath classes yacy -minimizeUrlDB"
try {Log.configureLogging(dataHome, appHome, new File(dataHome, "DATA/LOG/yacy.logging"));} catch (final Exception e) {}
final File indexPrimaryRoot = new File(dataHome, "DATA/INDEX");
final File indexRoot2 = new File(dataHome, "DATA/INDEX2");
final Log log = new Log("URL-CLEANUP");
*) Asynchronous queuing of crawl job URLs (stackCrawl) various checks like the blacklist check or the robots.txt disallow check are now done by a separate thread to unburden the indexer thread(s) TODO: maybe we have to introduce a threadpool here if it turn out that this single thread is a bottleneck because of the time consuming robots.txt downloads *) improved index transfer The index selection and transmission is done in parallel now to improve index transfer performance. TODO: maybe we could speed up performance by unsing multiple transmission threads in parallel instead of only a single one. *) gzip encoded post requests it is now configureable if a gzip encoded post request should be send on intex transfer/distribution *) storage Peer (very experimentell and not optimized yet) Now it's possible to send the result of the yacy indexer thread to a remote peer istead of storing the indexed words locally. This could be done by setting the property "storagePeerHash" in the yacy config file - Please note that if the index transfer fails, the index ist stored locally. - TODO: currently this index transfer is done by the indexer thread. To seedup the indexer a) this transmission should be done in parallel and b) multiple chunks should be bundled and transfered together *) general performance improvements - better memory cleanup after http request processing has finished - replacing some string concatenations with stringBuffers - replacing BufferedInputStreams with serverByteBuffer - replacing vectors with arraylists wherever possible - replacing hashtables with hashmaps wherever possible This was done because function calls to verctor or hashtable functions take 3 time longer than calls to functions of arraylists or hashmaps. TODO: we should take a look on the class serverObject which is inherited from hashmap Do we realy need a synchronization for this class? TODO: replace arraylists with linkedLists if random access to the list elements is not needed *) Robots Parser supports if-modified-since downloads now If the downloaded robots.txt file is older than 7 days the robots parser tries to download the robots.txt with the if-modified-since header to avoid unnecessary downloads if the file was not changed. Additionally the ETag header is used to detect changes. *) Crawler: better handling of unsupported mimeTypes + FileExtension *) Bugfix: plasmaWordIndexEntity was not closed correctly in - query.java - plasmaswitchboard.java *) function minimizeUrlDB added to yacy.java this function tests the current urlHashDB for unused urls ATTENTION: please don't use this function at the moment because it causes the wordIndexDB to flush all words into the word directory! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@853 6c8d7289-2bf4-0310-a012-ef5d649a1542
2005-10-05 12:45:33 +02:00
try {
log.logInfo("STARTING URL CLEANUP");
*) Asynchronous queuing of crawl job URLs (stackCrawl) various checks like the blacklist check or the robots.txt disallow check are now done by a separate thread to unburden the indexer thread(s) TODO: maybe we have to introduce a threadpool here if it turn out that this single thread is a bottleneck because of the time consuming robots.txt downloads *) improved index transfer The index selection and transmission is done in parallel now to improve index transfer performance. TODO: maybe we could speed up performance by unsing multiple transmission threads in parallel instead of only a single one. *) gzip encoded post requests it is now configureable if a gzip encoded post request should be send on intex transfer/distribution *) storage Peer (very experimentell and not optimized yet) Now it's possible to send the result of the yacy indexer thread to a remote peer istead of storing the indexed words locally. This could be done by setting the property "storagePeerHash" in the yacy config file - Please note that if the index transfer fails, the index ist stored locally. - TODO: currently this index transfer is done by the indexer thread. To seedup the indexer a) this transmission should be done in parallel and b) multiple chunks should be bundled and transfered together *) general performance improvements - better memory cleanup after http request processing has finished - replacing some string concatenations with stringBuffers - replacing BufferedInputStreams with serverByteBuffer - replacing vectors with arraylists wherever possible - replacing hashtables with hashmaps wherever possible This was done because function calls to verctor or hashtable functions take 3 time longer than calls to functions of arraylists or hashmaps. TODO: we should take a look on the class serverObject which is inherited from hashmap Do we realy need a synchronization for this class? TODO: replace arraylists with linkedLists if random access to the list elements is not needed *) Robots Parser supports if-modified-since downloads now If the downloaded robots.txt file is older than 7 days the robots parser tries to download the robots.txt with the if-modified-since header to avoid unnecessary downloads if the file was not changed. Additionally the ETag header is used to detect changes. *) Crawler: better handling of unsupported mimeTypes + FileExtension *) Bugfix: plasmaWordIndexEntity was not closed correctly in - query.java - plasmaswitchboard.java *) function minimizeUrlDB added to yacy.java this function tests the current urlHashDB for unused urls ATTENTION: please don't use this function at the moment because it causes the wordIndexDB to flush all words into the word directory! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@853 6c8d7289-2bf4-0310-a012-ef5d649a1542
2005-10-05 12:45:33 +02:00
// db containing all currently loades urls
final MetadataRepository currentUrlDB = new MetadataRepository(new File(new File(indexPrimaryRoot, networkName), "TEXT"), "text.urlmd", false, false);
*) Asynchronous queuing of crawl job URLs (stackCrawl) various checks like the blacklist check or the robots.txt disallow check are now done by a separate thread to unburden the indexer thread(s) TODO: maybe we have to introduce a threadpool here if it turn out that this single thread is a bottleneck because of the time consuming robots.txt downloads *) improved index transfer The index selection and transmission is done in parallel now to improve index transfer performance. TODO: maybe we could speed up performance by unsing multiple transmission threads in parallel instead of only a single one. *) gzip encoded post requests it is now configureable if a gzip encoded post request should be send on intex transfer/distribution *) storage Peer (very experimentell and not optimized yet) Now it's possible to send the result of the yacy indexer thread to a remote peer istead of storing the indexed words locally. This could be done by setting the property "storagePeerHash" in the yacy config file - Please note that if the index transfer fails, the index ist stored locally. - TODO: currently this index transfer is done by the indexer thread. To seedup the indexer a) this transmission should be done in parallel and b) multiple chunks should be bundled and transfered together *) general performance improvements - better memory cleanup after http request processing has finished - replacing some string concatenations with stringBuffers - replacing BufferedInputStreams with serverByteBuffer - replacing vectors with arraylists wherever possible - replacing hashtables with hashmaps wherever possible This was done because function calls to verctor or hashtable functions take 3 time longer than calls to functions of arraylists or hashmaps. TODO: we should take a look on the class serverObject which is inherited from hashmap Do we realy need a synchronization for this class? TODO: replace arraylists with linkedLists if random access to the list elements is not needed *) Robots Parser supports if-modified-since downloads now If the downloaded robots.txt file is older than 7 days the robots parser tries to download the robots.txt with the if-modified-since header to avoid unnecessary downloads if the file was not changed. Additionally the ETag header is used to detect changes. *) Crawler: better handling of unsupported mimeTypes + FileExtension *) Bugfix: plasmaWordIndexEntity was not closed correctly in - query.java - plasmaswitchboard.java *) function minimizeUrlDB added to yacy.java this function tests the current urlHashDB for unused urls ATTENTION: please don't use this function at the moment because it causes the wordIndexDB to flush all words into the word directory! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@853 6c8d7289-2bf4-0310-a012-ef5d649a1542
2005-10-05 12:45:33 +02:00
// db used to hold all neede urls
final MetadataRepository minimizedUrlDB = new MetadataRepository(new File(new File(indexRoot2, networkName), "TEXT"), "text.urlmd", false, false);
final int cacheMem = (int)(MemoryControl.maxMemory() - MemoryControl.total());
if (cacheMem < 2048000) throw new OutOfMemoryError("Not enough memory available to start clean up.");
final Segment wordIndex = new Segment(
log,
new File(new File(indexPrimaryRoot, "freeworld"), "TEXT"),
10000,
Integer.MAX_VALUE, false, false);
final Iterator<ReferenceContainer<WordReference>> indexContainerIterator = wordIndex.termIndex().referenceContainerIterator("AAAAAAAAAAAA".getBytes(), false, false);
*) Asynchronous queuing of crawl job URLs (stackCrawl) various checks like the blacklist check or the robots.txt disallow check are now done by a separate thread to unburden the indexer thread(s) TODO: maybe we have to introduce a threadpool here if it turn out that this single thread is a bottleneck because of the time consuming robots.txt downloads *) improved index transfer The index selection and transmission is done in parallel now to improve index transfer performance. TODO: maybe we could speed up performance by unsing multiple transmission threads in parallel instead of only a single one. *) gzip encoded post requests it is now configureable if a gzip encoded post request should be send on intex transfer/distribution *) storage Peer (very experimentell and not optimized yet) Now it's possible to send the result of the yacy indexer thread to a remote peer istead of storing the indexed words locally. This could be done by setting the property "storagePeerHash" in the yacy config file - Please note that if the index transfer fails, the index ist stored locally. - TODO: currently this index transfer is done by the indexer thread. To seedup the indexer a) this transmission should be done in parallel and b) multiple chunks should be bundled and transfered together *) general performance improvements - better memory cleanup after http request processing has finished - replacing some string concatenations with stringBuffers - replacing BufferedInputStreams with serverByteBuffer - replacing vectors with arraylists wherever possible - replacing hashtables with hashmaps wherever possible This was done because function calls to verctor or hashtable functions take 3 time longer than calls to functions of arraylists or hashmaps. TODO: we should take a look on the class serverObject which is inherited from hashmap Do we realy need a synchronization for this class? TODO: replace arraylists with linkedLists if random access to the list elements is not needed *) Robots Parser supports if-modified-since downloads now If the downloaded robots.txt file is older than 7 days the robots parser tries to download the robots.txt with the if-modified-since header to avoid unnecessary downloads if the file was not changed. Additionally the ETag header is used to detect changes. *) Crawler: better handling of unsupported mimeTypes + FileExtension *) Bugfix: plasmaWordIndexEntity was not closed correctly in - query.java - plasmaswitchboard.java *) function minimizeUrlDB added to yacy.java this function tests the current urlHashDB for unused urls ATTENTION: please don't use this function at the moment because it causes the wordIndexDB to flush all words into the word directory! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@853 6c8d7289-2bf4-0310-a012-ef5d649a1542
2005-10-05 12:45:33 +02:00
long urlCounter = 0, wordCounter = 0;
long wordChunkStart = System.currentTimeMillis(), wordChunkEnd = 0;
String wordChunkStartHash = "AAAAAAAAAAAA", wordChunkEndHash;
while (indexContainerIterator.hasNext()) {
ReferenceContainer<WordReference> wordIdxContainer = null;
*) Asynchronous queuing of crawl job URLs (stackCrawl) various checks like the blacklist check or the robots.txt disallow check are now done by a separate thread to unburden the indexer thread(s) TODO: maybe we have to introduce a threadpool here if it turn out that this single thread is a bottleneck because of the time consuming robots.txt downloads *) improved index transfer The index selection and transmission is done in parallel now to improve index transfer performance. TODO: maybe we could speed up performance by unsing multiple transmission threads in parallel instead of only a single one. *) gzip encoded post requests it is now configureable if a gzip encoded post request should be send on intex transfer/distribution *) storage Peer (very experimentell and not optimized yet) Now it's possible to send the result of the yacy indexer thread to a remote peer istead of storing the indexed words locally. This could be done by setting the property "storagePeerHash" in the yacy config file - Please note that if the index transfer fails, the index ist stored locally. - TODO: currently this index transfer is done by the indexer thread. To seedup the indexer a) this transmission should be done in parallel and b) multiple chunks should be bundled and transfered together *) general performance improvements - better memory cleanup after http request processing has finished - replacing some string concatenations with stringBuffers - replacing BufferedInputStreams with serverByteBuffer - replacing vectors with arraylists wherever possible - replacing hashtables with hashmaps wherever possible This was done because function calls to verctor or hashtable functions take 3 time longer than calls to functions of arraylists or hashmaps. TODO: we should take a look on the class serverObject which is inherited from hashmap Do we realy need a synchronization for this class? TODO: replace arraylists with linkedLists if random access to the list elements is not needed *) Robots Parser supports if-modified-since downloads now If the downloaded robots.txt file is older than 7 days the robots parser tries to download the robots.txt with the if-modified-since header to avoid unnecessary downloads if the file was not changed. Additionally the ETag header is used to detect changes. *) Crawler: better handling of unsupported mimeTypes + FileExtension *) Bugfix: plasmaWordIndexEntity was not closed correctly in - query.java - plasmaswitchboard.java *) function minimizeUrlDB added to yacy.java this function tests the current urlHashDB for unused urls ATTENTION: please don't use this function at the moment because it causes the wordIndexDB to flush all words into the word directory! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@853 6c8d7289-2bf4-0310-a012-ef5d649a1542
2005-10-05 12:45:33 +02:00
try {
wordCounter++;
wordIdxContainer = indexContainerIterator.next();
*) Asynchronous queuing of crawl job URLs (stackCrawl) various checks like the blacklist check or the robots.txt disallow check are now done by a separate thread to unburden the indexer thread(s) TODO: maybe we have to introduce a threadpool here if it turn out that this single thread is a bottleneck because of the time consuming robots.txt downloads *) improved index transfer The index selection and transmission is done in parallel now to improve index transfer performance. TODO: maybe we could speed up performance by unsing multiple transmission threads in parallel instead of only a single one. *) gzip encoded post requests it is now configureable if a gzip encoded post request should be send on intex transfer/distribution *) storage Peer (very experimentell and not optimized yet) Now it's possible to send the result of the yacy indexer thread to a remote peer istead of storing the indexed words locally. This could be done by setting the property "storagePeerHash" in the yacy config file - Please note that if the index transfer fails, the index ist stored locally. - TODO: currently this index transfer is done by the indexer thread. To seedup the indexer a) this transmission should be done in parallel and b) multiple chunks should be bundled and transfered together *) general performance improvements - better memory cleanup after http request processing has finished - replacing some string concatenations with stringBuffers - replacing BufferedInputStreams with serverByteBuffer - replacing vectors with arraylists wherever possible - replacing hashtables with hashmaps wherever possible This was done because function calls to verctor or hashtable functions take 3 time longer than calls to functions of arraylists or hashmaps. TODO: we should take a look on the class serverObject which is inherited from hashmap Do we realy need a synchronization for this class? TODO: replace arraylists with linkedLists if random access to the list elements is not needed *) Robots Parser supports if-modified-since downloads now If the downloaded robots.txt file is older than 7 days the robots parser tries to download the robots.txt with the if-modified-since header to avoid unnecessary downloads if the file was not changed. Additionally the ETag header is used to detect changes. *) Crawler: better handling of unsupported mimeTypes + FileExtension *) Bugfix: plasmaWordIndexEntity was not closed correctly in - query.java - plasmaswitchboard.java *) function minimizeUrlDB added to yacy.java this function tests the current urlHashDB for unused urls ATTENTION: please don't use this function at the moment because it causes the wordIndexDB to flush all words into the word directory! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@853 6c8d7289-2bf4-0310-a012-ef5d649a1542
2005-10-05 12:45:33 +02:00
// the combined container will fit, read the container
final Iterator<WordReference> wordIdxEntries = wordIdxContainer.entries();
Reference iEntry;
while (wordIdxEntries.hasNext()) {
iEntry = wordIdxEntries.next();
final byte[] urlHash = iEntry.urlhash();
if ((currentUrlDB.exists(urlHash)) && (!minimizedUrlDB.exists(urlHash))) try {
final URIMetadataRow urlEntry = currentUrlDB.load(urlHash);
urlCounter++;
minimizedUrlDB.store(urlEntry);
*) Asynchronous queuing of crawl job URLs (stackCrawl) various checks like the blacklist check or the robots.txt disallow check are now done by a separate thread to unburden the indexer thread(s) TODO: maybe we have to introduce a threadpool here if it turn out that this single thread is a bottleneck because of the time consuming robots.txt downloads *) improved index transfer The index selection and transmission is done in parallel now to improve index transfer performance. TODO: maybe we could speed up performance by unsing multiple transmission threads in parallel instead of only a single one. *) gzip encoded post requests it is now configureable if a gzip encoded post request should be send on intex transfer/distribution *) storage Peer (very experimentell and not optimized yet) Now it's possible to send the result of the yacy indexer thread to a remote peer istead of storing the indexed words locally. This could be done by setting the property "storagePeerHash" in the yacy config file - Please note that if the index transfer fails, the index ist stored locally. - TODO: currently this index transfer is done by the indexer thread. To seedup the indexer a) this transmission should be done in parallel and b) multiple chunks should be bundled and transfered together *) general performance improvements - better memory cleanup after http request processing has finished - replacing some string concatenations with stringBuffers - replacing BufferedInputStreams with serverByteBuffer - replacing vectors with arraylists wherever possible - replacing hashtables with hashmaps wherever possible This was done because function calls to verctor or hashtable functions take 3 time longer than calls to functions of arraylists or hashmaps. TODO: we should take a look on the class serverObject which is inherited from hashmap Do we realy need a synchronization for this class? TODO: replace arraylists with linkedLists if random access to the list elements is not needed *) Robots Parser supports if-modified-since downloads now If the downloaded robots.txt file is older than 7 days the robots parser tries to download the robots.txt with the if-modified-since header to avoid unnecessary downloads if the file was not changed. Additionally the ETag header is used to detect changes. *) Crawler: better handling of unsupported mimeTypes + FileExtension *) Bugfix: plasmaWordIndexEntity was not closed correctly in - query.java - plasmaswitchboard.java *) function minimizeUrlDB added to yacy.java this function tests the current urlHashDB for unused urls ATTENTION: please don't use this function at the moment because it causes the wordIndexDB to flush all words into the word directory! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@853 6c8d7289-2bf4-0310-a012-ef5d649a1542
2005-10-05 12:45:33 +02:00
if (urlCounter % 500 == 0) {
log.logInfo(urlCounter + " URLs found so far.");
}
} catch (final IOException e) {}
*) Asynchronous queuing of crawl job URLs (stackCrawl) various checks like the blacklist check or the robots.txt disallow check are now done by a separate thread to unburden the indexer thread(s) TODO: maybe we have to introduce a threadpool here if it turn out that this single thread is a bottleneck because of the time consuming robots.txt downloads *) improved index transfer The index selection and transmission is done in parallel now to improve index transfer performance. TODO: maybe we could speed up performance by unsing multiple transmission threads in parallel instead of only a single one. *) gzip encoded post requests it is now configureable if a gzip encoded post request should be send on intex transfer/distribution *) storage Peer (very experimentell and not optimized yet) Now it's possible to send the result of the yacy indexer thread to a remote peer istead of storing the indexed words locally. This could be done by setting the property "storagePeerHash" in the yacy config file - Please note that if the index transfer fails, the index ist stored locally. - TODO: currently this index transfer is done by the indexer thread. To seedup the indexer a) this transmission should be done in parallel and b) multiple chunks should be bundled and transfered together *) general performance improvements - better memory cleanup after http request processing has finished - replacing some string concatenations with stringBuffers - replacing BufferedInputStreams with serverByteBuffer - replacing vectors with arraylists wherever possible - replacing hashtables with hashmaps wherever possible This was done because function calls to verctor or hashtable functions take 3 time longer than calls to functions of arraylists or hashmaps. TODO: we should take a look on the class serverObject which is inherited from hashmap Do we realy need a synchronization for this class? TODO: replace arraylists with linkedLists if random access to the list elements is not needed *) Robots Parser supports if-modified-since downloads now If the downloaded robots.txt file is older than 7 days the robots parser tries to download the robots.txt with the if-modified-since header to avoid unnecessary downloads if the file was not changed. Additionally the ETag header is used to detect changes. *) Crawler: better handling of unsupported mimeTypes + FileExtension *) Bugfix: plasmaWordIndexEntity was not closed correctly in - query.java - plasmaswitchboard.java *) function minimizeUrlDB added to yacy.java this function tests the current urlHashDB for unused urls ATTENTION: please don't use this function at the moment because it causes the wordIndexDB to flush all words into the word directory! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@853 6c8d7289-2bf4-0310-a012-ef5d649a1542
2005-10-05 12:45:33 +02:00
}
*) Asynchronous queuing of crawl job URLs (stackCrawl) various checks like the blacklist check or the robots.txt disallow check are now done by a separate thread to unburden the indexer thread(s) TODO: maybe we have to introduce a threadpool here if it turn out that this single thread is a bottleneck because of the time consuming robots.txt downloads *) improved index transfer The index selection and transmission is done in parallel now to improve index transfer performance. TODO: maybe we could speed up performance by unsing multiple transmission threads in parallel instead of only a single one. *) gzip encoded post requests it is now configureable if a gzip encoded post request should be send on intex transfer/distribution *) storage Peer (very experimentell and not optimized yet) Now it's possible to send the result of the yacy indexer thread to a remote peer istead of storing the indexed words locally. This could be done by setting the property "storagePeerHash" in the yacy config file - Please note that if the index transfer fails, the index ist stored locally. - TODO: currently this index transfer is done by the indexer thread. To seedup the indexer a) this transmission should be done in parallel and b) multiple chunks should be bundled and transfered together *) general performance improvements - better memory cleanup after http request processing has finished - replacing some string concatenations with stringBuffers - replacing BufferedInputStreams with serverByteBuffer - replacing vectors with arraylists wherever possible - replacing hashtables with hashmaps wherever possible This was done because function calls to verctor or hashtable functions take 3 time longer than calls to functions of arraylists or hashmaps. TODO: we should take a look on the class serverObject which is inherited from hashmap Do we realy need a synchronization for this class? TODO: replace arraylists with linkedLists if random access to the list elements is not needed *) Robots Parser supports if-modified-since downloads now If the downloaded robots.txt file is older than 7 days the robots parser tries to download the robots.txt with the if-modified-since header to avoid unnecessary downloads if the file was not changed. Additionally the ETag header is used to detect changes. *) Crawler: better handling of unsupported mimeTypes + FileExtension *) Bugfix: plasmaWordIndexEntity was not closed correctly in - query.java - plasmaswitchboard.java *) function minimizeUrlDB added to yacy.java this function tests the current urlHashDB for unused urls ATTENTION: please don't use this function at the moment because it causes the wordIndexDB to flush all words into the word directory! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@853 6c8d7289-2bf4-0310-a012-ef5d649a1542
2005-10-05 12:45:33 +02:00
if (wordCounter%500 == 0) {
wordChunkEndHash = ASCII.String(wordIdxContainer.getTermHash());
*) Asynchronous queuing of crawl job URLs (stackCrawl) various checks like the blacklist check or the robots.txt disallow check are now done by a separate thread to unburden the indexer thread(s) TODO: maybe we have to introduce a threadpool here if it turn out that this single thread is a bottleneck because of the time consuming robots.txt downloads *) improved index transfer The index selection and transmission is done in parallel now to improve index transfer performance. TODO: maybe we could speed up performance by unsing multiple transmission threads in parallel instead of only a single one. *) gzip encoded post requests it is now configureable if a gzip encoded post request should be send on intex transfer/distribution *) storage Peer (very experimentell and not optimized yet) Now it's possible to send the result of the yacy indexer thread to a remote peer istead of storing the indexed words locally. This could be done by setting the property "storagePeerHash" in the yacy config file - Please note that if the index transfer fails, the index ist stored locally. - TODO: currently this index transfer is done by the indexer thread. To seedup the indexer a) this transmission should be done in parallel and b) multiple chunks should be bundled and transfered together *) general performance improvements - better memory cleanup after http request processing has finished - replacing some string concatenations with stringBuffers - replacing BufferedInputStreams with serverByteBuffer - replacing vectors with arraylists wherever possible - replacing hashtables with hashmaps wherever possible This was done because function calls to verctor or hashtable functions take 3 time longer than calls to functions of arraylists or hashmaps. TODO: we should take a look on the class serverObject which is inherited from hashmap Do we realy need a synchronization for this class? TODO: replace arraylists with linkedLists if random access to the list elements is not needed *) Robots Parser supports if-modified-since downloads now If the downloaded robots.txt file is older than 7 days the robots parser tries to download the robots.txt with the if-modified-since header to avoid unnecessary downloads if the file was not changed. Additionally the ETag header is used to detect changes. *) Crawler: better handling of unsupported mimeTypes + FileExtension *) Bugfix: plasmaWordIndexEntity was not closed correctly in - query.java - plasmaswitchboard.java *) function minimizeUrlDB added to yacy.java this function tests the current urlHashDB for unused urls ATTENTION: please don't use this function at the moment because it causes the wordIndexDB to flush all words into the word directory! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@853 6c8d7289-2bf4-0310-a012-ef5d649a1542
2005-10-05 12:45:33 +02:00
wordChunkEnd = System.currentTimeMillis();
final long duration = wordChunkEnd - wordChunkStart;
*) Asynchronous queuing of crawl job URLs (stackCrawl) various checks like the blacklist check or the robots.txt disallow check are now done by a separate thread to unburden the indexer thread(s) TODO: maybe we have to introduce a threadpool here if it turn out that this single thread is a bottleneck because of the time consuming robots.txt downloads *) improved index transfer The index selection and transmission is done in parallel now to improve index transfer performance. TODO: maybe we could speed up performance by unsing multiple transmission threads in parallel instead of only a single one. *) gzip encoded post requests it is now configureable if a gzip encoded post request should be send on intex transfer/distribution *) storage Peer (very experimentell and not optimized yet) Now it's possible to send the result of the yacy indexer thread to a remote peer istead of storing the indexed words locally. This could be done by setting the property "storagePeerHash" in the yacy config file - Please note that if the index transfer fails, the index ist stored locally. - TODO: currently this index transfer is done by the indexer thread. To seedup the indexer a) this transmission should be done in parallel and b) multiple chunks should be bundled and transfered together *) general performance improvements - better memory cleanup after http request processing has finished - replacing some string concatenations with stringBuffers - replacing BufferedInputStreams with serverByteBuffer - replacing vectors with arraylists wherever possible - replacing hashtables with hashmaps wherever possible This was done because function calls to verctor or hashtable functions take 3 time longer than calls to functions of arraylists or hashmaps. TODO: we should take a look on the class serverObject which is inherited from hashmap Do we realy need a synchronization for this class? TODO: replace arraylists with linkedLists if random access to the list elements is not needed *) Robots Parser supports if-modified-since downloads now If the downloaded robots.txt file is older than 7 days the robots parser tries to download the robots.txt with the if-modified-since header to avoid unnecessary downloads if the file was not changed. Additionally the ETag header is used to detect changes. *) Crawler: better handling of unsupported mimeTypes + FileExtension *) Bugfix: plasmaWordIndexEntity was not closed correctly in - query.java - plasmaswitchboard.java *) function minimizeUrlDB added to yacy.java this function tests the current urlHashDB for unused urls ATTENTION: please don't use this function at the moment because it causes the wordIndexDB to flush all words into the word directory! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@853 6c8d7289-2bf4-0310-a012-ef5d649a1542
2005-10-05 12:45:33 +02:00
log.logInfo(wordCounter + " words scanned " +
"[" + wordChunkStartHash + " .. " + wordChunkEndHash + "]\n" +
*) Asynchronous queuing of crawl job URLs (stackCrawl) various checks like the blacklist check or the robots.txt disallow check are now done by a separate thread to unburden the indexer thread(s) TODO: maybe we have to introduce a threadpool here if it turn out that this single thread is a bottleneck because of the time consuming robots.txt downloads *) improved index transfer The index selection and transmission is done in parallel now to improve index transfer performance. TODO: maybe we could speed up performance by unsing multiple transmission threads in parallel instead of only a single one. *) gzip encoded post requests it is now configureable if a gzip encoded post request should be send on intex transfer/distribution *) storage Peer (very experimentell and not optimized yet) Now it's possible to send the result of the yacy indexer thread to a remote peer istead of storing the indexed words locally. This could be done by setting the property "storagePeerHash" in the yacy config file - Please note that if the index transfer fails, the index ist stored locally. - TODO: currently this index transfer is done by the indexer thread. To seedup the indexer a) this transmission should be done in parallel and b) multiple chunks should be bundled and transfered together *) general performance improvements - better memory cleanup after http request processing has finished - replacing some string concatenations with stringBuffers - replacing BufferedInputStreams with serverByteBuffer - replacing vectors with arraylists wherever possible - replacing hashtables with hashmaps wherever possible This was done because function calls to verctor or hashtable functions take 3 time longer than calls to functions of arraylists or hashmaps. TODO: we should take a look on the class serverObject which is inherited from hashmap Do we realy need a synchronization for this class? TODO: replace arraylists with linkedLists if random access to the list elements is not needed *) Robots Parser supports if-modified-since downloads now If the downloaded robots.txt file is older than 7 days the robots parser tries to download the robots.txt with the if-modified-since header to avoid unnecessary downloads if the file was not changed. Additionally the ETag header is used to detect changes. *) Crawler: better handling of unsupported mimeTypes + FileExtension *) Bugfix: plasmaWordIndexEntity was not closed correctly in - query.java - plasmaswitchboard.java *) function minimizeUrlDB added to yacy.java this function tests the current urlHashDB for unused urls ATTENTION: please don't use this function at the moment because it causes the wordIndexDB to flush all words into the word directory! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@853 6c8d7289-2bf4-0310-a012-ef5d649a1542
2005-10-05 12:45:33 +02:00
"Duration: "+ 500*1000/duration + " words/s" +
" | Free memory: " + MemoryControl.free() +
" | Total memory: " + MemoryControl.total());
*) Asynchronous queuing of crawl job URLs (stackCrawl) various checks like the blacklist check or the robots.txt disallow check are now done by a separate thread to unburden the indexer thread(s) TODO: maybe we have to introduce a threadpool here if it turn out that this single thread is a bottleneck because of the time consuming robots.txt downloads *) improved index transfer The index selection and transmission is done in parallel now to improve index transfer performance. TODO: maybe we could speed up performance by unsing multiple transmission threads in parallel instead of only a single one. *) gzip encoded post requests it is now configureable if a gzip encoded post request should be send on intex transfer/distribution *) storage Peer (very experimentell and not optimized yet) Now it's possible to send the result of the yacy indexer thread to a remote peer istead of storing the indexed words locally. This could be done by setting the property "storagePeerHash" in the yacy config file - Please note that if the index transfer fails, the index ist stored locally. - TODO: currently this index transfer is done by the indexer thread. To seedup the indexer a) this transmission should be done in parallel and b) multiple chunks should be bundled and transfered together *) general performance improvements - better memory cleanup after http request processing has finished - replacing some string concatenations with stringBuffers - replacing BufferedInputStreams with serverByteBuffer - replacing vectors with arraylists wherever possible - replacing hashtables with hashmaps wherever possible This was done because function calls to verctor or hashtable functions take 3 time longer than calls to functions of arraylists or hashmaps. TODO: we should take a look on the class serverObject which is inherited from hashmap Do we realy need a synchronization for this class? TODO: replace arraylists with linkedLists if random access to the list elements is not needed *) Robots Parser supports if-modified-since downloads now If the downloaded robots.txt file is older than 7 days the robots parser tries to download the robots.txt with the if-modified-since header to avoid unnecessary downloads if the file was not changed. Additionally the ETag header is used to detect changes. *) Crawler: better handling of unsupported mimeTypes + FileExtension *) Bugfix: plasmaWordIndexEntity was not closed correctly in - query.java - plasmaswitchboard.java *) function minimizeUrlDB added to yacy.java this function tests the current urlHashDB for unused urls ATTENTION: please don't use this function at the moment because it causes the wordIndexDB to flush all words into the word directory! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@853 6c8d7289-2bf4-0310-a012-ef5d649a1542
2005-10-05 12:45:33 +02:00
wordChunkStart = wordChunkEnd;
wordChunkStartHash = wordChunkEndHash;
}
// we have read all elements, now we can close it
wordIdxContainer = null;
} catch (final Exception e) {
log.logSevere("Exception", e);
*) Asynchronous queuing of crawl job URLs (stackCrawl) various checks like the blacklist check or the robots.txt disallow check are now done by a separate thread to unburden the indexer thread(s) TODO: maybe we have to introduce a threadpool here if it turn out that this single thread is a bottleneck because of the time consuming robots.txt downloads *) improved index transfer The index selection and transmission is done in parallel now to improve index transfer performance. TODO: maybe we could speed up performance by unsing multiple transmission threads in parallel instead of only a single one. *) gzip encoded post requests it is now configureable if a gzip encoded post request should be send on intex transfer/distribution *) storage Peer (very experimentell and not optimized yet) Now it's possible to send the result of the yacy indexer thread to a remote peer istead of storing the indexed words locally. This could be done by setting the property "storagePeerHash" in the yacy config file - Please note that if the index transfer fails, the index ist stored locally. - TODO: currently this index transfer is done by the indexer thread. To seedup the indexer a) this transmission should be done in parallel and b) multiple chunks should be bundled and transfered together *) general performance improvements - better memory cleanup after http request processing has finished - replacing some string concatenations with stringBuffers - replacing BufferedInputStreams with serverByteBuffer - replacing vectors with arraylists wherever possible - replacing hashtables with hashmaps wherever possible This was done because function calls to verctor or hashtable functions take 3 time longer than calls to functions of arraylists or hashmaps. TODO: we should take a look on the class serverObject which is inherited from hashmap Do we realy need a synchronization for this class? TODO: replace arraylists with linkedLists if random access to the list elements is not needed *) Robots Parser supports if-modified-since downloads now If the downloaded robots.txt file is older than 7 days the robots parser tries to download the robots.txt with the if-modified-since header to avoid unnecessary downloads if the file was not changed. Additionally the ETag header is used to detect changes. *) Crawler: better handling of unsupported mimeTypes + FileExtension *) Bugfix: plasmaWordIndexEntity was not closed correctly in - query.java - plasmaswitchboard.java *) function minimizeUrlDB added to yacy.java this function tests the current urlHashDB for unused urls ATTENTION: please don't use this function at the moment because it causes the wordIndexDB to flush all words into the word directory! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@853 6c8d7289-2bf4-0310-a012-ef5d649a1542
2005-10-05 12:45:33 +02:00
} finally {
if (wordIdxContainer != null) try { wordIdxContainer = null; } catch (final Exception e) {}
*) Asynchronous queuing of crawl job URLs (stackCrawl) various checks like the blacklist check or the robots.txt disallow check are now done by a separate thread to unburden the indexer thread(s) TODO: maybe we have to introduce a threadpool here if it turn out that this single thread is a bottleneck because of the time consuming robots.txt downloads *) improved index transfer The index selection and transmission is done in parallel now to improve index transfer performance. TODO: maybe we could speed up performance by unsing multiple transmission threads in parallel instead of only a single one. *) gzip encoded post requests it is now configureable if a gzip encoded post request should be send on intex transfer/distribution *) storage Peer (very experimentell and not optimized yet) Now it's possible to send the result of the yacy indexer thread to a remote peer istead of storing the indexed words locally. This could be done by setting the property "storagePeerHash" in the yacy config file - Please note that if the index transfer fails, the index ist stored locally. - TODO: currently this index transfer is done by the indexer thread. To seedup the indexer a) this transmission should be done in parallel and b) multiple chunks should be bundled and transfered together *) general performance improvements - better memory cleanup after http request processing has finished - replacing some string concatenations with stringBuffers - replacing BufferedInputStreams with serverByteBuffer - replacing vectors with arraylists wherever possible - replacing hashtables with hashmaps wherever possible This was done because function calls to verctor or hashtable functions take 3 time longer than calls to functions of arraylists or hashmaps. TODO: we should take a look on the class serverObject which is inherited from hashmap Do we realy need a synchronization for this class? TODO: replace arraylists with linkedLists if random access to the list elements is not needed *) Robots Parser supports if-modified-since downloads now If the downloaded robots.txt file is older than 7 days the robots parser tries to download the robots.txt with the if-modified-since header to avoid unnecessary downloads if the file was not changed. Additionally the ETag header is used to detect changes. *) Crawler: better handling of unsupported mimeTypes + FileExtension *) Bugfix: plasmaWordIndexEntity was not closed correctly in - query.java - plasmaswitchboard.java *) function minimizeUrlDB added to yacy.java this function tests the current urlHashDB for unused urls ATTENTION: please don't use this function at the moment because it causes the wordIndexDB to flush all words into the word directory! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@853 6c8d7289-2bf4-0310-a012-ef5d649a1542
2005-10-05 12:45:33 +02:00
}
}
log.logInfo("current LURL DB contains " + currentUrlDB.size() + " entries.");
log.logInfo("mimimized LURL DB contains " + minimizedUrlDB.size() + " entries.");
*) Asynchronous queuing of crawl job URLs (stackCrawl) various checks like the blacklist check or the robots.txt disallow check are now done by a separate thread to unburden the indexer thread(s) TODO: maybe we have to introduce a threadpool here if it turn out that this single thread is a bottleneck because of the time consuming robots.txt downloads *) improved index transfer The index selection and transmission is done in parallel now to improve index transfer performance. TODO: maybe we could speed up performance by unsing multiple transmission threads in parallel instead of only a single one. *) gzip encoded post requests it is now configureable if a gzip encoded post request should be send on intex transfer/distribution *) storage Peer (very experimentell and not optimized yet) Now it's possible to send the result of the yacy indexer thread to a remote peer istead of storing the indexed words locally. This could be done by setting the property "storagePeerHash" in the yacy config file - Please note that if the index transfer fails, the index ist stored locally. - TODO: currently this index transfer is done by the indexer thread. To seedup the indexer a) this transmission should be done in parallel and b) multiple chunks should be bundled and transfered together *) general performance improvements - better memory cleanup after http request processing has finished - replacing some string concatenations with stringBuffers - replacing BufferedInputStreams with serverByteBuffer - replacing vectors with arraylists wherever possible - replacing hashtables with hashmaps wherever possible This was done because function calls to verctor or hashtable functions take 3 time longer than calls to functions of arraylists or hashmaps. TODO: we should take a look on the class serverObject which is inherited from hashmap Do we realy need a synchronization for this class? TODO: replace arraylists with linkedLists if random access to the list elements is not needed *) Robots Parser supports if-modified-since downloads now If the downloaded robots.txt file is older than 7 days the robots parser tries to download the robots.txt with the if-modified-since header to avoid unnecessary downloads if the file was not changed. Additionally the ETag header is used to detect changes. *) Crawler: better handling of unsupported mimeTypes + FileExtension *) Bugfix: plasmaWordIndexEntity was not closed correctly in - query.java - plasmaswitchboard.java *) function minimizeUrlDB added to yacy.java this function tests the current urlHashDB for unused urls ATTENTION: please don't use this function at the moment because it causes the wordIndexDB to flush all words into the word directory! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@853 6c8d7289-2bf4-0310-a012-ef5d649a1542
2005-10-05 12:45:33 +02:00
currentUrlDB.close();
minimizedUrlDB.close();
wordIndex.close();
// TODO: rename the mimimized UrlDB to the name of the previous UrlDB
*) Asynchronous queuing of crawl job URLs (stackCrawl) various checks like the blacklist check or the robots.txt disallow check are now done by a separate thread to unburden the indexer thread(s) TODO: maybe we have to introduce a threadpool here if it turn out that this single thread is a bottleneck because of the time consuming robots.txt downloads *) improved index transfer The index selection and transmission is done in parallel now to improve index transfer performance. TODO: maybe we could speed up performance by unsing multiple transmission threads in parallel instead of only a single one. *) gzip encoded post requests it is now configureable if a gzip encoded post request should be send on intex transfer/distribution *) storage Peer (very experimentell and not optimized yet) Now it's possible to send the result of the yacy indexer thread to a remote peer istead of storing the indexed words locally. This could be done by setting the property "storagePeerHash" in the yacy config file - Please note that if the index transfer fails, the index ist stored locally. - TODO: currently this index transfer is done by the indexer thread. To seedup the indexer a) this transmission should be done in parallel and b) multiple chunks should be bundled and transfered together *) general performance improvements - better memory cleanup after http request processing has finished - replacing some string concatenations with stringBuffers - replacing BufferedInputStreams with serverByteBuffer - replacing vectors with arraylists wherever possible - replacing hashtables with hashmaps wherever possible This was done because function calls to verctor or hashtable functions take 3 time longer than calls to functions of arraylists or hashmaps. TODO: we should take a look on the class serverObject which is inherited from hashmap Do we realy need a synchronization for this class? TODO: replace arraylists with linkedLists if random access to the list elements is not needed *) Robots Parser supports if-modified-since downloads now If the downloaded robots.txt file is older than 7 days the robots parser tries to download the robots.txt with the if-modified-since header to avoid unnecessary downloads if the file was not changed. Additionally the ETag header is used to detect changes. *) Crawler: better handling of unsupported mimeTypes + FileExtension *) Bugfix: plasmaWordIndexEntity was not closed correctly in - query.java - plasmaswitchboard.java *) function minimizeUrlDB added to yacy.java this function tests the current urlHashDB for unused urls ATTENTION: please don't use this function at the moment because it causes the wordIndexDB to flush all words into the word directory! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@853 6c8d7289-2bf4-0310-a012-ef5d649a1542
2005-10-05 12:45:33 +02:00
log.logInfo("FINISHED URL CLEANUP, WAIT FOR DUMP");
log.logInfo("You can now backup your old URL DB and rename minimized/urlHash.db to urlHash.db");
*) Asynchronous queuing of crawl job URLs (stackCrawl) various checks like the blacklist check or the robots.txt disallow check are now done by a separate thread to unburden the indexer thread(s) TODO: maybe we have to introduce a threadpool here if it turn out that this single thread is a bottleneck because of the time consuming robots.txt downloads *) improved index transfer The index selection and transmission is done in parallel now to improve index transfer performance. TODO: maybe we could speed up performance by unsing multiple transmission threads in parallel instead of only a single one. *) gzip encoded post requests it is now configureable if a gzip encoded post request should be send on intex transfer/distribution *) storage Peer (very experimentell and not optimized yet) Now it's possible to send the result of the yacy indexer thread to a remote peer istead of storing the indexed words locally. This could be done by setting the property "storagePeerHash" in the yacy config file - Please note that if the index transfer fails, the index ist stored locally. - TODO: currently this index transfer is done by the indexer thread. To seedup the indexer a) this transmission should be done in parallel and b) multiple chunks should be bundled and transfered together *) general performance improvements - better memory cleanup after http request processing has finished - replacing some string concatenations with stringBuffers - replacing BufferedInputStreams with serverByteBuffer - replacing vectors with arraylists wherever possible - replacing hashtables with hashmaps wherever possible This was done because function calls to verctor or hashtable functions take 3 time longer than calls to functions of arraylists or hashmaps. TODO: we should take a look on the class serverObject which is inherited from hashmap Do we realy need a synchronization for this class? TODO: replace arraylists with linkedLists if random access to the list elements is not needed *) Robots Parser supports if-modified-since downloads now If the downloaded robots.txt file is older than 7 days the robots parser tries to download the robots.txt with the if-modified-since header to avoid unnecessary downloads if the file was not changed. Additionally the ETag header is used to detect changes. *) Crawler: better handling of unsupported mimeTypes + FileExtension *) Bugfix: plasmaWordIndexEntity was not closed correctly in - query.java - plasmaswitchboard.java *) function minimizeUrlDB added to yacy.java this function tests the current urlHashDB for unused urls ATTENTION: please don't use this function at the moment because it causes the wordIndexDB to flush all words into the word directory! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@853 6c8d7289-2bf4-0310-a012-ef5d649a1542
2005-10-05 12:45:33 +02:00
log.logInfo("TERMINATED URL CLEANUP");
} catch (final Exception e) {
log.logSevere("Exception: " + e.getMessage(), e);
} catch (final Error e) {
log.logSevere("Error: " + e.getMessage(), e);
*) Asynchronous queuing of crawl job URLs (stackCrawl) various checks like the blacklist check or the robots.txt disallow check are now done by a separate thread to unburden the indexer thread(s) TODO: maybe we have to introduce a threadpool here if it turn out that this single thread is a bottleneck because of the time consuming robots.txt downloads *) improved index transfer The index selection and transmission is done in parallel now to improve index transfer performance. TODO: maybe we could speed up performance by unsing multiple transmission threads in parallel instead of only a single one. *) gzip encoded post requests it is now configureable if a gzip encoded post request should be send on intex transfer/distribution *) storage Peer (very experimentell and not optimized yet) Now it's possible to send the result of the yacy indexer thread to a remote peer istead of storing the indexed words locally. This could be done by setting the property "storagePeerHash" in the yacy config file - Please note that if the index transfer fails, the index ist stored locally. - TODO: currently this index transfer is done by the indexer thread. To seedup the indexer a) this transmission should be done in parallel and b) multiple chunks should be bundled and transfered together *) general performance improvements - better memory cleanup after http request processing has finished - replacing some string concatenations with stringBuffers - replacing BufferedInputStreams with serverByteBuffer - replacing vectors with arraylists wherever possible - replacing hashtables with hashmaps wherever possible This was done because function calls to verctor or hashtable functions take 3 time longer than calls to functions of arraylists or hashmaps. TODO: we should take a look on the class serverObject which is inherited from hashmap Do we realy need a synchronization for this class? TODO: replace arraylists with linkedLists if random access to the list elements is not needed *) Robots Parser supports if-modified-since downloads now If the downloaded robots.txt file is older than 7 days the robots parser tries to download the robots.txt with the if-modified-since header to avoid unnecessary downloads if the file was not changed. Additionally the ETag header is used to detect changes. *) Crawler: better handling of unsupported mimeTypes + FileExtension *) Bugfix: plasmaWordIndexEntity was not closed correctly in - query.java - plasmaswitchboard.java *) function minimizeUrlDB added to yacy.java this function tests the current urlHashDB for unused urls ATTENTION: please don't use this function at the moment because it causes the wordIndexDB to flush all words into the word directory! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@853 6c8d7289-2bf4-0310-a012-ef5d649a1542
2005-10-05 12:45:33 +02:00
}
}
/**
* Reads all words from the given file and creates a treemap, where key is
* the plasma word hash and value is the word itself.
*
* @param wordlist File where the words are stored.
* @return HashMap with the hash-word - relation.
*/
private static TreeMap<byte[], String> loadWordMap(final File wordlist) {
// returns a hash-word - Relation
final TreeMap<byte[], String> wordmap = new TreeMap<byte[], String>(Base64Order.enhancedCoder);
try {
String word;
final BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(wordlist)));
while ((word = br.readLine()) != null) wordmap.put(Word.word2hash(word), word);
br.close();
} catch (final IOException e) {}
return wordmap;
}
/**
* Cleans a wordlist in a file according to the length of the words. The
* file with the given filename is read and then only the words in the given
* length-range are written back to the file.
*
* @param wordlist Name of the file the words are stored in.
* @param minlength Minimal needed length for each word to be stored.
* @param maxlength Maximal allowed length for each word to be stored.
*/
private static void cleanwordlist(final String wordlist, final int minlength, final int maxlength) {
// start up
System.out.println(copyright);
System.out.println(hline);
Log.logConfig("CLEAN-WORDLIST", "START");
String word;
final TreeSet<String> wordset = new TreeSet<String>();
int count = 0;
try {
final BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(wordlist)));
final String seps = "' .,:/-&";
while ((word = br.readLine()) != null) {
word = word.toLowerCase().trim();
for (int i = 0; i < seps.length(); i++) {
if (word.indexOf(seps.charAt(i)) >= 0) word = word.substring(0, word.indexOf(seps.charAt(i)));
}
if ((word.length() >= minlength) && (word.length() <= maxlength)) wordset.add(word);
count++;
}
br.close();
if (wordset.size() != count) {
count = count - wordset.size();
final BufferedWriter bw = new BufferedWriter(new PrintWriter(new FileWriter(wordlist)));
while (!wordset.isEmpty()) {
word = wordset.first();
bw.write(word + "\n");
wordset.remove(word);
}
bw.close();
Log.logInfo("CLEAN-WORDLIST", "shrinked wordlist by " + count + " words.");
} else {
Log.logInfo("CLEAN-WORDLIST", "not necessary to change wordlist");
}
} catch (final IOException e) {
Log.logSevere("CLEAN-WORDLIST", "ERROR: " + e.getMessage());
System.exit(-1);
}
// finished
Log.logConfig("CLEAN-WORDLIST", "FINISHED");
}
private static String[] shift(final String[] args, final int pos, final int count) {
final String[] newargs = new String[args.length - count];
System.arraycopy(args, 0, newargs, 0, pos);
System.arraycopy(args, pos + count, newargs, pos, args.length - pos - count);
return newargs;
}
/**
* Uses an Iteration over urlHash.db to detect malformed URL-Entries.
* Damaged URL-Entries will be marked in a HashSet and removed at the end of the function.
*
* @param homePath Root-Path where all information is to be found.
*/
private static void urldbcleanup(final File dataHome, final File appHome, final String networkName) {
final File root = dataHome;
final File indexroot = new File(root, "DATA/INDEX");
try {Log.configureLogging(dataHome, appHome, new File(dataHome, "DATA/LOG/yacy.logging"));} catch (final Exception e) {}
final MetadataRepository currentUrlDB = new MetadataRepository(new File(new File(indexroot, networkName), "TEXT"), "text.urlmd", false, false);
currentUrlDB.deadlinkCleaner();
currentUrlDB.close();
}
private static void RWIHashList(final File dataHome, final File appHome, final String targetName, final String resource, final String format) {
Segment WordIndex = null;
final Log log = new Log("HASHLIST");
final File indexPrimaryRoot = new File(dataHome, "DATA/INDEX");
final String wordChunkStartHash = "AAAAAAAAAAAA";
try {Log.configureLogging(dataHome, appHome, new File(dataHome, "DATA/LOG/yacy.logging"));} catch (final Exception e) {}
log.logInfo("STARTING CREATION OF RWI-HASHLIST");
final File root = dataHome;
try {
Iterator<ReferenceContainer<WordReference>> indexContainerIterator = null;
if (resource.equals("all")) {
WordIndex = new Segment(
log,
new File(new File(indexPrimaryRoot, "freeworld"), "TEXT"),
10000,
Integer.MAX_VALUE, false, false);
indexContainerIterator = WordIndex.termIndex().referenceContainerIterator(wordChunkStartHash.getBytes(), false, false);
}
int counter = 0;
ReferenceContainer<WordReference> container = null;
if (format.equals("zip")) {
log.logInfo("Writing Hashlist to ZIP-file: " + targetName + ".zip");
final ZipEntry zipEntry = new ZipEntry(targetName + ".txt");
final File file = new File(root, targetName + ".zip");
final ZipOutputStream bos = new ZipOutputStream(new FileOutputStream(file));
bos.putNextEntry(zipEntry);
if(indexContainerIterator != null) {
while (indexContainerIterator.hasNext()) {
counter++;
container = indexContainerIterator.next();
bos.write(container.getTermHash());
bos.write(serverCore.CRLF);
if (counter % 500 == 0) {
log.logInfo("Found " + counter + " Hashs until now. Last found Hash: " + ASCII.String(container.getTermHash()));
}
}
}
bos.flush();
bos.close();
} else {
log.logInfo("Writing Hashlist to TXT-file: " + targetName + ".txt");
final File file = new File(root, targetName + ".txt");
final BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(file));
if(indexContainerIterator != null) {
while (indexContainerIterator.hasNext()) {
counter++;
container = indexContainerIterator.next();
bos.write(container.getTermHash());
bos.write(serverCore.CRLF);
if (counter % 500 == 0) {
log.logInfo("Found " + counter + " Hashs until now. Last found Hash: " + ASCII.String(container.getTermHash()));
}
}
}
bos.flush();
bos.close();
}
log.logInfo("Total number of Hashs: " + counter + ". Last found Hash: " + (container == null ? "null" : ASCII.String(container.getTermHash())));
} catch (final IOException e) {
log.logSevere("IOException", e);
}
if (WordIndex != null) {
WordIndex.close();
WordIndex = null;
}
}
/**
* Searching for peers affected by Bug
* @param homePath
*/
public static void testPeerDB(final File homePath) {
try {
final File yacyDBPath = new File(homePath, "DATA/INDEX/freeworld/NETWORK");
final String[] dbFileNames = {"seed.new.db","seed.old.db","seed.pot.db"};
for (final String dbFileName : dbFileNames) {
final File dbFile = new File(yacyDBPath,dbFileName);
2012-07-05 10:23:07 +02:00
final MapDataMining db = new MapDataMining(dbFile, Word.commonHashLength, Base64Order.enhancedCoder, 1024 * 512, 500, SeedDB.sortFields, SeedDB.longaccFields, SeedDB.doubleaccFields);
2011-12-15 15:15:53 +01:00
Iterator<Map.Entry<byte[], Map<String, String>>> it;
it = db.entries(true, false);
while (it.hasNext()) {
2011-12-15 15:15:53 +01:00
final Map.Entry<byte[], Map<String, String>> dna = it.next();
String peerHash = UTF8.String(dna.getKey());
if (peerHash.length() < Word.commonHashLength) {
2011-12-15 15:15:53 +01:00
final String peerName = dna.getValue().get("Name");
final String peerIP = dna.getValue().get("IP");
final String peerPort = dna.getValue().get("Port");
while (peerHash.length() < Word.commonHashLength) { peerHash = peerHash + "_"; }
System.err.println("Invalid Peer-Hash found in '" + dbFileName + "': " + peerName + ":" + peerHash + ", http://" + peerIP + ":" + peerPort);
}
}
db.close();
}
} catch (final Exception e) {
Log.logException(e);
}
}
/**
* Main-method which is started by java. Checks for special arguments or
* starts up the application.
*
* @param args
* Given arguments from the command line.
*/
public static void main(String args[]) {
try {
// check assertion status
//ClassLoader.getSystemClassLoader().setDefaultAssertionStatus(true);
boolean assertionenabled = false;
assert assertionenabled = true;
if (assertionenabled) System.out.println("Asserts are enabled");
// check memory amount
System.gc();
final long startupMemFree = MemoryControl.free();
final long startupMemTotal = MemoryControl.total();
// maybe go into headless awt mode: we have three cases depending on OS and one exception:
// windows : better do not go into headless mode
// mac : go into headless mode because an application is shown in gui which may not be wanted
// linux : go into headless mode because this does not need any head operation
// exception : if the -gui option is used then do not go into headless mode since that uses a gui
boolean headless = true;
if (OS.isWindows) headless = false;
if (args.length >= 1 && args[0].toLowerCase().equals("-gui")) headless = false;
System.setProperty("java.awt.headless", headless ? "true" : "false");
String s = ""; for (final String a: args) s += a + " ";
yacyRelease.startParameter = s.trim();
File applicationRoot = new File(System.getProperty("user.dir").replace('\\', '/'));
File dataRoot = applicationRoot;
//System.out.println("args.length=" + args.length);
//System.out.print("args=["); for (int i = 0; i < args.length; i++) System.out.print(args[i] + ", "); System.out.println("]");
if ((args.length >= 1) && (args[0].toLowerCase().equals("-startup") || args[0].equals("-start"))) {
// normal start-up of yacy
if (args.length > 1) dataRoot = new File(System.getProperty("user.home").replace('\\', '/'), args[1]);
startup(dataRoot, applicationRoot, startupMemFree, startupMemTotal, false);
} else if (args.length >= 1 && args[0].toLowerCase().equals("-gui")) {
// start-up of yacy with gui
if (args.length > 1) dataRoot = new File(System.getProperty("user.home").replace('\\', '/'), args[1]);
startup(dataRoot, applicationRoot, startupMemFree, startupMemTotal, true);
} else if ((args.length >= 1) && ((args[0].toLowerCase().equals("-shutdown")) || (args[0].equals("-stop")))) {
// normal shutdown of yacy
if (args.length == 2) applicationRoot= new File(args[1]);
shutdown(applicationRoot);
} else if ((args.length >= 1) && (args[0].toLowerCase().equals("-update"))) {
// aut-update yacy
if (args.length == 2) applicationRoot= new File(args[1]);
update(applicationRoot);
} else if ((args.length >= 1) && (args[0].toLowerCase().equals("-version"))) {
// show yacy version
System.out.println(copyright);
} else if ((args.length >= 1) && (args[0].toLowerCase().equals("-minimizeurldb"))) {
// migrate words from DATA/PLASMADB/WORDS path to assortment cache, if possible
// attention: this may run long and should not be interrupted!
if (args.length >= 3 && args[1].toLowerCase().equals("-cache")) {
args = shift(args, 1, 2);
}
if (args.length == 2) applicationRoot= new File(args[1]);
minimizeUrlDB(dataRoot, applicationRoot, "freeworld");
} else if ((args.length >= 1) && (args[0].toLowerCase().equals("-testpeerdb"))) {
if (args.length == 2) {
applicationRoot = new File(args[1]);
} else if (args.length > 2) {
System.err.println("Usage: -testPeerDB [homeDbRoot]");
}
testPeerDB(applicationRoot);
} else if ((args.length >= 1) && (args[0].toLowerCase().equals("-genwordstat"))) {
// this can help to create a stop-word list
// to use this, you need a 'yacy.words' file in the root path
// start this with "java -classpath classes yacy -genwordstat [<rootdir>]"
if (args.length == 2) applicationRoot= new File(args[1]);
genWordstat(applicationRoot);
} else if ((args.length == 4) && (args[0].toLowerCase().equals("-cleanwordlist"))) {
// this can be used to organize and clean a word-list
// start this with "java -classpath classes yacy -cleanwordlist <word-file> <minlength> <maxlength>"
final int minlength = Integer.parseInt(args[2]);
final int maxlength = Integer.parseInt(args[3]);
cleanwordlist(args[1], minlength, maxlength);
} else if ((args.length >= 1) && (args[0].toLowerCase().equals("-urldbcleanup"))) {
// generate a url list and save it in a file
if (args.length == 2) applicationRoot= new File(args[1]);
urldbcleanup(dataRoot, applicationRoot, "freeworld");
} else if ((args.length >= 1) && (args[0].toLowerCase().equals("-rwihashlist"))) {
// generate a url list and save it in a file
String domain = "all";
String format = "txt";
if (args.length >= 2) domain= args[1];
if (args.length >= 3) format= args[2];
if (args.length == 4) applicationRoot= new File(args[3]);
final String outfile = "rwihashlist_" + System.currentTimeMillis();
RWIHashList(dataRoot, applicationRoot, outfile, domain, format);
} else {
if (args.length == 1) applicationRoot= new File(args[0]);
startup(dataRoot, applicationRoot, startupMemFree, startupMemTotal, false);
}
} finally {
Log.shutdown();
}
}
}
/**
* This class is a helper class whose instance is started, when the java virtual
* machine shuts down. Signals the plasmaSwitchboard to shut down.
*/
class shutdownHookThread extends Thread {
private final Switchboard sb;
private final Thread mainThread;
public shutdownHookThread(final Thread mainThread, final Switchboard sb) {
super();
this.sb = sb;
this.mainThread = mainThread;
}
2012-07-05 10:23:07 +02:00
@Override
public void run() {
try {
if (!this.sb.isTerminated()) {
Log.logConfig("SHUTDOWN","Shutdown via shutdown hook.");
// sending the yacy main thread a shutdown signal
Log.logFine("SHUTDOWN","Signaling shutdown to the switchboard.");
this.sb.terminate("shutdown hook");
// waiting for the yacy thread to finish execution
Log.logFine("SHUTDOWN","Waiting for main thread to finish.");
if (this.mainThread.isAlive() && !this.sb.isTerminated()) {
this.mainThread.join();
}
}
} catch (final Exception e) {
Log.logSevere("SHUTDOWN","Unexpected error. " + e.getClass().getName(),e);
}
}
}