// yacy_search_server/source/de/anomic/plasma/plasmaDbImporter.java

package de.anomic.plasma;

import java.io.File;
import java.util.Iterator;
import java.util.Vector;

import de.anomic.server.serverDate;
import de.anomic.server.logging.serverLog;
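
/**
 * Imports a foreign YaCy word index and URL database into this peer's
 * home databases. Each import runs as its own thread inside the
 * "DbImport" thread group and can be paused, continued, and stopped
 * while it is running.
 */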
public class plasmaDbImporter extends Thread {
    public static final Vector finishedJobs = new Vector();
    public static final ThreadGroup runningJobs = new ThreadGroup("DbImport");
    public static int currMaxJobNr = 0;

    private final int jobNr;
    private final plasmaCrawlLURL homeUrlDB;
    private final plasmaWordIndex homeWordIndex;
    private final plasmaCrawlLURL importUrlDB;
    private final plasmaWordIndex importWordIndex;
    //private final String importPath;
    private final File importRoot;
    private final int importStartSize;
    private final serverLog log;

    private boolean stopped = false;
    private boolean paused = false;

    private String wordHash = "------------";
    long wordChunkStart = System.currentTimeMillis(), wordChunkEnd = this.wordChunkStart;
    String wordChunkStartHash = "------------", wordChunkEndHash;
    private long urlCounter = 0, wordCounter = 0, entryCounter = 0;
    private long globalStart = System.currentTimeMillis();
    private long globalEnd;
    private String error;
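
    /**
     * Signals the importer to stop. A paused thread is woken up first
     * so that it can observe the stop flag.
     */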
    public void stoppIt() {
        this.stopped = true;
        this.continueIt();
    }

    /**
     * Pauses the import thread; it will block inside isAborted()
     * until continueIt() is called.
     */
    public void pauseIt() {
        synchronized (this) {
            this.paused = true;
        }
    }

    /**
     * Wakes the import thread up again if it was paused before.
     */
    public void continueIt() {
        synchronized (this) {
            if (this.paused) {
                this.paused = false;
                this.notifyAll();
            }
        }
    }

    public boolean isPaused() {
        synchronized (this) {
            return this.paused;
        }
    }
    /**
     * Can be used to close all still running importer threads,
     * e.g. on server shutdown.
     */
    public static void close() {
        /* waiting for all threads to finish */
        int threadCount = runningJobs.activeCount();
        Thread[] threadList = new Thread[threadCount];
        threadCount = plasmaDbImporter.runningJobs.enumerate(threadList);
        if (threadCount == 0) return;

        serverLog log = new serverLog("DB-IMPORT");
        try {
            // trying to gracefully stop all still running sessions ...
            log.logInfo("Signaling shutdown to " + threadCount + " remaining dbImporter threads ...");
            for (int currentThreadIdx = 0; currentThreadIdx < threadCount; currentThreadIdx++) {
                Thread currentThread = threadList[currentThreadIdx];
                // only importer threads live in this group, but be defensive about the cast
                if (currentThread.isAlive() && (currentThread instanceof plasmaDbImporter)) {
                    ((plasmaDbImporter) currentThread).stoppIt();
                }
            }

            // waiting a few ms for the importer threads to continue processing
            try { Thread.sleep(500); } catch (InterruptedException ex) {}

            // interrupting all still running or pooled threads ...
            log.logInfo("Sending interruption signal to " + runningJobs.activeCount() + " remaining dbImporter threads ...");
            plasmaDbImporter.runningJobs.interrupt();

            // we need to use a timeout here because of missing interruptible session threads ...
            log.logFine("Waiting for " + runningJobs.activeCount() + " remaining dbImporter threads to finish shutdown ...");
            for (int currentThreadIdx = 0; currentThreadIdx < threadCount; currentThreadIdx++) {
                Thread currentThread = threadList[currentThreadIdx];
                if (currentThread.isAlive()) {
                    log.logFine("Waiting for dbImporter thread '" + currentThread.getName() + "' [" + currentThreadIdx + "] to finish shutdown.");
                    try { currentThread.join(500); } catch (InterruptedException ex) {}
                }
            }
            log.logInfo("Shutdown of remaining dbImporter threads finished.");
        } catch (Exception e) {
            log.logSevere("Unexpected error while trying to shutdown all remaining dbImporter threads.", e);
        }
    }
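
    /* status accessors, e.g. for polling the state of a running import job */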
    public String getError() {
        return this.error;
    }

    public int getJobNr() {
        return this.jobNr;
    }

    public String getCurrentWordhash() {
        return this.wordHash;
    }

    public long getUrlCounter() {
        return this.urlCounter;
    }

    public long getWordEntityCounter() {
        return this.wordCounter;
    }

    public long getWordEntryCounter() {
        return this.entryCounter;
    }

    public File getImportRoot() {
        return this.importRoot;
    }

    public int getImportWordDbSize() {
        return this.importWordIndex.size();
    }
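
    /**
     * @param theHomeIndexDB the home word index into which the data is imported
     * @param theHomeUrlDB the home URL db into which new URLs are imported
     * @param theImportPath the root directory of the foreign word index db to import
     */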
    public plasmaDbImporter(plasmaWordIndex theHomeIndexDB, plasmaCrawlLURL theHomeUrlDB, String theImportPath) {
        super(runningJobs, "DB-Import_" + theImportPath);
        this.log = new serverLog("DB-IMPORT");

        synchronized (runningJobs) {
            this.jobNr = currMaxJobNr;
            currMaxJobNr++;
        }

        if (theImportPath == null) throw new NullPointerException("The import path must not be null.");
        //this.importPath = theImportPath;
        this.importRoot = new File(theImportPath);

        if (theHomeIndexDB == null) throw new NullPointerException("The home word index must not be null.");
        this.homeWordIndex = theHomeIndexDB;

        if (theHomeUrlDB == null) throw new NullPointerException("The home URL db must not be null.");
        this.homeUrlDB = theHomeUrlDB;

        if (this.homeWordIndex.getRoot().equals(this.importRoot)) {
            throw new IllegalArgumentException("Import and home DB directory must not be equal");
        }

        // check that the import directory is usable; report the first failing check
        String errorMsg = null;
        if (!this.importRoot.exists()) errorMsg = "Import directory does not exist.";
        else if (!this.importRoot.isDirectory()) errorMsg = "Import directory is not a directory.";
        else if (!this.importRoot.canRead()) errorMsg = "Import directory is not readable.";
        else if (!this.importRoot.canWrite()) errorMsg = "Import directory is not writeable.";
        if (errorMsg != null) {
            this.log.logSevere(errorMsg + "\nName: " + this.importRoot.getAbsolutePath());
            throw new IllegalArgumentException(errorMsg);
        }

        // configure import DB
        this.log.logFine("Initializing source word index db.");
        this.importWordIndex = new plasmaWordIndex(this.importRoot, 8*1024*1024, this.log);
        this.log.logFine("Initializing import URL db.");
        this.importUrlDB = new plasmaCrawlLURL(new File(this.importRoot, "urlHash.db"), 4*1024*1024);
        this.importStartSize = this.importWordIndex.size();
    }
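
    /*
     * Hedged usage sketch (not part of the original call sites, which live
     * elsewhere in the YaCy codebase): a caller holding the peer's live
     * databases could run an import like this; homeWordIndex, homeUrlDB and
     * the import path are assumptions for illustration only.
     *
     *   plasmaDbImporter importer = new plasmaDbImporter(homeWordIndex, homeUrlDB, "DATA/IMPORT/indexDump");
     *   importer.start();                             // executes importWordsDB() on its own thread
     *   int percent = importer.getProcessingStatus(); // poll progress from a monitoring thread
     *   plasmaDbImporter.close();                     // on shutdown, stop all running imports
     */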
    public void run() {
        try {
            importWordsDB();
        } finally {
            this.globalEnd = System.currentTimeMillis();
            finishedJobs.add(this);
        }
    }
    public long getTotalRuntime() {
        return (this.globalEnd == 0) ? System.currentTimeMillis() - this.globalStart : this.globalEnd - this.globalStart;
    }
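
    /**
     * @return the import progress in percent, derived from how far the
     *         import word index has shrunk since the job was started
     */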
    public int getProcessingStatus() {
        // avoid a division by zero for import DBs that started with fewer than 100 words
        int hundredth = this.importStartSize / 100;
        if (hundredth == 0) return 0;
        return (this.importStartSize - this.importWordIndex.size()) / hundredth;
    }

    public long getElapsedTime() {
        return System.currentTimeMillis() - this.globalStart;
    }

    public long getEstimatedTime() {
        // remaining words times the average time needed per word so far
        return (this.wordCounter == 0) ? 0 : this.importWordIndex.size() * ((System.currentTimeMillis() - this.globalStart) / this.wordCounter);
    }
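
    /**
     * Does the actual import: iterates over all word hashes of the import
     * word index, copies each URL that is still unknown to the home URL db,
     * merges the word entries into the home word index, and deletes each
     * word entity from the import db after it was imported successfully.
     */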
    public void importWordsDB() {
        this.log.logInfo("STARTING DB-IMPORT");

        try {
            this.log.logInfo("Importing DB from '" + this.importRoot.getAbsolutePath() + "' to '" + this.homeWordIndex.getRoot().getAbsolutePath() + "'.");
            this.log.logInfo("Home word index contains " + this.homeWordIndex.size() + " words and " + this.homeUrlDB.size() + " URLs.");
            this.log.logInfo("Import word index contains " + this.importWordIndex.size() + " words and " + this.importUrlDB.size() + " URLs.");

            // iterate over all words from the import db
            Iterator importWordHashIterator = this.importWordIndex.wordHashes(wordChunkStartHash, true, true);
            while (!isAborted() && importWordHashIterator.hasNext()) {
                plasmaWordIndexEntity importWordIdxEntity = null;
                try {
                    wordCounter++;
                    wordHash = (String) importWordHashIterator.next();
                    importWordIdxEntity = importWordIndex.getEntity(wordHash, true, -1);

                    // skip and remove empty word entities
                    if (importWordIdxEntity.size() == 0) {
                        importWordIdxEntity.deleteComplete();
                        continue;
                    }

                    // creating a container used to hold the imported entries
                    plasmaWordIndexEntryContainer newContainer = new plasmaWordIndexEntryContainer(wordHash, importWordIdxEntity.size());

                    // the combined container will fit, read the container
                    Iterator importWordIdxEntries = importWordIdxEntity.elements(true);
                    plasmaWordIndexEntry importWordIdxEntry;
                    while (importWordIdxEntries.hasNext()) {

                        // testing if the import process was aborted
                        if (isAborted()) break;

                        // getting the next word index entry
                        entryCounter++;
                        importWordIdxEntry = (plasmaWordIndexEntry) importWordIdxEntries.next();
                        String urlHash = importWordIdxEntry.getUrlHash();
                        if ((this.importUrlDB.exists(urlHash)) && (!this.homeUrlDB.exists(urlHash))) {
                            urlCounter++;
                            // importing the new url
                            plasmaCrawlLURL.Entry urlEntry = this.importUrlDB.getEntry(urlHash);
                            this.homeUrlDB.newEntry(urlEntry);

                            if (urlCounter % 500 == 0) {
                                this.log.logFine(urlCounter + " URLs processed so far.");
                            }
                        }

                        // adding the word index entry to the container
                        newContainer.add(importWordIdxEntry, System.currentTimeMillis());

                        if (entryCounter % 500 == 0) {
                            this.log.logFine(entryCounter + " word entries and " + wordCounter + " word entities processed so far.");
                        }
                    }

                    // testing if the import process was aborted
                    if (isAborted()) break;

                    // importing the entity container into the home db
                    homeWordIndex.addEntries(newContainer, true);

                    // delete the complete index entity file
                    importWordIdxEntity.close();
                    importWordIndex.deleteIndex(wordHash);

                    // print out some statistical information
                    if (wordCounter % 500 == 0) {
                        wordChunkEndHash = wordHash;
                        wordChunkEnd = System.currentTimeMillis();
                        long duration = Math.max(1, wordChunkEnd - wordChunkStart); // guard against division by zero
                        log.logInfo(wordCounter + " word entities imported " +
                                "[" + wordChunkStartHash + " .. " + wordChunkEndHash + "] " +
                                this.getProcessingStatus() + "%\n" +
                                "Speed: " + 500*1000/duration + " word entities/s" +
                                " | Elapsed time: " + serverDate.intervalToString(getElapsedTime()) +
                                " | Estimated time: " + serverDate.intervalToString(getEstimatedTime()) + "\n" +
                                "Home Words = " + homeWordIndex.size() +
                                " | Import Words = " + importWordIndex.size());
                        wordChunkStart = wordChunkEnd;
                        wordChunkStartHash = wordChunkEndHash;
                    }

                } catch (Exception e) {
                    log.logSevere("Import of word entity '" + wordHash + "' failed.", e);
                } finally {
                    if (importWordIdxEntity != null) try { importWordIdxEntity.close(); } catch (Exception e) {}
                }
            }

            this.log.logInfo("Home word index contains " + homeWordIndex.size() + " words and " + homeUrlDB.size() + " URLs.");
            this.log.logInfo("Import word index contains " + importWordIndex.size() + " words and " + importUrlDB.size() + " URLs.");
            this.log.logInfo("DB-IMPORT FINISHED");
        } catch (Exception e) {
            this.log.logSevere("Database import failed.", e);
            this.error = e.toString();
        } finally {
            if (importUrlDB != null) try { importUrlDB.close(); } catch (Exception e) {}
            if (importWordIndex != null) try { importWordIndex.close(5000); } catch (Exception e) {}
        }
    }
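
    /**
     * Blocks while the importer is paused and reports whether the
     * import should be aborted.
     */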
    private boolean isAborted() {
        synchronized (this) {
            // block while paused; loop to be robust against spurious wakeups
            while (this.paused && !this.stopped) {
                try {
                    this.wait();
                } catch (InterruptedException e) {
                    // preserve the interrupt state so the check below sees it
                    Thread.currentThread().interrupt();
                    break;
                }
            }
        }
        return (this.stopped) || Thread.currentThread().isInterrupted();
    }
}