yacy_search_server/source/de/anomic/crawler/ExternalIndexImporter.java
orbiter 1d8d51075c refactoring:
- removed the plasma package. The name of that package came from a very early pre-version of YaCy, even before YaCy was named AnomicHTTPProxy. The Proxy project introduced search for cache contents using class files that had been developed during the plasma project. Information from 2002 about plasma can be found here:
http://web.archive.org/web/20020802110827/http://anomic.de/AnomicPlasma/index.html
We still have one class that comes mostly unchanged from the plasma project, the Condenser class. But this is now part of the document package and all other classes in the plasma package can be assigned to other packages.
- cleaned up the http package: better structure of that class and clean isolation of server and client classes. The old HTCache becomes part of the client sub-package of http.
- because the plasmaSwitchboard is now part of the search package all servlets had to be touched to declare a different package source.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6232 6c8d7289-2bf4-0310-a012-ef5d649a1542
2009-07-19 20:37:44 +00:00

227 lines
11 KiB
Java

package de.anomic.crawler;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.TreeSet;

import de.anomic.kelondro.text.Reference;
import de.anomic.kelondro.text.ReferenceContainer;
import de.anomic.kelondro.text.Segment;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
import de.anomic.kelondro.text.referencePrototype.WordReference;
import de.anomic.kelondro.util.DateFormatter;
/**
 * Importer that moves the content of one {@link Segment} (the import word index)
 * into another {@link Segment} (the home word index).
 *
 * For every reference container found in the term index of the import segment
 * the URL metadata of the referenced URLs is copied into the URL metadata of the
 * home segment, the container is added to the term index of the home segment and
 * the word is then deleted from the term index of the import segment.
 */
public class ExternalIndexImporter extends AbstractImporter implements Importer {

    /** Number of processed items between two progress log messages. */
    private static final int LOG_INTERVAL = 500;

    /**
     * the source word index (the DB to import)
     */
    private final Segment importWordIndex;

    /**
     * the destination word index (the home DB)
     */
    protected Segment homeWordIndex;

    /** size of the import term index (sizesMax()) at construction time */
    private final int importStartSize;

    /** hash of the word that is currently being imported (or was imported last) */
    private byte[] wordHash = "------------".getBytes();

    /** start / end time stamps (milliseconds) of the current statistics chunk */
    long wordChunkStart = System.currentTimeMillis(), wordChunkEnd = this.wordChunkStart;

    /** first / last word hash of the current statistics chunk */
    byte[] wordChunkStartHash = "------------".getBytes(), wordChunkEndHash;

    private long urlCounter = 0, wordCounter = 0, entryCounter = 0, notBoundEntryCounter = 0;

    /**
     * Creates an importer and remembers the size of the import term index at
     * this moment; that size is the reference for the progress computation.
     *
     * @param homeWI the destination segment that receives the data
     * @param importWI the source segment whose content is imported
     */
    public ExternalIndexImporter(final Segment homeWI, final Segment importWI) {
        super("PLASMADB");
        this.homeWordIndex = homeWI;
        this.importWordIndex = importWI;
        this.importStartSize = this.importWordIndex.termIndex().sizesMax();
    }

    /**
     * Renders a word hash for log and status output. A byte array must not be
     * concatenated to a string directly because that only prints its identity
     * (something like <code>[B@1a2b3c</code>) instead of its content.
     *
     * @param hash the hash bytes, may be <code>null</code>
     * @return the hash as string, or <code>"null"</code> if there is no hash
     */
    private static String hashToString(final byte[] hash) {
        // decoded with the platform charset, the counterpart of the getBytes() calls above
        return (hash == null) ? "null" : new String(hash);
    }

    /**
     * @return the location of the import word index as string
     * @see Importer#getJobName()
     */
    public String getJobName() {
        return this.importWordIndex.getLocation().toString();
    }

    /**
     * @return a multi-line text with the current word hash and the counters of
     *         this import job
     * @see Importer#getStatus()
     */
    public String getStatus() {
        final StringBuilder theStatus = new StringBuilder();

        theStatus.append("Hash=").append(hashToString(this.wordHash)).append("\n");
        theStatus.append("#URL=").append(this.urlCounter).append("\n");
        theStatus.append("#Word Entity=").append(this.wordCounter).append("\n");
        theStatus.append("#Word Entry={").append(this.entryCounter);
        theStatus.append(" ,NotBound=").append(this.notBoundEntryCounter).append("}");

        return theStatus.toString();
    }

    /**
     * Runs the import and records the end time in <code>globalEnd</code>,
     * regardless of whether the import succeeded.
     */
    public void run() {
        try {
            importWordsDB();
        } finally {
            this.globalEnd = System.currentTimeMillis();
        }
    }

    /**
     * Computes the progress as the share of processed word entities relative to
     * the size of the import term index at construction time.
     *
     * @return the progress in percent, always within 0..100
     * @see Importer#getProcessingStatusPercent()
     */
    public int getProcessingStatusPercent() {
        if (this.importStartSize <= 0) {
            return 0;
        }
        // computed with long arithmetic: wordCounter * 100 cannot overflow here,
        // and small indexes (less than 100 words) still yield a real percentage
        final long percent = (this.wordCounter * 100L) / this.importStartSize;
        return (int) Math.max(0L, Math.min(100L, percent));
    }

    /**
     * Extrapolates the remaining time from the elapsed time and the number of
     * word entities processed so far.
     *
     * @return the estimated remaining time in the unit of getElapsedTime(),
     *         0 if nothing was processed yet; never negative
     * @see Importer#getEstimatedTime()
     */
    public long getEstimatedTime() {
        if (this.wordCounter == 0) {
            return 0;
        }
        // read the elapsed time once so that both terms use the same value
        final long elapsed = getElapsedTime();
        final long remaining = ((this.importStartSize * elapsed) / this.wordCounter) - elapsed;
        // more words than initially counted may be processed; do not report a negative time
        return Math.max(0L, remaining);
    }

    /**
     * Imports all words of the import word index into the home word index.
     *
     * The reference containers are fetched chunk-wise from the import term index.
     * For each container the metadata of every referenced URL is loaded from the
     * import segment and stored in the home segment; references to URLs without
     * metadata are removed from the container. The remaining container is added
     * to the home term index and the word is deleted from the import term index.
     *
     * A failure while importing a single word is logged and the import continues
     * with the next word. Any other failure is logged and stored in
     * <code>error</code>. The import word index is closed at the end in any case.
     */
    public void importWordsDB() {
        this.log.logInfo("STARTING DB-IMPORT");

        try {
            this.log.logInfo("Importing DB from '" + this.importWordIndex.getLocation().getAbsolutePath() + "'");
            this.log.logInfo("Home word index contains " + homeWordIndex.termIndex().sizesMax() + " words and " + homeWordIndex.urlMetadata().size() + " URLs.");
            this.log.logInfo("Import word index contains " + this.importWordIndex.termIndex().sizesMax() + " words and " + this.importWordIndex.urlMetadata().size() + " URLs.");

            // URL hashes for which no metadata exists in the import segment
            final HashSet<String> unknownUrlBuffer = new HashSet<String>();
            // URL hashes whose metadata was already copied to the home segment
            final HashSet<String> importedUrlBuffer = new HashSet<String>();

            // iterate over all words from import db
            // NOTE(review): the argument 100 is presumably the number of containers per chunk - confirm
            Iterator<ReferenceContainer<WordReference>> indexContainerIterator = this.importWordIndex.termIndex().references(this.wordChunkStartHash, false, 100, false).iterator();
            while (!isAborted() && indexContainerIterator.hasNext()) {

                final TreeSet<String> entityUrls = new TreeSet<String>();
                ReferenceContainer<WordReference> newContainer = null;
                try {
                    this.wordCounter++;
                    newContainer = indexContainerIterator.next();
                    this.wordHash = newContainer.getTermHash();

                    // loop through the entries of the container and collect the url hashes
                    final Iterator<WordReference> importWordIdxEntries = newContainer.entries();
                    while (importWordIdxEntries.hasNext()) {
                        // testing if import process was aborted
                        if (isAborted()) {
                            break;
                        }
                        final Reference importWordIdxEntry = importWordIdxEntries.next();
                        entityUrls.add(importWordIdxEntry.metadataHash());
                    }

                    for (final String urlHash : entityUrls) {
                        if (isAborted()) {
                            break;
                        }

                        if (!importedUrlBuffer.contains(urlHash)) {
                            // the url must be imported first; skip the lookup if it is
                            // already known that the import segment has no metadata for it
                            final URLMetadataRow urlEntry = unknownUrlBuffer.contains(urlHash)
                                    ? null
                                    : this.importWordIndex.urlMetadata().load(urlHash, null, 0);

                            if (urlEntry == null) {
                                // no metadata: drop the reference from the container
                                unknownUrlBuffer.add(urlHash);
                                this.notBoundEntryCounter++;
                                newContainer.remove(urlHash);
                                continue;
                            }

                            /* write it into the home url db */
                            homeWordIndex.urlMetadata().store(urlEntry);
                            importedUrlBuffer.add(urlHash);
                            this.urlCounter++;

                            if (this.urlCounter % LOG_INTERVAL == 0) {
                                this.log.logFine(this.urlCounter + " URLs processed so far.");
                            }
                        }
                        this.entryCounter++;
                    }

                    // testing if import process was aborted
                    if (isAborted()) {
                        break;
                    }

                    // importing entity container to home db
                    if (newContainer.size() > 0) {
                        homeWordIndex.termIndex().add(newContainer);
                    }

                    // delete the imported word from the import db
                    this.importWordIndex.termIndex().delete(this.wordHash);

                    // print out some statistical information
                    if (this.entryCounter % LOG_INTERVAL == 0) {
                        this.log.logFine(this.entryCounter + " word entries and " + this.wordCounter + " word entities processed so far.");
                    }

                    if (this.wordCounter % LOG_INTERVAL == 0) {
                        this.wordChunkEndHash = this.wordHash;
                        this.wordChunkEnd = System.currentTimeMillis();
                        // a chunk may finish within the same millisecond; avoid a division by zero
                        final long duration = Math.max(1L, this.wordChunkEnd - this.wordChunkStart);
                        this.log.logInfo(this.wordCounter + " word entities imported " +
                                "[" + hashToString(this.wordChunkStartHash) + " .. " + hashToString(this.wordChunkEndHash) + "] " +
                                this.getProcessingStatusPercent() + "%\n" +
                                "Speed: " + LOG_INTERVAL * 1000 / duration + " word entities/s" +
                                " | Elapsed time: " + DateFormatter.formatInterval(getElapsedTime()) +
                                " | Estimated time: " + DateFormatter.formatInterval(getEstimatedTime()) + "\n" +
                                "Home Words = " + homeWordIndex.termIndex().sizesMax() +
                                " | Import Words = " + this.importWordIndex.termIndex().sizesMax());
                        this.wordChunkStart = this.wordChunkEnd;
                        this.wordChunkStartHash = this.wordChunkEndHash;
                    }

                } catch (final Exception e) {
                    this.log.logSevere("Import of word entity '" + hashToString(this.wordHash) + "' failed.", e);
                } finally {
                    if (newContainer != null) {
                        newContainer.clear();
                    }
                }

                if (!indexContainerIterator.hasNext()) {
                    // We may not be finished yet, try to get the next chunk of wordHashes
                    final TreeSet<ReferenceContainer<WordReference>> containers = this.importWordIndex.termIndex().references(this.wordHash, false, 100, false);
                    indexContainerIterator = containers.iterator();
                    // Make sure we don't get the same wordhash twice, but don't skip a word:
                    // the first container is consumed here; only if it belongs to a different
                    // word the iterator is re-created so that this word is not lost.
                    // The hashes are byte arrays and must be compared by content.
                    if (indexContainerIterator.hasNext()
                            && !Arrays.equals(this.wordHash, indexContainerIterator.next().getTermHash())) {
                        indexContainerIterator = containers.iterator();
                    }
                }
            }

            this.log.logInfo("Home word index contains " + homeWordIndex.termIndex().sizesMax() + " words and " + homeWordIndex.urlMetadata().size() + " URLs.");
            this.log.logInfo("Import word index contains " + this.importWordIndex.termIndex().sizesMax() + " words and " + this.importWordIndex.urlMetadata().size() + " URLs.");
        } catch (final Exception e) {
            this.log.logSevere("Database import failed.", e);
            e.printStackTrace();
            this.error = e.toString();
        } finally {
            this.log.logInfo("Import process finished.");
            if (this.importWordIndex != null) {
                try {
                    this.importWordIndex.close();
                } catch (final Exception e) {
                    // closing is best effort, but a failure should not go unnoticed
                    this.log.logSevere("Closing the import word index failed.", e);
                }
            }
        }
    }
}