package de.anomic.crawler;

import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.TreeSet;

import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.rwi.Reference;
import net.yacy.kelondro.rwi.ReferenceContainer;
import net.yacy.kelondro.util.DateFormatter;

import de.anomic.search.Segment;
2005-10-22 15:28:04 +02:00
2009-07-19 22:37:44 +02:00
public class ExternalIndexImporter extends AbstractImporter implements Importer {
2007-05-06 11:52:04 +02:00
/ * *
* the source word index ( the DB to import )
* /
2009-05-28 16:26:05 +02:00
private final Segment importWordIndex ;
2007-05-06 11:52:04 +02:00
/ * *
* the destination word index ( the home DB )
* /
2009-05-28 16:26:05 +02:00
protected Segment homeWordIndex ;
2008-08-02 14:12:04 +02:00
private final int importStartSize ;
2006-01-31 13:30:24 +01:00
2009-04-16 17:29:00 +02:00
private byte [ ] wordHash = " ------------ " . getBytes ( ) ;
2005-10-22 15:28:04 +02:00
2005-12-06 11:41:19 +01:00
long wordChunkStart = System . currentTimeMillis ( ) , wordChunkEnd = this . wordChunkStart ;
2009-04-16 17:29:00 +02:00
byte [ ] wordChunkStartHash = " ------------ " . getBytes ( ) , wordChunkEndHash ;
2006-02-16 14:07:01 +01:00
private long urlCounter = 0 , wordCounter = 0 , entryCounter = 0 , notBoundEntryCounter = 0 ;
2005-10-22 15:28:04 +02:00
2006-01-31 13:30:24 +01:00
2009-07-19 22:37:44 +02:00
public ExternalIndexImporter ( final Segment homeWI , final Segment importWI ) {
2008-05-06 15:44:38 +02:00
super ( " PLASMADB " ) ;
2007-05-06 11:52:04 +02:00
this . homeWordIndex = homeWI ;
2006-12-05 03:47:51 +01:00
this . importWordIndex = importWI ;
2009-06-14 00:59:54 +02:00
this . importStartSize = this . importWordIndex . termIndex ( ) . sizesMax ( ) ;
2005-12-06 11:41:19 +01:00
}
2008-05-06 15:44:38 +02:00
2007-05-06 11:52:04 +02:00
/ * *
2008-05-06 15:44:38 +02:00
* @see Importer # getJobName ( )
2007-05-06 11:52:04 +02:00
* /
2006-01-31 13:30:24 +01:00
public String getJobName ( ) {
2009-05-28 16:26:05 +02:00
return this . importWordIndex . getLocation ( ) . toString ( ) ;
2005-12-06 11:41:19 +01:00
}
2006-01-31 13:30:24 +01:00
2007-05-06 11:52:04 +02:00
/ * *
2008-05-06 15:44:38 +02:00
* @see Importer # getStatus ( )
2007-05-06 11:52:04 +02:00
* /
2006-01-31 13:30:24 +01:00
public String getStatus ( ) {
2008-12-04 13:54:16 +01:00
final StringBuilder theStatus = new StringBuilder ( ) ;
2005-12-06 11:41:19 +01:00
2006-01-31 13:30:24 +01:00
theStatus . append ( " Hash= " ) . append ( this . wordHash ) . append ( " \ n " ) ;
theStatus . append ( " #URL= " ) . append ( this . urlCounter ) . append ( " \ n " ) ;
2006-02-16 14:07:01 +01:00
theStatus . append ( " #Word Entity= " ) . append ( this . wordCounter ) . append ( " \ n " ) ;
theStatus . append ( " #Word Entry={ " ) . append ( this . entryCounter ) ;
2006-03-17 22:52:36 +01:00
theStatus . append ( " ,NotBound= " ) . append ( this . notBoundEntryCounter ) . append ( " } " ) ;
2005-12-06 11:41:19 +01:00
2006-01-31 13:30:24 +01:00
return theStatus . toString ( ) ;
2005-10-22 15:28:04 +02:00
}
public void run ( ) {
try {
importWordsDB ( ) ;
} finally {
2005-12-06 11:41:19 +01:00
this . globalEnd = System . currentTimeMillis ( ) ;
2006-12-05 03:47:51 +01:00
//this.sb.dbImportManager.finishedJobs.add(this);
2005-10-22 15:28:04 +02:00
}
}
2006-01-31 13:30:24 +01:00
2007-05-06 11:52:04 +02:00
/ * *
2008-05-06 15:44:38 +02:00
* @see Importer # getProcessingStatusPercent ( )
2007-05-06 11:52:04 +02:00
* /
2006-01-31 13:30:24 +01:00
public int getProcessingStatusPercent ( ) {
2005-12-20 13:03:34 +01:00
// thid seems to be better:
// (this.importStartSize-this.importWordIndex.size())*100/((this.importStartSize==0)?1:this.importStartSize);
// but maxint (2,147,483,647) could be exceeded when WordIndexes reach 20M entries
2006-02-02 17:46:58 +01:00
//return (this.importStartSize-this.importWordIndex.size())/((this.importStartSize<100)?1:(this.importStartSize)/100);
return ( int ) ( this . wordCounter ) / ( ( this . importStartSize < 100 ) ? 1 : ( this . importStartSize ) / 100 ) ;
2005-10-22 15:28:04 +02:00
}
2006-01-31 13:30:24 +01:00
2007-05-06 11:52:04 +02:00
/ * *
2008-05-06 15:44:38 +02:00
* @see Importer # getElapsedTime ( )
2007-05-06 11:52:04 +02:00
* /
2005-10-22 15:28:04 +02:00
public long getEstimatedTime ( ) {
2006-02-02 17:46:58 +01:00
return ( this . wordCounter = = 0 ) ? 0 : ( ( this . importStartSize * getElapsedTime ( ) ) / this . wordCounter ) - getElapsedTime ( ) ;
2005-10-22 15:28:04 +02:00
}
public void importWordsDB ( ) {
this . log . logInfo ( " STARTING DB-IMPORT " ) ;
2007-03-24 16:28:17 +01:00
try {
2009-05-28 16:26:05 +02:00
this . log . logInfo ( " Importing DB from ' " + this . importWordIndex . getLocation ( ) . getAbsolutePath ( ) + " ' " ) ;
2009-06-14 00:59:54 +02:00
this . log . logInfo ( " Home word index contains " + homeWordIndex . termIndex ( ) . sizesMax ( ) + " words and " + homeWordIndex . urlMetadata ( ) . size ( ) + " URLs. " ) ;
this . log . logInfo ( " Import word index contains " + this . importWordIndex . termIndex ( ) . sizesMax ( ) + " words and " + this . importWordIndex . urlMetadata ( ) . size ( ) + " URLs. " ) ;
2005-10-22 15:28:04 +02:00
2008-08-02 14:12:04 +02:00
final HashSet < String > unknownUrlBuffer = new HashSet < String > ( ) ;
final HashSet < String > importedUrlBuffer = new HashSet < String > ( ) ;
2006-02-16 14:07:01 +01:00
2005-10-22 15:28:04 +02:00
// iterate over all words from import db
2009-05-28 16:26:05 +02:00
//Iterator importWordHashIterator = this.importWordIndex.wordHashes(this.wordChunkStartHash, CrawlSwitchboard.RL_WORDFILES, false);
2009-05-29 12:03:35 +02:00
Iterator < ReferenceContainer < WordReference > > indexContainerIterator = this . importWordIndex . termIndex ( ) . references ( this . wordChunkStartHash , false , 100 , false ) . iterator ( ) ;
2006-07-26 13:21:51 +02:00
while ( ! isAborted ( ) & & indexContainerIterator . hasNext ( ) ) {
2005-10-22 15:28:04 +02:00
2008-08-02 14:12:04 +02:00
final TreeSet < String > entityUrls = new TreeSet < String > ( ) ;
2009-04-15 08:34:27 +02:00
ReferenceContainer < WordReference > newContainer = null ;
2005-10-22 15:28:04 +02:00
try {
2006-02-02 17:46:58 +01:00
this . wordCounter + + ;
2008-06-06 18:01:27 +02:00
newContainer = indexContainerIterator . next ( ) ;
2009-04-03 15:23:45 +02:00
this . wordHash = newContainer . getTermHash ( ) ;
2005-10-22 15:28:04 +02:00
2006-02-16 14:07:01 +01:00
// loop throug the entities of the container and get the
2006-03-17 22:52:36 +01:00
// urlhash
2009-04-15 08:34:27 +02:00
final Iterator < WordReference > importWordIdxEntries = newContainer . entries ( ) ;
2009-03-02 00:58:14 +01:00
Reference importWordIdxEntry ;
2005-10-22 15:28:04 +02:00
while ( importWordIdxEntries . hasNext ( ) ) {
// testing if import process was aborted
if ( isAborted ( ) ) break ;
// getting next word index entry
2008-06-06 18:01:27 +02:00
importWordIdxEntry = importWordIdxEntries . next ( ) ;
2009-04-07 11:34:41 +02:00
final String urlHash = importWordIdxEntry . metadataHash ( ) ;
2006-03-17 22:52:36 +01:00
entityUrls . add ( urlHash ) ;
2005-10-22 15:28:04 +02:00
}
2006-02-16 14:07:01 +01:00
2008-08-02 14:12:04 +02:00
final Iterator < String > urlIter = entityUrls . iterator ( ) ;
2006-03-17 22:52:36 +01:00
while ( urlIter . hasNext ( ) ) {
if ( isAborted ( ) ) break ;
2008-08-02 14:12:04 +02:00
final String urlHash = urlIter . next ( ) ;
2006-03-17 22:52:36 +01:00
2008-05-03 11:06:00 +02:00
if ( ! importedUrlBuffer . contains ( urlHash ) ) {
if ( unknownUrlBuffer . contains ( urlHash ) ) {
// url known as unknown
2006-03-17 22:52:36 +01:00
unknownUrlBuffer . add ( urlHash ) ;
notBoundEntryCounter + + ;
newContainer . remove ( urlHash ) ;
continue ;
2008-08-02 15:57:00 +02:00
}
// we need to import the url
// getting the url entry
2009-10-11 02:12:19 +02:00
final URIMetadataRow urlEntry = this . importWordIndex . urlMetadata ( ) . load ( urlHash , null , 0 ) ;
2008-08-02 15:57:00 +02:00
if ( urlEntry ! = null ) {
/* write it into the home url db */
2009-05-29 12:03:35 +02:00
homeWordIndex . urlMetadata ( ) . store ( urlEntry ) ;
2008-08-02 15:57:00 +02:00
importedUrlBuffer . add ( urlHash ) ;
this . urlCounter + + ;
if ( this . urlCounter % 500 = = 0 ) {
this . log . logFine ( this . urlCounter + " URLs processed so far. " ) ;
2008-05-03 11:06:00 +02:00
}
2008-08-02 15:57:00 +02:00
} else {
unknownUrlBuffer . add ( urlHash ) ;
notBoundEntryCounter + + ;
newContainer . remove ( urlHash ) ;
continue ;
2006-03-17 22:52:36 +01:00
}
2008-05-03 11:06:00 +02:00
//} else {
// already known url
2006-03-17 22:52:36 +01:00
}
2006-03-18 00:39:10 +01:00
this . entryCounter + + ;
2006-02-16 14:07:01 +01:00
}
2005-10-22 15:28:04 +02:00
// testing if import process was aborted
if ( isAborted ( ) ) break ;
// importing entity container to home db
2009-12-02 01:37:59 +01:00
if ( ! newContainer . isEmpty ( ) ) { homeWordIndex . termIndex ( ) . add ( newContainer ) ; }
2006-03-17 22:52:36 +01:00
2005-10-22 15:28:04 +02:00
// delete complete index entity file
2009-05-29 12:03:35 +02:00
this . importWordIndex . termIndex ( ) . delete ( this . wordHash ) ;
2005-10-22 15:28:04 +02:00
// print out some statistical information
2006-02-16 14:07:01 +01:00
if ( this . entryCounter % 500 = = 0 ) {
this . log . logFine ( this . entryCounter + " word entries and " + this . wordCounter + " word entities processed so far. " ) ;
}
2006-03-17 22:52:36 +01:00
2006-02-02 17:46:58 +01:00
if ( this . wordCounter % 500 = = 0 ) {
this . wordChunkEndHash = this . wordHash ;
this . wordChunkEnd = System . currentTimeMillis ( ) ;
2008-08-02 14:12:04 +02:00
final long duration = this . wordChunkEnd - this . wordChunkStart ;
2006-02-02 17:46:58 +01:00
this . log . logInfo ( this . wordCounter + " word entities imported " +
" [ " + this . wordChunkStartHash + " .. " + this . wordChunkEndHash + " ] " +
2006-01-31 13:30:24 +01:00
this . getProcessingStatusPercent ( ) + " % \ n " +
2005-10-22 15:28:04 +02:00
" Speed: " + 500 * 1000 / duration + " word entities/s " +
2009-01-30 16:33:00 +01:00
" | Elapsed time: " + DateFormatter . formatInterval ( getElapsedTime ( ) ) +
" | Estimated time: " + DateFormatter . formatInterval ( getEstimatedTime ( ) ) + " \ n " +
2009-06-14 00:59:54 +02:00
" Home Words = " + homeWordIndex . termIndex ( ) . sizesMax ( ) +
" | Import Words = " + this . importWordIndex . termIndex ( ) . sizesMax ( ) ) ;
2006-02-02 17:46:58 +01:00
this . wordChunkStart = this . wordChunkEnd ;
this . wordChunkStartHash = this . wordChunkEndHash ;
2005-10-22 15:28:04 +02:00
}
2008-08-02 14:12:04 +02:00
} catch ( final Exception e ) {
2006-02-02 17:46:58 +01:00
this . log . logSevere ( " Import of word entity ' " + this . wordHash + " ' failed. " , e ) ;
2005-10-22 15:28:04 +02:00
} finally {
2006-02-02 17:46:58 +01:00
if ( newContainer ! = null ) newContainer . clear ( ) ;
2005-10-22 15:28:04 +02:00
}
2006-03-18 00:39:10 +01:00
2006-07-26 13:21:51 +02:00
if ( ! indexContainerIterator . hasNext ( ) ) {
2006-03-18 00:39:10 +01:00
// We may not be finished yet, try to get the next chunk of wordHashes
2009-05-29 12:03:35 +02:00
final TreeSet < ReferenceContainer < WordReference > > containers = this . importWordIndex . termIndex ( ) . references ( this . wordHash , false , 100 , false ) ;
2006-07-26 13:21:51 +02:00
indexContainerIterator = containers . iterator ( ) ;
2006-03-18 00:39:10 +01:00
// Make sure we don't get the same wordhash twice, but don't skip a word
2009-04-03 15:23:45 +02:00
if ( ( indexContainerIterator . hasNext ( ) ) & & ( ! this . wordHash . equals ( ( indexContainerIterator . next ( ) ) . getTermHash ( ) ) ) ) {
2006-07-26 13:21:51 +02:00
indexContainerIterator = containers . iterator ( ) ;
2006-03-18 00:39:10 +01:00
}
}
2005-10-22 15:28:04 +02:00
}
2009-06-14 00:59:54 +02:00
this . log . logInfo ( " Home word index contains " + homeWordIndex . termIndex ( ) . sizesMax ( ) + " words and " + homeWordIndex . urlMetadata ( ) . size ( ) + " URLs. " ) ;
this . log . logInfo ( " Import word index contains " + this . importWordIndex . termIndex ( ) . sizesMax ( ) + " words and " + this . importWordIndex . urlMetadata ( ) . size ( ) + " URLs. " ) ;
2008-08-02 14:12:04 +02:00
} catch ( final Exception e ) {
2005-10-22 15:28:04 +02:00
this . log . logSevere ( " Database import failed. " , e ) ;
2009-11-05 21:28:37 +01:00
Log . logException ( e ) ;
2005-10-22 15:28:04 +02:00
this . error = e . toString ( ) ;
} finally {
2006-02-02 17:46:58 +01:00
this . log . logInfo ( " Import process finished. " ) ;
2008-08-02 14:12:04 +02:00
if ( this . importWordIndex ! = null ) try { this . importWordIndex . close ( ) ; } catch ( final Exception e ) { }
2005-10-22 15:28:04 +02:00
}
}
2006-01-31 13:30:24 +01:00
2005-10-22 15:28:04 +02:00
}