2005-05-17 10:25:04 +02:00
// plasmaWordIndexCache.java
// -------------------------
// part of YACY
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2005
2005-11-04 14:41:51 +01:00
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
2005-05-17 10:25:04 +02:00
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this softare or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follows this copyright notice here, but changes must not be
// done inside the copyright notive above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
package de.anomic.plasma ;
2005-07-06 16:48:41 +02:00
import java.io.File ;
import java.io.IOException ;
import java.util.Iterator ;
import java.util.Map ;
import java.util.TreeMap ;
2005-09-26 11:39:54 +02:00
import de.anomic.kelondro.kelondroArray ;
2005-07-06 16:48:41 +02:00
import de.anomic.kelondro.kelondroException ;
import de.anomic.kelondro.kelondroMScoreCluster ;
import de.anomic.kelondro.kelondroMergeIterator ;
import de.anomic.kelondro.kelondroRecords ;
2005-06-09 12:34:20 +02:00
import de.anomic.server.logging.serverLog ;
2005-05-17 10:25:04 +02:00
import de.anomic.yacy.yacySeedDB ;
public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
2005-11-04 14:41:51 +01:00
2005-05-17 10:25:04 +02:00
// environment constants
2005-07-18 15:32:44 +02:00
// file name (below databaseRoot) of the cache dump written by dump() and read back by restore()
private static final String indexArrayFileName = " indexDump1.array " ;
2005-05-18 23:52:17 +02:00
// old and new name of the singleton file; the constructor renames the old file to the new name
private static final String oldSingletonFileName = " indexSingletons0.db " ;
private static final String newSingletonFileName = " indexAssortment001.db " ;
2005-05-23 01:59:52 +02:00
// directory below databaseRoot that is handed to the assortment cluster
private static final String indexAssortmentClusterPath = " ACLUSTER " ;
2005-07-20 15:03:41 +02:00
// passed to the assortment cluster constructor; also used as a score threshold in flushFromMem()
private static final int assortmentCount = 64 ;
2005-07-18 15:32:44 +02:00
// flushFromMem() flushes the highest-scored word once its score exceeds this value
private static final int ramCacheLimit = 200 ;
2005-11-04 14:41:51 +01:00
2005-05-17 10:25:04 +02:00
// class variables
2005-09-26 11:39:54 +02:00
// root directory for the dump file and the assortment cluster directory
private final File databaseRoot ;
// index that receives containers the assortment cluster hands back
private final plasmaWordIndexInterface backend ;
// RAM cache: word hash (String) -> plasmaWordIndexEntryContainer
private final TreeMap cache ;
// per word hash: score that is increased by the number of entries added to the cache
private final kelondroMScoreCluster hashScore ;
// per word hash: update time, stored via intTime() as seconds relative to startTime
private final kelondroMScoreCluster hashDate ;
2005-10-10 11:28:28 +02:00
// creation time of this cache in milliseconds; reference point for intTime()/longTime()
private long startTime ;
private int maxWordsLow , maxWordsHigh ; // word-count limits used by addEntries() to throttle low- and high-priority callers
2005-09-26 11:39:54 +02:00
private final serverLog log ;
private final plasmaWordIndexAssortmentCluster assortmentCluster ;
2005-05-31 19:39:14 +02:00
private int assortmentBufferSize ; // in kb; passed to the assortment cluster constructor
2005-09-26 11:39:54 +02:00
// background thread that keeps calling flushFromMem()
private final flush flushThread ;
2005-05-17 10:25:04 +02:00
// calculated constants
/**
 * Boundary keys: the initial literal followed by commonHashLength times
 * '-' (minKey) respectively 'z' (maxKey).
 * Presumably the lowest and highest possible word hash -- confirm with users.
 */
private static String minKey, maxKey;
static {
    maxKey = " ";
    minKey = " ";
    for (int i = 0; i < yacySeedDB.commonHashLength; i++) {
        maxKey += 'z';
        // the '-' padding belongs to minKey; it was appended to maxKey before
        minKey += '-';
    }
}
2005-05-31 19:39:14 +02:00
/**
 * Sets up the index cache in front of the given back-end.
 * Renames legacy singleton files, creates the assortment cluster directory,
 * reads the dump of the previous session and starts the flush thread.
 *
 * @param databaseRoot       directory holding the dump file and the cluster directory
 * @param backend            index that takes what the assortment cluster hands back
 * @param assortmentbufferkb buffer size handed to the assortment cluster
 * @param log                logger used by this cache
 */
public plasmaWordIndexCache(File databaseRoot, plasmaWordIndexInterface backend, int assortmentbufferkb, serverLog log) {
    // migration step 1: old singleton file gets the new file name
    final File legacyFile = new File(databaseRoot, oldSingletonFileName);
    final File renamedFile = new File(databaseRoot, newSingletonFileName);
    if (legacyFile.exists() && !renamedFile.exists()) {
        legacyFile.renameTo(renamedFile);
    }

    // the assortment cluster lives in its own directory
    final File clusterDir = new File(databaseRoot, indexAssortmentClusterPath);
    if (!clusterDir.exists()) {
        clusterDir.mkdirs();
    }

    // migration step 2: move the renamed singleton file into the cluster directory
    final File clusterFile = new File(clusterDir, newSingletonFileName);
    if (renamedFile.exists() && !clusterFile.exists()) {
        renamedFile.renameTo(clusterFile);
    }

    // the flush thread is created here but started only after the restore
    this.flushThread = new flush();

    // state of the cache; entries that leave the cache go to the cluster or the back-end
    this.databaseRoot = databaseRoot;
    this.assortmentBufferSize = assortmentbufferkb;
    this.cache = new TreeMap();
    this.hashScore = new kelondroMScoreCluster();
    this.hashDate = new kelondroMScoreCluster();
    this.startTime = System.currentTimeMillis();
    this.maxWordsLow = 8000;
    this.maxWordsHigh = 10000;
    this.backend = backend;
    this.log = log;
    this.assortmentCluster = new plasmaWordIndexAssortmentCluster(clusterDir, assortmentCount, assortmentBufferSize, log);

    // bring back what the last session dumped
    try {
        restore();
    } catch (IOException ioe) {
        log.logSevere(" unable to restore cache dump: " + ioe.getMessage(), ioe);
    }

    // from now on the cache is flushed permanently in the background
    this.flushThread.start();
}
private void dump ( int waitingSeconds ) throws IOException {
2005-08-30 23:10:39 +02:00
log . logConfig ( " creating dump for index cache, " + cache . size ( ) + " words (and much more urls) " ) ;
2005-07-18 15:32:44 +02:00
File indexDumpFile = new File ( databaseRoot , indexArrayFileName ) ;
if ( indexDumpFile . exists ( ) ) indexDumpFile . delete ( ) ;
2005-08-30 11:07:42 +02:00
kelondroArray dumpArray = null ;
try {
2005-11-04 14:41:51 +01:00
dumpArray = new kelondroArray ( indexDumpFile , plasmaWordIndexAssortment . bufferStructureBasis , 0 ) ;
long startTime = System . currentTimeMillis ( ) ;
long messageTime = System . currentTimeMillis ( ) + 5000 ;
long wordsPerSecond = 0 , wordcount = 0 , urlcount = 0 ;
synchronized ( cache ) {
Iterator i = cache . entrySet ( ) . iterator ( ) ;
Map . Entry entry ;
String wordHash ;
plasmaWordIndexEntryContainer container ;
long updateTime ;
plasmaWordIndexEntry wordEntry ;
byte [ ] [ ] row = new byte [ 5 ] [ ] ;
while ( i . hasNext ( ) ) {
// get entries
entry = ( Map . Entry ) i . next ( ) ;
wordHash = ( String ) entry . getKey ( ) ;
updateTime = getUpdateTime ( wordHash ) ;
container = ( plasmaWordIndexEntryContainer ) entry . getValue ( ) ;
// put entries on stack
if ( container ! = null ) {
Iterator ci = container . entries ( ) ;
while ( ci . hasNext ( ) ) {
wordEntry = ( plasmaWordIndexEntry ) ci . next ( ) ;
row [ 0 ] = wordHash . getBytes ( ) ;
row [ 1 ] = kelondroRecords . long2bytes ( container . size ( ) , 4 ) ;
row [ 2 ] = kelondroRecords . long2bytes ( updateTime , 8 ) ;
row [ 3 ] = wordEntry . getUrlHash ( ) . getBytes ( ) ;
row [ 4 ] = wordEntry . toEncodedForm ( true ) . getBytes ( ) ;
dumpArray . set ( ( int ) urlcount + + , row ) ;
}
}
wordcount + + ;
i . remove ( ) ; // free some mem
// write a log
if ( System . currentTimeMillis ( ) > messageTime ) {
// System.gc(); // for better statistic
wordsPerSecond = wordcount * 1000 / ( 1 + System . currentTimeMillis ( ) - startTime ) ;
log . logInfo ( " dumping status: " + wordcount + " words done, " + ( cache . size ( ) / ( wordsPerSecond + 1 ) ) + " seconds remaining, free mem = " + ( Runtime . getRuntime ( ) . freeMemory ( ) / 1024 / 1024 ) + " MB " ) ;
messageTime = System . currentTimeMillis ( ) + 5000 ;
2005-07-18 15:32:44 +02:00
}
}
}
2005-11-04 14:41:51 +01:00
dumpArray . close ( ) ;
dumpArray = null ;
log . logConfig ( " dumped " + urlcount + " word/URL relations in " + ( ( System . currentTimeMillis ( ) - startTime ) / 1000 ) + " seconds " ) ;
2005-08-30 11:07:42 +02:00
} finally {
if ( dumpArray ! = null ) try { dumpArray . close ( ) ; } catch ( Exception e ) { }
}
2005-07-18 15:32:44 +02:00
}
2005-11-04 14:41:51 +01:00
2005-07-18 15:32:44 +02:00
private long restore ( ) throws IOException {
File indexDumpFile = new File ( databaseRoot , indexArrayFileName ) ;
if ( ! ( indexDumpFile . exists ( ) ) ) return 0 ;
kelondroArray dumpArray = new kelondroArray ( indexDumpFile ) ;
2005-09-13 18:29:59 +02:00
log . logConfig ( " restore array dump of index cache, " + dumpArray . size ( ) + " word/URL relations " ) ;
2005-07-18 15:32:44 +02:00
long startTime = System . currentTimeMillis ( ) ;
long messageTime = System . currentTimeMillis ( ) + 5000 ;
long urlCount = 0 , urlsPerSecond = 0 ;
try {
synchronized ( cache ) {
int i = dumpArray . size ( ) ;
String wordHash ;
plasmaWordIndexEntryContainer container ;
long creationTime ;
plasmaWordIndexEntry wordEntry ;
byte [ ] [ ] row ;
Runtime rt = Runtime . getRuntime ( ) ;
while ( i - - > 0 ) {
// get out one entry
row = dumpArray . get ( i ) ;
2005-09-11 05:54:52 +02:00
if ( ( row [ 0 ] = = null ) | | ( row [ 1 ] = = null ) | | ( row [ 2 ] = = null ) | | ( row [ 3 ] = = null ) | | ( row [ 4 ] = = null ) ) continue ;
2005-07-18 15:32:44 +02:00
wordHash = new String ( row [ 0 ] ) ;
creationTime = kelondroRecords . bytes2long ( row [ 2 ] ) ;
wordEntry = new plasmaWordIndexEntry ( new String ( row [ 3 ] ) , new String ( row [ 4 ] ) ) ;
// store to cache
addEntry ( wordHash , wordEntry , creationTime ) ;
urlCount + + ;
// protect against memory shortage
while ( rt . freeMemory ( ) < 1000000 ) { flushFromMem ( ) ; java . lang . System . gc ( ) ; }
// write a log
if ( System . currentTimeMillis ( ) > messageTime ) {
System . gc ( ) ; // for better statistic
urlsPerSecond = 1 + urlCount * 1000 / ( 1 + System . currentTimeMillis ( ) - startTime ) ;
log . logInfo ( " restoring status: " + urlCount + " urls done, " + ( i / urlsPerSecond ) + " seconds remaining, free mem = " + ( Runtime . getRuntime ( ) . freeMemory ( ) / 1024 / 1024 ) + " MB " ) ;
messageTime = System . currentTimeMillis ( ) + 5000 ;
}
}
}
2005-11-04 14:41:51 +01:00
2005-07-18 15:32:44 +02:00
dumpArray . close ( ) ;
2005-08-30 23:10:39 +02:00
log . logConfig ( " restored " + cache . size ( ) + " words in " + ( ( System . currentTimeMillis ( ) - startTime ) / 1000 ) + " seconds " ) ;
2005-07-18 15:32:44 +02:00
} catch ( kelondroException e ) {
// restore failed
2005-08-30 23:32:59 +02:00
log . logSevere ( " restore of indexCache array dump failed: " + e . getMessage ( ) , e ) ;
2005-08-30 11:07:42 +02:00
} finally {
if ( dumpArray ! = null ) try { dumpArray . close ( ) ; } catch ( Exception e ) { }
2005-05-17 10:25:04 +02:00
}
return urlCount ;
}
2005-11-04 14:41:51 +01:00
2005-08-09 22:43:37 +02:00
public void intermission ( long pause ) {
2005-11-04 14:41:51 +01:00
flushThread . intermission ( pause ) ;
2005-08-09 22:43:37 +02:00
}
2005-05-17 10:25:04 +02:00
// cache settings
2005-11-04 14:41:51 +01:00
2005-05-17 10:25:04 +02:00
/**
 * @return the score that hashScore holds for its maximum object
 */
public int maxURLinWordCache() {
    final int topScore = hashScore.getScore(hashScore.getMaxObject());
    return topScore;
}
/**
 * @return the number of words currently held in the RAM cache
 */
public int wordCacheRAMSize() {
    final int cachedWords = cache.size();
    return cachedWords;
}
2005-11-04 14:41:51 +01:00
2005-10-10 11:28:28 +02:00
public void setMaxWords ( int maxWordsLow , int maxWordsHigh ) {
this . maxWordsLow = maxWordsLow ;
this . maxWordsHigh = maxWordsHigh ;
2005-05-17 10:25:04 +02:00
}
2005-11-04 14:41:51 +01:00
2005-05-22 15:27:54 +02:00
public int [ ] assortmentsSizes ( ) {
return assortmentCluster . sizes ( ) ;
2005-05-18 23:52:17 +02:00
}
2005-11-04 14:41:51 +01:00
2005-09-22 22:01:26 +02:00
public int [ ] assortmentsCacheChunkSizeAvg ( ) {
2005-09-20 12:10:34 +02:00
return assortmentCluster . cacheChunkSizeAvg ( ) ;
}
2005-11-04 14:41:51 +01:00
2005-09-20 12:10:34 +02:00
/**
 * @return the cache fill status values reported by the assortment cluster
 */
public int[] assortmentsCacheFillStatusCml() {
    final int[] fillStatus = assortmentCluster.cacheFillStatusCml();
    return fillStatus;
}
2005-11-04 14:41:51 +01:00
2005-05-17 10:25:04 +02:00
public int size ( ) {
2005-05-22 15:27:54 +02:00
return java . lang . Math . max ( assortmentCluster . sizeTotal ( ) , java . lang . Math . max ( backend . size ( ) , cache . size ( ) ) ) ;
2005-05-17 10:25:04 +02:00
}
2005-11-04 14:41:51 +01:00
2005-05-17 10:25:04 +02:00
public Iterator wordHashes ( String startWordHash , boolean up ) {
// here we merge 3 databases into one view:
// - the RAM Cache
2005-05-31 19:39:14 +02:00
// - the assortmentCluster File Cache
2005-05-17 10:25:04 +02:00
// - the backend
if ( ! ( up ) ) throw new RuntimeException ( " plasmaWordIndexCache.wordHashes can only count up " ) ;
return new kelondroMergeIterator (
new kelondroMergeIterator (
2005-08-12 16:06:47 +02:00
cache . tailMap ( startWordHash ) . keySet ( ) . iterator ( ) ,
2005-05-22 15:27:54 +02:00
assortmentCluster . hashConjunction ( startWordHash , true ) ,
2005-05-17 10:25:04 +02:00
true ) ,
backend . wordHashes ( startWordHash , true ) ,
true ) ;
}
2005-11-04 14:41:51 +01:00
2005-09-26 11:39:54 +02:00
private final class flush extends Thread {
2005-05-31 19:39:14 +02:00
boolean terminate , pause ;
2005-11-04 14:41:51 +01:00
long intermission ;
2005-05-31 19:39:14 +02:00
public flush ( ) {
terminate = false ;
2005-08-09 22:43:37 +02:00
intermission = 0 ;
2005-10-05 12:45:33 +02:00
this . setName ( this . getClass ( ) . getName ( ) ) ;
2005-05-31 19:39:14 +02:00
}
2005-08-09 22:43:37 +02:00
2005-11-04 14:41:51 +01:00
public void intermission ( long pause ) {
this . intermission = System . currentTimeMillis ( ) + pause ;
}
2005-08-09 22:43:37 +02:00
2005-05-31 19:39:14 +02:00
public void run ( ) {
String nextHash ;
2005-06-19 15:37:17 +02:00
Runtime rt = Runtime . getRuntime ( ) ;
2005-08-12 01:33:19 +02:00
long pausetime ;
2005-05-31 19:39:14 +02:00
while ( ! terminate ) {
2005-11-04 14:41:51 +01:00
if ( intermission > 0 ) {
if ( this . intermission > System . currentTimeMillis ( ) ) {
try { this . sleep ( this . intermission - System . currentTimeMillis ( ) ) ; } catch ( InterruptedException e ) { }
}
this . intermission = 0 ;
}
2005-05-31 19:39:14 +02:00
if ( pause ) {
try { this . sleep ( 300 ) ; } catch ( InterruptedException e ) { }
} else {
2005-06-01 16:24:25 +02:00
flushFromMem ( ) ;
2005-08-12 01:33:19 +02:00
try {
2005-10-10 11:28:28 +02:00
pausetime = 1 + java . lang . Math . min ( 1000 , 5 * maxWordsHigh / ( cache . size ( ) + 1 ) ) ;
2005-08-12 01:33:19 +02:00
if ( cache . size ( ) = = 0 ) pausetime = 2000 ;
this . sleep ( pausetime ) ;
2005-06-19 15:37:17 +02:00
} catch ( InterruptedException e ) { }
2005-05-31 19:39:14 +02:00
}
}
}
2005-11-04 14:41:51 +01:00
2005-05-31 19:39:14 +02:00
public void pause ( ) {
pause = true ;
}
2005-11-04 14:41:51 +01:00
2005-05-31 19:39:14 +02:00
public void proceed ( ) {
pause = false ;
}
2005-11-04 14:41:51 +01:00
2005-05-31 19:39:14 +02:00
public void terminate ( ) {
terminate = true ;
}
}
2005-11-04 14:41:51 +01:00
2005-06-01 16:24:25 +02:00
private void flushFromMem ( ) {
// select appropriate hash
// we have 2 different methods to find a good hash:
// - the oldest entry in the cache
// - the entry with maximum count
if ( cache . size ( ) = = 0 ) return ;
flushThread . pause ( ) ;
try {
String hash = ( String ) hashScore . getMaxObject ( ) ;
2005-07-18 00:25:50 +02:00
if ( hash = = null ) {
flushThread . proceed ( ) ;
return ;
}
int count = hashScore . getMaxScore ( ) ;
long time = longTime ( hashDate . getScore ( hash ) ) ;
2005-07-18 15:32:44 +02:00
if ( ( count > ramCacheLimit ) | |
2005-07-20 02:39:06 +02:00
( ( count > assortmentCount ) & & ( System . currentTimeMillis ( ) - time > 10000 ) ) ) {
2005-06-01 16:24:25 +02:00
// flush high-score entries
2005-07-20 02:39:06 +02:00
flushFromMem ( hash ) ;
2005-06-01 16:24:25 +02:00
} else {
// flush oldest entries
hash = ( String ) hashDate . getMinObject ( ) ;
2005-07-20 02:39:06 +02:00
flushFromMem ( hash ) ;
2005-06-01 16:24:25 +02:00
}
} catch ( Exception e ) {
2005-08-30 23:32:59 +02:00
log . logSevere ( " flushFromMem: " + e . getMessage ( ) , e ) ;
2005-06-01 16:24:25 +02:00
}
flushThread . proceed ( ) ;
}
2005-11-04 14:41:51 +01:00
2005-07-20 02:39:06 +02:00
private int flushFromMem ( String key ) {
2005-05-17 10:25:04 +02:00
// this method flushes indexes out from the ram to the disc.
plasmaWordIndexEntryContainer container = null ;
long time ;
2005-11-04 14:41:51 +01:00
synchronized ( cache ) {
2005-05-17 10:25:04 +02:00
// get the container
container = ( plasmaWordIndexEntryContainer ) cache . get ( key ) ;
if ( container = = null ) return 0 ; // flushing of nonexisting key
2005-05-31 19:39:14 +02:00
time = getUpdateTime ( key ) ;
2005-05-17 10:25:04 +02:00
// remove it from the cache
cache . remove ( key ) ;
2005-11-04 14:41:51 +01:00
hashScore . deleteScore ( key ) ;
2005-05-31 19:39:14 +02:00
hashDate . deleteScore ( key ) ;
2005-11-04 14:41:51 +01:00
}
2005-05-18 23:52:17 +02:00
2005-07-17 23:22:18 +02:00
// now decide where to flush that container
2005-10-11 09:06:33 +02:00
//if (container.size() <= assortmentCluster.clusterCapacity) {
2005-07-17 23:22:18 +02:00
// this fits into the assortments
plasmaWordIndexEntryContainer feedback = assortmentCluster . storeTry ( key , container ) ;
if ( feedback = = null ) {
return container . size ( ) ;
} else {
// *** should care about another option here ***
2005-10-10 11:28:28 +02:00
return backend . addEntries ( feedback , time , true ) ;
2005-07-17 23:22:18 +02:00
}
2005-10-11 09:06:33 +02:00
/ *
2005-07-17 23:22:18 +02:00
} else {
// store to back-end; this should be a rare case
2005-10-10 11:28:28 +02:00
return backend . addEntries ( container , time , true ) ;
2005-07-17 23:22:18 +02:00
}
2005-10-11 09:06:33 +02:00
* * /
2005-07-17 23:22:18 +02:00
}
2005-11-04 14:41:51 +01:00
2005-05-31 19:39:14 +02:00
/**
 * Converts an absolute time into whole seconds since startTime.
 *
 * @param longTime absolute time in milliseconds
 * @return seconds relative to startTime
 */
private int intTime(long longTime) {
    final long elapsedMillis = longTime - startTime;
    return (int) (elapsedMillis / 1000);
}
2005-11-04 14:41:51 +01:00
2005-05-31 19:39:14 +02:00
/**
 * Converts seconds since startTime back into an absolute time.
 *
 * @param intTime seconds relative to startTime
 * @return absolute time in milliseconds
 */
private long longTime(int intTime) {
    final long offsetMillis = 1000L * intTime;
    return startTime + offsetMillis;
}
2005-11-04 14:41:51 +01:00
2005-10-23 19:50:27 +02:00
private boolean flushFromAssortmentCluster ( String key , long maxTime ) {
2005-11-04 14:41:51 +01:00
// this should only be called if the assortment shall be deleted or returned in an index entity
2005-10-23 19:50:27 +02:00
maxTime = 8 * maxTime / 10 ; // reserve time for later adding to backend
plasmaWordIndexEntryContainer container = assortmentCluster . removeFromAll ( key , maxTime ) ;
2005-05-22 15:27:54 +02:00
if ( container = = null ) {
2005-05-17 10:25:04 +02:00
return false ;
} else {
2005-05-22 15:27:54 +02:00
// we have a non-empty entry-container
2005-05-17 10:25:04 +02:00
// integrate it to the backend
2005-10-10 11:28:28 +02:00
return backend . addEntries ( container , container . updated ( ) , true ) > 0 ;
2005-05-17 10:25:04 +02:00
}
}
2005-10-23 19:50:27 +02:00
public plasmaWordIndexEntity getIndex ( String wordHash , boolean deleteIfEmpty , long maxTime ) {
2005-05-31 19:39:14 +02:00
flushThread . pause ( ) ;
2005-10-23 19:50:27 +02:00
long start = System . currentTimeMillis ( ) ;
2005-07-20 02:39:06 +02:00
flushFromMem ( wordHash ) ;
2005-10-23 19:50:27 +02:00
if ( maxTime < 0 ) {
flushFromAssortmentCluster ( wordHash , - 1 ) ;
} else {
long remaining = maxTime - ( System . currentTimeMillis ( ) - start ) ;
if ( remaining > 0 ) flushFromAssortmentCluster ( wordHash , remaining ) ;
}
2005-05-31 19:39:14 +02:00
flushThread . proceed ( ) ;
2005-10-23 19:50:27 +02:00
long r = maxTime - ( System . currentTimeMillis ( ) - start ) ;
2005-11-04 14:41:51 +01:00
return backend . getIndex ( wordHash , deleteIfEmpty , ( r < 0 ) ? 0 : r ) ;
2005-05-17 10:25:04 +02:00
}
2005-11-04 14:41:51 +01:00
2005-05-31 19:39:14 +02:00
public long getUpdateTime ( String wordHash ) {
plasmaWordIndexEntryContainer entries = ( plasmaWordIndexEntryContainer ) cache . get ( wordHash ) ;
if ( entries = = null ) return 0 ;
return entries . updated ( ) ;
/ *
Long time = new Long ( longTime ( hashDate . getScore ( wordHash ) ) ) ;
2005-05-17 10:25:04 +02:00
if ( time = = null ) return 0 ;
return time . longValue ( ) ;
2005-05-31 19:39:14 +02:00
* /
2005-05-17 10:25:04 +02:00
}
2005-11-04 14:41:51 +01:00
2005-05-17 10:25:04 +02:00
public void deleteIndex ( String wordHash ) {
2005-05-31 19:39:14 +02:00
flushThread . pause ( ) ;
2005-05-17 10:25:04 +02:00
synchronized ( cache ) {
cache . remove ( wordHash ) ;
hashScore . deleteScore ( wordHash ) ;
2005-05-31 19:39:14 +02:00
hashDate . deleteScore ( wordHash ) ;
2005-05-17 10:25:04 +02:00
}
2005-10-23 19:50:27 +02:00
assortmentCluster . removeFromAll ( wordHash , - 1 ) ;
2005-11-04 14:41:51 +01:00
backend . deleteIndex ( wordHash ) ;
2005-05-31 19:39:14 +02:00
flushThread . proceed ( ) ;
2005-05-17 10:25:04 +02:00
}
public synchronized int removeEntries ( String wordHash , String [ ] urlHashes , boolean deleteComplete ) {
2005-05-31 19:39:14 +02:00
flushThread . pause ( ) ;
2005-07-20 02:39:06 +02:00
flushFromMem ( wordHash ) ;
2005-10-23 19:50:27 +02:00
flushFromAssortmentCluster ( wordHash , - 1 ) ;
2005-05-31 19:39:14 +02:00
int removed = backend . removeEntries ( wordHash , urlHashes , deleteComplete ) ;
flushThread . proceed ( ) ;
return removed ;
2005-05-17 10:25:04 +02:00
}
2005-11-04 14:41:51 +01:00
2005-10-11 09:06:33 +02:00
public int addEntries ( plasmaWordIndexEntryContainer container , long updateTime , boolean highPriority ) {
2005-07-20 02:39:06 +02:00
// this puts the entries into the cache, not into the assortment directly
2005-11-04 14:41:51 +01:00
2005-10-11 09:06:33 +02:00
int added = 0 ;
2005-08-12 01:33:19 +02:00
// check cache space
if ( cache . size ( ) > 0 ) try {
2005-08-14 02:57:30 +02:00
// pause to get space in the cache (while it is flushed)
2005-10-10 11:28:28 +02:00
long pausetime ;
if ( highPriority ) {
if ( cache . size ( ) + 1000 > = this . maxWordsHigh ) Thread . sleep ( java . lang . Math . min ( 1000 , cache . size ( ) - this . maxWordsHigh + 1000 ) ) ;
pausetime = java . lang . Math . min ( 10 , 2 * cache . size ( ) / ( maxWordsHigh + 1 ) ) ;
} else {
if ( cache . size ( ) + 1000 > = this . maxWordsLow ) Thread . sleep ( java . lang . Math . min ( 1000 , cache . size ( ) - this . maxWordsLow + 1000 ) ) ;
pausetime = java . lang . Math . min ( 10 , 2 * cache . size ( ) / ( maxWordsLow + 1 ) ) ;
}
2005-11-04 14:41:51 +01:00
2005-08-12 01:33:19 +02:00
// slow down if we reach cache limit
Thread . sleep ( pausetime ) ;
} catch ( InterruptedException e ) { }
2005-11-04 14:41:51 +01:00
2005-10-09 06:43:07 +02:00
//serverLog.logDebug("PLASMA INDEXING", "addEntryToIndexMem: cache.size=" + cache.size() + "; hashScore.size=" + hashScore.size());
2005-11-04 14:41:51 +01:00
2005-10-09 06:43:07 +02:00
// put new words into cache
String wordHash = container . wordHash ( ) ;
2005-11-04 14:41:51 +01:00
2005-10-11 09:06:33 +02:00
synchronized ( cache ) {
// stop flushing now for one moment
flushThread . pause ( ) ;
2005-11-04 14:41:51 +01:00
2005-10-11 09:06:33 +02:00
// put container into cache
2005-10-09 06:43:07 +02:00
plasmaWordIndexEntryContainer entries = ( plasmaWordIndexEntryContainer ) cache . get ( wordHash ) ; // null pointer exception? wordhash != null! must be cache==null
if ( entries = = null ) entries = new plasmaWordIndexEntryContainer ( wordHash ) ;
2005-05-17 10:25:04 +02:00
added = entries . add ( container ) ;
if ( added > 0 ) {
cache . put ( wordHash , entries ) ;
hashScore . addScore ( wordHash , added ) ;
2005-05-31 19:39:14 +02:00
hashDate . setScore ( wordHash , intTime ( updateTime ) ) ;
2005-05-17 10:25:04 +02:00
}
2005-06-19 15:37:17 +02:00
entries = null ;
2005-11-04 14:41:51 +01:00
2005-10-11 09:06:33 +02:00
// resume flushing
flushThread . proceed ( ) ;
2005-10-09 06:43:07 +02:00
}
2005-05-17 10:25:04 +02:00
//System.out.println("DEBUG: cache = " + cache.toString());
2005-11-04 14:41:51 +01:00
2005-05-17 10:25:04 +02:00
return added ;
}
2005-05-22 15:27:54 +02:00
private void addEntry ( String wordHash , plasmaWordIndexEntry newEntry , long updateTime ) {
2005-06-19 15:37:17 +02:00
flushThread . pause ( ) ;
2005-11-04 14:41:51 +01:00
plasmaWordIndexEntryContainer container = ( plasmaWordIndexEntryContainer ) cache . get ( wordHash ) ;
2005-06-19 15:37:17 +02:00
if ( container = = null ) container = new plasmaWordIndexEntryContainer ( wordHash ) ;
plasmaWordIndexEntry [ ] entries = new plasmaWordIndexEntry [ ] { newEntry } ;
if ( container . add ( entries , updateTime ) > 0 ) {
cache . put ( wordHash , container ) ;
2005-05-17 10:25:04 +02:00
hashScore . incScore ( wordHash ) ;
2005-05-31 19:39:14 +02:00
hashDate . setScore ( wordHash , intTime ( updateTime ) ) ;
2005-05-17 10:25:04 +02:00
}
2005-06-19 15:37:17 +02:00
entries = null ;
container = null ;
2005-05-31 19:39:14 +02:00
flushThread . proceed ( ) ;
2005-05-17 10:25:04 +02:00
}
public void close ( int waitingSeconds ) {
2005-05-31 19:39:14 +02:00
// stop permanent flushing
flushThread . terminate ( ) ;
try { flushThread . join ( 5000 ) ; } catch ( InterruptedException e ) { }
2005-11-04 14:41:51 +01:00
2005-05-31 19:39:14 +02:00
// close cluster
2005-05-22 15:27:54 +02:00
assortmentCluster . close ( ) ;
2005-05-17 10:25:04 +02:00
try {
dump ( waitingSeconds ) ;
} catch ( IOException e ) {
2005-08-30 23:32:59 +02:00
log . logSevere ( " unable to dump cache: " + e . getMessage ( ) , e ) ;
2005-05-17 10:25:04 +02:00
}
}
2005-07-20 02:39:06 +02:00
public int migrateWords2Assortment ( String wordhash ) throws IOException {
// returns the number of entries that had been added to the assortments
// can be negative if some assortments have been moved to the backend
File db = plasmaWordIndexEntity . wordHash2path ( databaseRoot , wordhash ) ;
if ( ! ( db . exists ( ) ) ) return 0 ;
2005-10-05 12:45:33 +02:00
plasmaWordIndexEntity entity = null ;
try {
entity = new plasmaWordIndexEntity ( databaseRoot , wordhash , true ) ;
int size = entity . size ( ) ;
if ( size > assortmentCluster . clusterCapacity ) {
// this will be too big to integrate it
entity . close ( ) ; entity = null ;
return 0 ;
2005-07-20 02:39:06 +02:00
} else {
2005-10-05 12:45:33 +02:00
// take out all words from the assortment to see if it fits
// together with the extracted assortment
2005-10-23 19:50:27 +02:00
plasmaWordIndexEntryContainer container = assortmentCluster . removeFromAll ( wordhash , - 1 ) ;
2005-10-05 12:45:33 +02:00
if ( size + container . size ( ) > assortmentCluster . clusterCapacity ) {
// this will also be too big to integrate, add to entity
entity . addEntries ( container ) ;
entity . close ( ) ; entity = null ;
return - container . size ( ) ;
} else {
// the combined container will fit, read the container
2005-10-13 15:57:15 +02:00
Iterator entries = entity . elements ( true ) ;
2005-10-05 12:45:33 +02:00
plasmaWordIndexEntry entry ;
2005-10-13 15:57:15 +02:00
while ( entries . hasNext ( ) ) {
entry = ( plasmaWordIndexEntry ) entries . next ( ) ;
2005-10-05 12:45:33 +02:00
container . add ( new plasmaWordIndexEntry [ ] { entry } , System . currentTimeMillis ( ) ) ;
}
// we have read all elements, now delete the entity
entity . deleteComplete ( ) ;
entity . close ( ) ; entity = null ;
// integrate the container into the assortments; this will work
assortmentCluster . storeTry ( wordhash , container ) ;
return size ;
2005-07-20 02:39:06 +02:00
}
}
2005-10-05 12:45:33 +02:00
} finally {
if ( entity ! = null ) try { entity . close ( ) ; } catch ( Exception e ) { }
2005-07-20 02:39:06 +02:00
}
}
2005-11-04 14:41:51 +01:00
}