2005-05-07 23:11:18 +02:00
// plasmaWordIndexEntity.java
// --------------------------
2005-04-07 21:19:42 +02:00
// part of YACY
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004
2005-11-04 14:41:51 +01:00
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
2005-04-07 21:19:42 +02:00
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this softare or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follows this copyright notice here, but changes must not be
// done inside the copyright notive above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
package de.anomic.plasma ;
2005-05-05 07:32:19 +02:00
import java.io.File ;
import java.io.IOException ;
import java.util.Iterator ;
import de.anomic.kelondro.kelondroTree ;
2005-09-21 23:56:39 +02:00
import de.anomic.kelondro.kelondroException ;
2005-10-23 19:50:27 +02:00
import de.anomic.server.logging.serverLog ;
2005-04-07 21:19:42 +02:00
2005-10-05 12:45:33 +02:00
public final class plasmaWordIndexEntity {
2005-04-07 21:19:42 +02:00
2005-10-10 02:45:18 +02:00
// hash of the word this entity belongs to; assigned once in the constructor and returned by wordHash()
private final String theWordHash ;
2005-04-07 21:19:42 +02:00
// the opened index tree; set to null by close() and by deleteComplete()
private kelondroTree theIndex ;
// location of the index file, assigned inside indexFile(); reset to null by deleteComplete()
private File theLocation ;
// when true, size() removes the complete index as soon as it finds it empty
private boolean delete ;
2005-12-13 00:59:58 +01:00
/**
 * Creates the entity for one word hash and opens its index through
 * {@code indexFile(databaseRoot, wordHash)}.
 *
 * @param databaseRoot  root directory handed on to indexFile to locate the index file
 * @param wordHash      hash of the word; kept and later returned by wordHash()
 * @param deleteIfEmpty kept in the delete flag that size() evaluates when the index is empty
 */
public plasmaWordIndexEntity(File databaseRoot, String wordHash, boolean deleteIfEmpty) {
    this.theWordHash = wordHash;
    this.delete = deleteIfEmpty;
    this.theIndex = indexFile(databaseRoot, wordHash);
}
2005-12-07 00:51:29 +01:00
public static boolean removePlasmaIndex ( File databaseRoot , String wordHash ) {
2005-04-07 21:19:42 +02:00
File f = wordHash2path ( databaseRoot , wordHash ) ;
boolean success = true ;
if ( f . exists ( ) ) success = f . delete ( ) ;
// clean up directory structure
f = f . getParentFile ( ) ;
while ( ( f . isDirectory ( ) ) & & ( f . list ( ) . length = = 0 ) ) {
if ( ! ( f . delete ( ) ) ) break ;
f = f . getParentFile ( ) ;
}
return success ;
}
2005-11-04 14:41:51 +01:00
2005-12-13 00:59:58 +01:00
private kelondroTree indexFile ( File databaseRoot , String wordHash ) {
if ( wordHash . length ( ) < 12 ) throw new RuntimeException ( " word hash wrong: ' " + wordHash + " ' " ) ;
2005-11-04 14:41:51 +01:00
theLocation = wordHash2path ( databaseRoot , wordHash ) ;
File fp = theLocation . getParentFile ( ) ;
if ( fp ! = null ) fp . mkdirs ( ) ;
kelondroTree kt ;
long cacheSize = theLocation . length ( ) ;
if ( cacheSize > 1048576 ) cacheSize = 1048576 ;
2005-12-13 00:59:58 +01:00
if ( theLocation . exists ( ) ) try {
2005-11-04 14:41:51 +01:00
// open existing index file
2006-05-10 11:08:42 +02:00
kt = new kelondroTree ( theLocation , cacheSize , kelondroTree . defaultObjectCachePercent ) ;
2005-12-13 00:59:58 +01:00
} catch ( IOException e ) {
theLocation . delete ( ) ;
2006-05-10 11:08:42 +02:00
kt = new kelondroTree ( theLocation , cacheSize , kelondroTree . defaultObjectCachePercent , plasmaURL . urlHashLength , plasmaWordIndexEntry . attrSpace , false ) ;
2005-11-04 14:41:51 +01:00
} else {
// create new index file
2006-05-10 11:08:42 +02:00
kt = new kelondroTree ( theLocation , cacheSize , kelondroTree . defaultObjectCachePercent , plasmaURL . urlHashLength , plasmaWordIndexEntry . attrSpace , false ) ;
2005-11-04 14:41:51 +01:00
}
return kt ; // everyone who get this should close it when finished!
2005-04-07 21:19:42 +02:00
}
public static File wordHash2path ( File databaseRoot , String hash ) {
2005-11-04 14:41:51 +01:00
// creates a path that constructs hashing on a file system
2005-06-23 04:07:45 +02:00
return new File ( databaseRoot , " WORDS/ " +
2005-11-04 14:41:51 +01:00
hash . substring ( 0 , 1 ) + " / " + hash . substring ( 1 , 2 ) + " / " + hash . substring ( 2 , 4 ) + " / " +
hash . substring ( 4 , 6 ) + " / " + hash + " .db " ) ;
2005-04-07 21:19:42 +02:00
}
/**
 * @return the word hash that was given to the constructor
 */
public String wordHash() {
    return this.theWordHash;
}
public int size ( ) {
2006-01-30 13:42:06 +01:00
if ( theIndex = = null ) return 0 ;
2006-01-30 02:18:25 +01:00
int size = theIndex . size ( ) ;
if ( ( size = = 0 ) & & ( delete ) ) {
deleteComplete ( ) ;
return 0 ;
2005-04-07 21:19:42 +02:00
} else {
2006-01-30 02:18:25 +01:00
return size ;
2005-04-07 21:19:42 +02:00
}
}
public void close ( ) throws IOException {
2006-01-30 02:18:25 +01:00
if ( theIndex ! = null ) theIndex . close ( ) ;
theIndex = null ;
2005-04-07 21:19:42 +02:00
}
2005-05-25 13:35:01 +02:00
public void finalize ( ) {
2005-11-04 14:41:51 +01:00
try {
close ( ) ;
} catch ( IOException e ) { }
2005-05-25 13:35:01 +02:00
}
2006-01-20 16:14:21 +01:00
public plasmaWordIndexEntry getEntry ( String urlhash ) throws IOException {
2006-01-30 02:18:25 +01:00
byte [ ] [ ] n = theIndex . get ( urlhash . getBytes ( ) ) ;
if ( n = = null ) return null ;
return new plasmaWordIndexEntry ( new String ( n [ 0 ] ) , new String ( n [ 1 ] ) ) ;
2006-01-20 16:14:21 +01:00
}
2005-04-07 21:19:42 +02:00
public boolean contains ( String urlhash ) throws IOException {
2006-01-30 02:18:25 +01:00
return ( theIndex . get ( urlhash . getBytes ( ) ) ! = null ) ;
2005-04-07 21:19:42 +02:00
}
public boolean contains ( plasmaWordIndexEntry entry ) throws IOException {
2006-01-30 02:18:25 +01:00
return ( theIndex . get ( entry . getUrlHash ( ) . getBytes ( ) ) ! = null ) ;
2005-04-07 21:19:42 +02:00
}
2005-05-07 23:11:18 +02:00
public boolean addEntry ( plasmaWordIndexEntry entry ) throws IOException {
2005-10-23 19:50:27 +02:00
if ( entry = = null ) return false ;
2006-03-10 17:28:01 +01:00
plasmaWordIndexEntry oldEntry = getEntry ( entry . getUrlHash ( ) ) ;
if ( ( oldEntry ! = null ) & & ( entry . isOlder ( oldEntry ) ) ) { // A more recent Entry is already in this entity
return false ;
}
2006-01-30 02:18:25 +01:00
return ( theIndex . put ( entry . getUrlHash ( ) . getBytes ( ) , entry . toEncodedForm ( ) . getBytes ( ) ) = = null ) ;
2005-04-07 21:19:42 +02:00
}
2005-08-15 01:35:18 +02:00
public int addEntries ( plasmaWordIndexEntryContainer container ) throws IOException {
2005-11-04 14:41:51 +01:00
//System.out.println("* adding " + newEntries.size() + " cached word index entries for word " + wordHash); // debug
// fetch the index cache
2005-07-20 02:39:06 +02:00
if ( ( container = = null ) | | ( container . size ( ) = = 0 ) ) return 0 ;
// open file
2005-08-15 01:35:18 +02:00
int count = 0 ;
// write from vector
if ( container ! = null ) {
Iterator i = container . entries ( ) ;
while ( i . hasNext ( ) ) {
if ( addEntry ( ( plasmaWordIndexEntry ) i . next ( ) ) ) count + + ;
2005-07-20 02:39:06 +02:00
}
}
2005-08-15 01:35:18 +02:00
// close and return
return count ;
2005-07-20 02:39:06 +02:00
}
2005-10-23 19:50:27 +02:00
public boolean deleteComplete ( ) {
2006-01-30 02:18:25 +01:00
try { theIndex . close ( ) ; } catch ( IOException e ) { }
// remove file
boolean success = theLocation . delete ( ) ;
// and also the paren directory if that is empty
if ( success ) {
File f = theLocation . getParentFile ( ) ;
while ( ( f . isDirectory ( ) ) & & ( f . list ( ) . length = = 0 ) ) {
if ( ! ( f . delete ( ) ) ) break ;
f = f . getParentFile ( ) ;
2005-04-07 21:19:42 +02:00
}
2006-01-30 02:18:25 +01:00
}
// reset all values
theIndex = null ;
theLocation = null ;
return success ;
2005-04-07 21:19:42 +02:00
}
public boolean removeEntry ( String urlHash , boolean deleteComplete ) throws IOException {
// returns true if there was an entry before, false if the key did not exist
// if after the removal the file is empty, then the file can be deleted if
// the flag deleteComplete is set.
2006-02-14 12:40:36 +01:00
if ( urlHash = = null | | theIndex = = null ) return false ;
boolean wasEntry = ( theIndex . remove ( urlHash . getBytes ( ) ) ! = null ) ;
2006-01-30 02:18:25 +01:00
if ( ( theIndex . size ( ) = = 0 ) & & ( deleteComplete ) ) deleteComplete ( ) ;
return wasEntry ;
2005-04-07 21:19:42 +02:00
}
2005-10-13 15:57:15 +02:00
public Iterator elements ( boolean up ) {
2005-11-04 14:41:51 +01:00
// returns an enumeration of plasmaWordIndexEntry objects
2006-01-30 02:18:25 +01:00
return new dbenum ( up ) ;
2005-04-07 21:19:42 +02:00
}
2005-10-13 15:57:15 +02:00
public final class dbenum implements Iterator {
2005-11-04 14:41:51 +01:00
Iterator i ;
public dbenum ( boolean up ) {
2006-01-30 13:50:40 +01:00
if ( theIndex = = null ) {
i = null ;
} else try {
2006-03-18 00:43:24 +01:00
i = theIndex . rows ( up , false , null ) ;
2005-09-21 23:56:39 +02:00
} catch ( kelondroException e ) {
e . printStackTrace ( ) ;
theIndex . file ( ) . delete ( ) ;
i = null ;
2006-03-17 19:10:48 +01:00
} catch ( IOException e ) {
e . printStackTrace ( ) ;
theIndex . file ( ) . delete ( ) ;
i = null ;
2005-09-21 23:56:39 +02:00
}
2005-11-04 14:41:51 +01:00
}
public boolean hasNext ( ) {
return ( i ! = null ) & & ( i . hasNext ( ) ) ;
}
public Object next ( ) {
2005-09-21 23:56:39 +02:00
if ( i = = null ) return null ;
2006-03-17 19:10:48 +01:00
byte [ ] [ ] n = ( byte [ ] [ ] ) i . next ( ) ;
return new plasmaWordIndexEntry ( new String ( n [ 0 ] ) , new String ( n [ 1 ] ) ) ;
2005-11-04 14:41:51 +01:00
}
2005-10-13 15:57:15 +02:00
public void remove ( ) {
throw new UnsupportedOperationException ( ) ;
}
2005-04-07 21:19:42 +02:00
}
public String toString ( ) {
2006-01-30 02:18:25 +01:00
return " DB: " + theIndex . toString ( ) ;
2005-04-07 21:19:42 +02:00
}
2005-10-10 02:45:18 +02:00
2005-10-13 15:57:15 +02:00
public void merge ( plasmaWordIndexEntity otherEntity , long time ) throws IOException {
// this is a merge of another entity to this entity
// the merge is interrupted when the given time is over
// a time=-1 means: no timeout
Iterator i = otherEntity . elements ( true ) ;
long timeout = ( time = = - 1 ) ? Long . MAX_VALUE : System . currentTimeMillis ( ) + time ;
2005-10-23 19:50:27 +02:00
try {
2005-10-13 15:57:15 +02:00
while ( ( i . hasNext ( ) ) & & ( System . currentTimeMillis ( ) < timeout ) ) {
addEntry ( ( plasmaWordIndexEntry ) i . next ( ) ) ;
}
2005-10-23 19:50:27 +02:00
} catch ( kelondroException e ) {
serverLog . logSevere ( " PLASMA " , " plasmaWordIndexEntity.merge: " + e . getMessage ( ) ) ;
}
2005-10-13 15:57:15 +02:00
}
2006-01-30 01:42:38 +01:00
/ *
// join methods
private static int log2 ( int x ) {
int l = 0 ;
while ( x > 0 ) { x = x > > 1 ; l + + ; }
return l ;
}
2005-10-12 14:28:49 +02:00
public static plasmaWordIndexEntity joinEntities ( Set entities , long time ) throws IOException {
2005-12-05 16:48:45 +01:00
// big problem here: there cannot be a time-out for join, since a time-out will leave the joined set too big.
// this will result in a OR behavior of the search instead of an AND behavior
2005-10-12 14:28:49 +02:00
long stamp = System . currentTimeMillis ( ) ;
// order entities by their size
TreeMap map = new TreeMap ( ) ;
plasmaWordIndexEntity singleEntity ;
Iterator i = entities . iterator ( ) ;
int count = 0 ;
while ( i . hasNext ( ) ) {
// get next entity:
singleEntity = ( plasmaWordIndexEntity ) i . next ( ) ;
// check result
if ( ( singleEntity = = null ) | | ( singleEntity . size ( ) = = 0 ) ) return new plasmaWordIndexEntity ( null ) ; // as this is a cunjunction of searches, we have no result if any word is not known
// store result in order of result size
map . put ( new Long ( singleEntity . size ( ) * 1000 + count ) , singleEntity ) ;
count + + ;
}
// check if there is any result
if ( map . size ( ) = = 0 ) return new plasmaWordIndexEntity ( null ) ; // no result, nothing found
// the map now holds the search results in order of number of hits per word
// we now must pairwise build up a conjunction of these sets
Long k = ( Long ) map . firstKey ( ) ; // the smallest, which means, the one with the least entries
plasmaWordIndexEntity searchA , searchB , searchResult = ( plasmaWordIndexEntity ) map . remove ( k ) ;
2005-12-05 16:48:45 +01:00
while ( ( map . size ( ) > 0 ) & & ( searchResult . size ( ) > 0 ) ) {
2005-10-12 14:28:49 +02:00
// take the first element of map which is a result and combine it with result
k = ( Long ) map . firstKey ( ) ; // the next smallest...
time - = ( System . currentTimeMillis ( ) - stamp ) ; stamp = System . currentTimeMillis ( ) ;
2005-12-05 16:48:45 +01:00
searchA = searchResult ;
searchB = ( plasmaWordIndexEntity ) map . remove ( k ) ;
2005-10-12 14:28:49 +02:00
searchResult = plasmaWordIndexEntity . joinConstructive ( searchA , searchB , 2 * time / ( map . size ( ) + 1 ) ) ;
2005-11-04 14:41:51 +01:00
// close the input files/structures
if ( searchA ! = searchResult ) searchA . close ( ) ;
if ( searchB ! = searchResult ) searchB . close ( ) ;
2005-10-12 14:28:49 +02:00
}
searchA = null ; // free resources
2005-11-04 14:41:51 +01:00
searchB = null ; // free resources
2005-10-12 14:28:49 +02:00
// in 'searchResult' is now the combined search result
if ( searchResult . size ( ) = = 0 ) return new plasmaWordIndexEntity ( null ) ;
return searchResult ;
}
2005-10-10 02:45:18 +02:00
public static plasmaWordIndexEntity joinConstructive ( plasmaWordIndexEntity i1 , plasmaWordIndexEntity i2 , long time ) throws IOException {
if ( ( i1 = = null ) | | ( i2 = = null ) ) return null ;
if ( ( i1 . size ( ) = = 0 ) | | ( i2 . size ( ) = = 0 ) ) return new plasmaWordIndexEntity ( null ) ;
// decide which method to use
int high = ( ( i1 . size ( ) > i2 . size ( ) ) ? i1 . size ( ) : i2 . size ( ) ) ;
int low = ( ( i1 . size ( ) > i2 . size ( ) ) ? i2 . size ( ) : i1 . size ( ) ) ;
int stepsEnum = 10 * ( high + low - 1 ) ;
int stepsTest = 12 * log2 ( high ) * low ;
// start most efficient method
if ( stepsEnum > stepsTest ) {
if ( i1 . size ( ) < i2 . size ( ) )
return joinConstructiveByTest ( i1 , i2 , time ) ;
else
return joinConstructiveByTest ( i2 , i1 , time ) ;
} else {
return joinConstructiveByEnumeration ( i1 , i2 , time ) ;
}
}
private static plasmaWordIndexEntity joinConstructiveByTest ( plasmaWordIndexEntity small , plasmaWordIndexEntity large , long time ) throws IOException {
System . out . println ( " DEBUG: JOIN METHOD BY TEST " ) ;
plasmaWordIndexEntity conj = new plasmaWordIndexEntity ( null ) ; // start with empty search result
2005-10-13 15:57:15 +02:00
Iterator se = small . elements ( true ) ;
2006-01-20 16:14:21 +01:00
plasmaWordIndexEntry ie0 , ie1 ;
2005-10-10 02:45:18 +02:00
long stamp = System . currentTimeMillis ( ) ;
try {
2005-10-13 15:57:15 +02:00
while ( ( se . hasNext ( ) ) & & ( ( System . currentTimeMillis ( ) - stamp ) < time ) ) {
2006-01-20 16:14:21 +01:00
ie0 = ( plasmaWordIndexEntry ) se . next ( ) ;
ie1 = large . getEntry ( ie0 . getUrlHash ( ) ) ;
if ( ie1 ! = null ) {
// this is a hit. Calculate word distance:
ie0 . combineDistance ( ie1 ) ;
conj . addEntry ( ie0 ) ;
}
2005-10-10 02:45:18 +02:00
}
} catch ( kelondroException e ) {
//serverLog.logSevere("PLASMA", "joinConstructiveByTest: Database corrupt (" + e.getMessage() + "), deleting index");
small . deleteComplete ( ) ;
return conj ;
}
return conj ;
}
private static plasmaWordIndexEntity joinConstructiveByEnumeration ( plasmaWordIndexEntity i1 , plasmaWordIndexEntity i2 , long time ) throws IOException {
System . out . println ( " DEBUG: JOIN METHOD BY ENUMERATION " ) ;
plasmaWordIndexEntity conj = new plasmaWordIndexEntity ( null ) ; // start with empty search result
2005-10-13 15:57:15 +02:00
Iterator e1 = i1 . elements ( true ) ;
Iterator e2 = i2 . elements ( true ) ;
2005-10-10 02:45:18 +02:00
int c ;
2005-10-13 15:57:15 +02:00
if ( ( e1 . hasNext ( ) ) & & ( e2 . hasNext ( ) ) ) {
2005-10-10 02:45:18 +02:00
plasmaWordIndexEntry ie1 ;
plasmaWordIndexEntry ie2 ;
try {
2005-10-13 15:57:15 +02:00
ie1 = ( plasmaWordIndexEntry ) e1 . next ( ) ;
2005-10-10 02:45:18 +02:00
} catch ( kelondroException e ) {
//serverLog.logSevere("PLASMA", "joinConstructiveByEnumeration: Database corrupt 1 (" + e.getMessage() + "), deleting index");
i1 . deleteComplete ( ) ;
return conj ;
}
try {
2005-10-13 15:57:15 +02:00
ie2 = ( plasmaWordIndexEntry ) e2 . next ( ) ;
2005-10-10 02:45:18 +02:00
} catch ( kelondroException e ) {
//serverLog.logSevere("PLASMA", "joinConstructiveByEnumeration: Database corrupt 2 (" + e.getMessage() + "), deleting index");
i2 . deleteComplete ( ) ;
return conj ;
}
long stamp = System . currentTimeMillis ( ) ;
while ( ( System . currentTimeMillis ( ) - stamp ) < time ) {
c = ie1 . getUrlHash ( ) . compareTo ( ie2 . getUrlHash ( ) ) ;
if ( c < 0 ) {
try {
2005-10-13 15:57:15 +02:00
if ( e1 . hasNext ( ) ) ie1 = ( plasmaWordIndexEntry ) e1 . next ( ) ; else break ;
2005-10-10 02:45:18 +02:00
} catch ( kelondroException e ) {
//serverLog.logSevere("PLASMA", "joinConstructiveByEnumeration: Database 1 corrupt (" + e.getMessage() + "), deleting index");
i1 . deleteComplete ( ) ;
break ;
}
} else if ( c > 0 ) {
try {
2005-10-13 15:57:15 +02:00
if ( e2 . hasNext ( ) ) ie2 = ( plasmaWordIndexEntry ) e2 . next ( ) ; else break ;
2005-10-10 02:45:18 +02:00
} catch ( kelondroException e ) {
//serverLog.logSevere("PLASMA", "joinConstructiveByEnumeration: Database 2 corrupt (" + e.getMessage() + "), deleting index");
i2 . deleteComplete ( ) ;
break ;
}
} else {
// we have found the same urls in different searches!
2006-01-20 16:14:21 +01:00
ie1 . combineDistance ( ie2 ) ;
2005-10-10 02:45:18 +02:00
conj . addEntry ( ie1 ) ;
try {
2005-10-13 15:57:15 +02:00
if ( e1 . hasNext ( ) ) ie1 = ( plasmaWordIndexEntry ) e1 . next ( ) ; else break ;
2005-10-10 02:45:18 +02:00
} catch ( kelondroException e ) {
//serverLog.logSevere("PLASMA", "joinConstructiveByEnumeration: Database 1 corrupt (" + e.getMessage() + "), deleting index");
i1 . deleteComplete ( ) ;
break ;
}
try {
2005-10-13 15:57:15 +02:00
if ( e2 . hasNext ( ) ) ie2 = ( plasmaWordIndexEntry ) e2 . next ( ) ; else break ;
2005-10-10 02:45:18 +02:00
} catch ( kelondroException e ) {
//serverLog.logSevere("PLASMA", "joinConstructiveByEnumeration: Database 2 corrupt (" + e.getMessage() + "), deleting index");
i2 . deleteComplete ( ) ;
break ;
}
}
}
}
return conj ;
}
2006-01-30 01:42:38 +01:00
* /
2006-03-10 17:28:01 +01:00
}