2005-04-07 21:19:42 +02:00
// plasmaWordIndex.java
2008-07-20 19:14:51 +02:00
// (C) 2005, 2006 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
2006-10-13 03:19:26 +02:00
// first published 2005 on http://www.anomic.de
//
// This is a part of YaCy, a peer-to-peer based web search engine
2005-09-20 17:43:31 +02:00
//
// $LastChangedDate$
// $LastChangedRevision$
2007-07-18 00:06:06 +02:00
// $LastChangedBy$
2005-04-07 21:19:42 +02:00
//
2006-10-13 03:19:26 +02:00
// LICENSE
2006-12-22 20:26:01 +01:00
//
2005-04-07 21:19:42 +02:00
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.plasma ;
2005-05-05 07:32:19 +02:00
import java.io.File ;
2008-03-26 15:13:05 +01:00
import java.io.IOException ;
2007-02-26 16:49:23 +01:00
import java.util.ArrayList ;
2006-09-30 00:27:20 +02:00
import java.util.Date ;
2006-09-11 00:36:47 +02:00
import java.util.HashMap ;
2006-09-30 00:27:20 +02:00
import java.util.HashSet ;
2005-05-05 07:32:19 +02:00
import java.util.Iterator ;
2006-01-22 01:07:00 +01:00
import java.util.Map ;
2005-10-12 14:28:49 +02:00
import java.util.Set ;
2006-03-15 17:01:42 +01:00
import java.util.TreeSet ;
2005-05-05 07:32:19 +02:00
2008-05-14 23:36:02 +02:00
import de.anomic.crawler.CrawlProfile ;
import de.anomic.crawler.IndexingStack ;
2006-01-30 01:42:38 +01:00
import de.anomic.htmlFilter.htmlFilterContentScraper ;
2008-08-19 16:10:40 +02:00
import de.anomic.http.httpdProxyCacheEntry ;
2006-08-06 00:22:14 +02:00
import de.anomic.index.indexCollectionRI ;
2006-05-28 03:09:31 +02:00
import de.anomic.index.indexContainer ;
2006-07-26 13:21:51 +02:00
import de.anomic.index.indexContainerOrder ;
2006-11-06 11:15:05 +01:00
import de.anomic.index.indexRAMRI ;
2006-05-26 11:32:50 +02:00
import de.anomic.index.indexRI ;
2008-03-26 15:13:05 +01:00
import de.anomic.index.indexRWIEntry ;
2007-11-07 23:38:09 +01:00
import de.anomic.index.indexRWIRowEntry ;
2008-03-26 16:37:49 +01:00
import de.anomic.index.indexReferenceBlacklist ;
2008-03-26 15:13:05 +01:00
import de.anomic.index.indexRepositoryReference ;
2008-03-26 16:37:49 +01:00
import de.anomic.index.indexURLReference ;
import de.anomic.index.indexWord ;
2008-03-26 15:13:05 +01:00
import de.anomic.index.indexRepositoryReference.Export ;
2006-01-04 01:39:00 +01:00
import de.anomic.kelondro.kelondroBase64Order ;
2008-01-11 01:12:01 +01:00
import de.anomic.kelondro.kelondroByteOrder ;
2007-03-08 17:15:40 +01:00
import de.anomic.kelondro.kelondroCloneableIterator ;
2008-05-14 23:36:02 +02:00
import de.anomic.kelondro.kelondroException ;
2006-02-14 01:12:07 +01:00
import de.anomic.kelondro.kelondroMergeIterator ;
2006-03-16 17:44:29 +01:00
import de.anomic.kelondro.kelondroOrder ;
2007-03-08 17:15:40 +01:00
import de.anomic.kelondro.kelondroRotateIterator ;
2008-05-02 00:40:42 +02:00
import de.anomic.kelondro.kelondroRowCollection ;
2007-03-11 00:22:37 +01:00
import de.anomic.server.serverMemory ;
2008-08-13 12:37:53 +02:00
import de.anomic.server.serverProfiling ;
2007-12-27 18:56:59 +01:00
import de.anomic.server.logging.serverLog ;
2008-04-26 03:00:10 +02:00
import de.anomic.xml.RSSFeed ;
import de.anomic.xml.RSSMessage ;
2006-09-14 02:51:02 +02:00
import de.anomic.yacy.yacyDHTAction ;
2008-05-06 01:13:47 +02:00
import de.anomic.yacy.yacyNewsPool ;
2008-06-04 23:34:57 +02:00
import de.anomic.yacy.yacyPeerActions ;
2007-05-14 12:00:23 +02:00
import de.anomic.yacy.yacySeedDB ;
2007-09-05 11:01:35 +02:00
import de.anomic.yacy.yacyURL ;
2005-04-07 21:19:42 +02:00
2006-10-28 02:22:10 +02:00
public final class plasmaWordIndex implements indexRI {
2005-09-20 17:43:31 +02:00
2007-03-03 01:55:51 +01:00
// environment constants
2008-03-26 15:13:05 +01:00
public static final long wCacheMaxAge = 1000 * 60 * 30 ; // milliseconds; 30 minutes
2008-05-20 11:29:01 +02:00
public static final int wCacheMaxChunk = 800 ; // maximum number of references for each urlhash
public static final int lowcachedivisor = 1200 ;
2008-03-26 15:13:05 +01:00
public static final int maxCollectionPartition = 7 ; // should be 7
2008-05-14 23:36:02 +02:00
public static final String CRAWL_PROFILE_PROXY = " proxy " ;
public static final String CRAWL_PROFILE_REMOTE = " remote " ;
public static final String CRAWL_PROFILE_SNIPPET_LOCAL_TEXT = " snippetLocalText " ;
public static final String CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT = " snippetGlobalText " ;
public static final String CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA = " snippetLocalMedia " ;
public static final String CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA = " snippetGlobalMedia " ;
public static final String DBFILE_ACTIVE_CRAWL_PROFILES = " crawlProfilesActive.db " ;
public static final String DBFILE_PASSIVE_CRAWL_PROFILES = " crawlProfilesPassive.db " ;
2008-08-26 15:20:18 +02:00
public static final long CRAWL_PROFILE_PROXY_RECRAWL_CYCLE = 60L * 24L ;
public static final long CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE = 60L * 24L * 30L ;
public static final long CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE = 60L * 24L * 30L ;
public static final long CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE = 60L * 24L * 30L ;
public static final long CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE = 60L * 24L * 30L ;
2008-05-14 23:36:02 +02:00
2008-03-26 15:13:05 +01:00
private final kelondroByteOrder indexOrder = kelondroBase64Order . enhancedCoder ;
private final indexRAMRI dhtOutCache , dhtInCache ;
private final indexCollectionRI collections ; // new database structure to replace AssortmentCluster and FileCluster
2008-09-04 21:41:57 +02:00
private final serverLog log ;
private indexRepositoryReference referenceURL ;
2008-08-20 09:54:56 +02:00
public final yacySeedDB seedDB ;
2008-05-06 01:13:47 +02:00
public yacyNewsPool newsPool ;
2008-09-04 21:41:57 +02:00
private final File primaryRoot , secondaryRoot ;
2008-05-20 11:29:01 +02:00
public IndexingStack queuePreStack ;
2008-05-14 23:36:02 +02:00
public CrawlProfile profilesActiveCrawls , profilesPassiveCrawls ;
public CrawlProfile . entry defaultProxyProfile ;
public CrawlProfile . entry defaultRemoteProfile ;
public CrawlProfile . entry defaultTextSnippetLocalProfile , defaultTextSnippetGlobalProfile ;
public CrawlProfile . entry defaultMediaSnippetLocalProfile , defaultMediaSnippetGlobalProfile ;
2008-09-04 21:41:57 +02:00
private final File queuesRoot ;
2008-06-04 23:34:57 +02:00
public yacyPeerActions peerActions ;
2008-08-02 14:12:04 +02:00
/**
 * Create the word index: opens the RAM caches, the collection index, the URL
 * repository, the crawl-profile databases, the indexing queue, the seed DB and
 * the news pool below the two given index roots.
 *
 * @param networkName        sub-directory name below both index roots
 * @param log                logger for configuration messages
 * @param indexPrimaryRoot   root for the primary (text) index
 * @param indexSecondaryRoot root for the secondary index (URL repository)
 * @param entityCacheMaxSize maximum word count for each RAM cache
 */
public plasmaWordIndex(final String networkName, final serverLog log, final File indexPrimaryRoot, final File indexSecondaryRoot, final int entityCacheMaxSize) {
    this.log = log;
    this.primaryRoot = new File(indexPrimaryRoot, networkName);
    this.secondaryRoot = new File(indexSecondaryRoot, networkName);

    File indexPrimaryTextLocation = new File(this.primaryRoot, " TEXT ");
    if (!indexPrimaryTextLocation.exists()) {
        // patch old index locations; the secondary path is patched in plasmaCrawlLURL
        // BUGFIX: oldPrimaryPath was previously constructed identically to
        // oldPrimaryTextLocation (both PUBLIC/TEXT). The parent PUBLIC directory is
        // the delete target after the move; the TEXT folder below it is what moves.
        final File oldPrimaryPath = new File(indexPrimaryRoot, " PUBLIC ");
        final File oldPrimaryTextLocation = new File(oldPrimaryPath, " TEXT ");
        if (oldPrimaryPath.exists() && oldPrimaryTextLocation.exists()) {
            // move the text folder from the old location to the new location
            assert !indexPrimaryTextLocation.exists();
            indexPrimaryTextLocation.mkdirs();
            if (oldPrimaryTextLocation.renameTo(indexPrimaryTextLocation)) {
                // remove the now-empty old parent; fall back to delete-on-exit if locked
                if (!oldPrimaryPath.delete()) oldPrimaryPath.deleteOnExit();
            } else {
                indexPrimaryTextLocation = oldPrimaryTextLocation; // emergency case: stay with old directory
            }
        }
    }

    // create the RAM caches for the DHT-out and DHT-in word indexes
    final File textindexcache = new File(indexPrimaryTextLocation, " RICACHE ");
    if (!textindexcache.exists()) textindexcache.mkdirs();
    this.dhtOutCache = new indexRAMRI(textindexcache, indexRWIRowEntry.urlEntryRow, entityCacheMaxSize, wCacheMaxChunk, wCacheMaxAge, " index.dhtout.heap ", log);
    this.dhtInCache = new indexRAMRI(textindexcache, indexRWIRowEntry.urlEntryRow, entityCacheMaxSize, wCacheMaxChunk, wCacheMaxAge, " index.dhtin.heap ", log);

    // create collections storage path
    final File textindexcollections = new File(indexPrimaryTextLocation, " RICOLLECTION ");
    if (!textindexcollections.exists()) textindexcollections.mkdirs();
    this.collections = new indexCollectionRI(textindexcollections, " collection ", maxCollectionPartition, indexRWIRowEntry.urlEntryRow);

    // create LURL-db
    referenceURL = new indexRepositoryReference(this.secondaryRoot);

    // make crawl profiles database and default profiles
    this.queuesRoot = new File(this.primaryRoot, " QUEUES ");
    this.queuesRoot.mkdirs();
    this.log.logConfig(" Initializing Crawl Profiles ");
    final File profilesActiveFile = new File(queuesRoot, DBFILE_ACTIVE_CRAWL_PROFILES);
    if (!profilesActiveFile.exists()) {
        // migrate old file
        final File oldFile = new File(new File(queuesRoot.getParentFile().getParentFile().getParentFile(), " PLASMADB "), " crawlProfilesActive1.db ");
        if (oldFile.exists()) oldFile.renameTo(profilesActiveFile);
    }
    this.profilesActiveCrawls = new CrawlProfile(profilesActiveFile);
    initActiveCrawlProfiles();
    log.logConfig(" Loaded active crawl profiles from file " + profilesActiveFile.getName() +
            " , " + this.profilesActiveCrawls.size() + " entries " +
            " , " + profilesActiveFile.length() / 1024);
    final File profilesPassiveFile = new File(queuesRoot, DBFILE_PASSIVE_CRAWL_PROFILES);
    if (!profilesPassiveFile.exists()) {
        // migrate old file
        final File oldFile = new File(new File(queuesRoot.getParentFile().getParentFile().getParentFile(), " PLASMADB "), " crawlProfilesPassive1.db ");
        if (oldFile.exists()) oldFile.renameTo(profilesPassiveFile);
    }
    this.profilesPassiveCrawls = new CrawlProfile(profilesPassiveFile);
    log.logConfig(" Loaded passive crawl profiles from file " + profilesPassiveFile.getName() +
            " , " + this.profilesPassiveCrawls.size() + " entries " +
            " , " + profilesPassiveFile.length() / 1024);

    // init queues
    final File preStackFile = new File(queuesRoot, " urlNoticePreStack ");
    if (!preStackFile.exists()) {
        // migrate old file
        final File oldFile = new File(new File(queuesRoot.getParentFile().getParentFile().getParentFile(), " PLASMADB "), " switchboardQueue.stack ");
        if (oldFile.exists()) oldFile.renameTo(preStackFile);
    }
    this.queuePreStack = new IndexingStack(this, preStackFile, this.profilesActiveCrawls);

    // create or init seed cache
    final File networkRoot = new File(this.primaryRoot, " NETWORK ");
    networkRoot.mkdirs();
    final File mySeedFile = new File(networkRoot, yacySeedDB.DBFILE_OWN_SEED);
    final File oldSeedFile = new File(new File(indexPrimaryRoot.getParentFile(), " YACYDB "), " mySeed.txt ");
    if (oldSeedFile.exists()) oldSeedFile.renameTo(mySeedFile);
    seedDB = new yacySeedDB(
            new File(networkRoot, " seed.new.db "),
            new File(networkRoot, " seed.old.db "),
            new File(networkRoot, " seed.pot.db "),
            mySeedFile);

    // create or init news database
    newsPool = new yacyNewsPool(networkRoot);

    // deploy peer actions
    this.peerActions = new yacyPeerActions(seedDB, newsPool);
}
2008-06-16 23:39:58 +02:00
/** Drop the in-memory cache of the URL repository (stored data is untouched). */
public void clearCache() {
    referenceURL.clearCache();
}
2008-05-24 14:30:50 +02:00
public void clear ( ) {
dhtInCache . clear ( ) ;
dhtOutCache . clear ( ) ;
collections . clear ( ) ;
try {
referenceURL . clear ( ) ;
2008-08-02 14:12:04 +02:00
} catch ( final IOException e ) {
2008-05-24 14:30:50 +02:00
e . printStackTrace ( ) ;
}
queuePreStack . clear ( ) ;
}
2008-05-14 23:36:02 +02:00
private void initActiveCrawlProfiles ( ) {
this . defaultProxyProfile = null ;
this . defaultRemoteProfile = null ;
this . defaultTextSnippetLocalProfile = null ;
this . defaultTextSnippetGlobalProfile = null ;
this . defaultMediaSnippetLocalProfile = null ;
this . defaultMediaSnippetGlobalProfile = null ;
2008-08-02 14:12:04 +02:00
final Iterator < CrawlProfile . entry > i = this . profilesActiveCrawls . profiles ( true ) ;
2008-05-14 23:36:02 +02:00
CrawlProfile . entry profile ;
String name ;
try {
while ( i . hasNext ( ) ) {
profile = i . next ( ) ;
name = profile . name ( ) ;
if ( name . equals ( CRAWL_PROFILE_PROXY ) ) this . defaultProxyProfile = profile ;
if ( name . equals ( CRAWL_PROFILE_REMOTE ) ) this . defaultRemoteProfile = profile ;
if ( name . equals ( CRAWL_PROFILE_SNIPPET_LOCAL_TEXT ) ) this . defaultTextSnippetLocalProfile = profile ;
if ( name . equals ( CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT ) ) this . defaultTextSnippetGlobalProfile = profile ;
if ( name . equals ( CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA ) ) this . defaultMediaSnippetLocalProfile = profile ;
if ( name . equals ( CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA ) ) this . defaultMediaSnippetGlobalProfile = profile ;
}
2008-08-02 14:12:04 +02:00
} catch ( final Exception e ) {
2008-05-24 14:30:50 +02:00
this . profilesActiveCrawls . clear ( ) ;
2008-05-14 23:36:02 +02:00
this . defaultProxyProfile = null ;
this . defaultRemoteProfile = null ;
this . defaultTextSnippetLocalProfile = null ;
this . defaultTextSnippetGlobalProfile = null ;
this . defaultMediaSnippetLocalProfile = null ;
this . defaultMediaSnippetGlobalProfile = null ;
}
if ( this . defaultProxyProfile = = null ) {
// generate new default entry for proxy crawling
this . defaultProxyProfile = this . profilesActiveCrawls . newEntry ( " proxy " , null , " .* " , " .* " ,
0 /*Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0"))*/ ,
0 /*Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0"))*/ ,
2008-08-26 15:20:18 +02:00
this . profilesActiveCrawls . getRecrawlDate ( CRAWL_PROFILE_PROXY_RECRAWL_CYCLE ) , - 1 , - 1 , false ,
2008-05-14 23:36:02 +02:00
true /*getConfigBool(PROXY_INDEXING_LOCAL_TEXT, true)*/ ,
true /*getConfigBool(PROXY_INDEXING_LOCAL_MEDIA, true)*/ ,
true , true ,
false /*getConfigBool(PROXY_INDEXING_REMOTE, false)*/ , true , true , true ) ;
}
if ( this . defaultRemoteProfile = = null ) {
// generate new default entry for remote crawling
defaultRemoteProfile = this . profilesActiveCrawls . newEntry ( CRAWL_PROFILE_REMOTE , null , " .* " , " .* " , 0 , 0 ,
- 1 , - 1 , - 1 , true , true , true , false , true , false , true , true , false ) ;
}
if ( this . defaultTextSnippetLocalProfile = = null ) {
// generate new default entry for snippet fetch and optional crawling
defaultTextSnippetLocalProfile = this . profilesActiveCrawls . newEntry ( CRAWL_PROFILE_SNIPPET_LOCAL_TEXT , null , " .* " , " .* " , 0 , 0 ,
2008-08-26 15:20:18 +02:00
this . profilesActiveCrawls . getRecrawlDate ( CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE ) , - 1 , - 1 , true , false , false , false , false , false , true , true , false ) ;
2008-05-14 23:36:02 +02:00
}
if ( this . defaultTextSnippetGlobalProfile = = null ) {
// generate new default entry for snippet fetch and optional crawling
defaultTextSnippetGlobalProfile = this . profilesActiveCrawls . newEntry ( CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT , null , " .* " , " .* " , 0 , 0 ,
2008-08-26 15:20:18 +02:00
this . profilesActiveCrawls . getRecrawlDate ( CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE ) , - 1 , - 1 , true , true , true , true , true , false , true , true , false ) ;
2008-05-14 23:36:02 +02:00
}
if ( this . defaultMediaSnippetLocalProfile = = null ) {
// generate new default entry for snippet fetch and optional crawling
defaultMediaSnippetLocalProfile = this . profilesActiveCrawls . newEntry ( CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA , null , " .* " , " .* " , 0 , 0 ,
2008-08-26 15:20:18 +02:00
this . profilesActiveCrawls . getRecrawlDate ( CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE ) , - 1 , - 1 , true , false , false , false , false , false , true , true , false ) ;
2008-05-14 23:36:02 +02:00
}
if ( this . defaultMediaSnippetGlobalProfile = = null ) {
// generate new default entry for snippet fetch and optional crawling
defaultMediaSnippetGlobalProfile = this . profilesActiveCrawls . newEntry ( CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA , null , " .* " , " .* " , 0 , 0 ,
2008-08-26 15:20:18 +02:00
this . profilesActiveCrawls . getRecrawlDate ( CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE ) , - 1 , - 1 , true , false , true , true , true , false , true , true , false ) ;
2008-05-14 23:36:02 +02:00
}
}
private void resetProfiles ( ) {
2008-08-02 14:12:04 +02:00
final File pdb = new File ( this . queuesRoot , DBFILE_ACTIVE_CRAWL_PROFILES ) ;
2008-05-14 23:36:02 +02:00
if ( pdb . exists ( ) ) pdb . delete ( ) ;
profilesActiveCrawls = new CrawlProfile ( pdb ) ;
initActiveCrawlProfiles ( ) ;
}
public boolean cleanProfiles ( ) throws InterruptedException {
if ( queuePreStack . size ( ) > 0 ) return false ;
final Iterator < CrawlProfile . entry > iter = profilesActiveCrawls . profiles ( true ) ;
CrawlProfile . entry entry ;
boolean hasDoneSomething = false ;
try {
while ( iter . hasNext ( ) ) {
// check for interruption
if ( Thread . currentThread ( ) . isInterrupted ( ) ) throw new InterruptedException ( " Shutdown in progress " ) ;
// getting next profile
entry = iter . next ( ) ;
if ( ! ( ( entry . name ( ) . equals ( CRAWL_PROFILE_PROXY ) ) | |
( entry . name ( ) . equals ( CRAWL_PROFILE_REMOTE ) ) | |
( entry . name ( ) . equals ( CRAWL_PROFILE_SNIPPET_LOCAL_TEXT ) ) | |
( entry . name ( ) . equals ( CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT ) ) | |
( entry . name ( ) . equals ( CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA ) ) | |
( entry . name ( ) . equals ( CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA ) ) ) ) {
profilesPassiveCrawls . newEntry ( entry . map ( ) ) ;
iter . remove ( ) ;
hasDoneSomething = true ;
}
}
2008-08-02 14:12:04 +02:00
} catch ( final kelondroException e ) {
2008-05-14 23:36:02 +02:00
resetProfiles ( ) ;
hasDoneSomething = true ;
}
return hasDoneSomething ;
}
2008-08-02 14:12:04 +02:00
public File getLocation ( final boolean primary ) {
2008-05-06 15:44:38 +02:00
return ( primary ) ? this . primaryRoot : this . secondaryRoot ;
2005-04-07 21:19:42 +02:00
}
2005-09-20 17:43:31 +02:00
2008-08-02 14:12:04 +02:00
public void putURL ( final indexURLReference entry ) throws IOException {
2008-03-26 15:13:05 +01:00
this . referenceURL . store ( entry ) ;
}
2008-08-02 14:12:04 +02:00
public indexURLReference getURL ( final String urlHash , final indexRWIEntry searchedWord , final long ranking ) {
2008-03-26 15:13:05 +01:00
return this . referenceURL . load ( urlHash , searchedWord , ranking ) ;
}
2008-08-02 14:12:04 +02:00
public boolean removeURL ( final String urlHash ) {
2008-03-26 15:13:05 +01:00
return this . referenceURL . remove ( urlHash ) ;
}
2008-08-02 14:12:04 +02:00
public boolean existsURL ( final String urlHash ) {
2008-03-26 15:13:05 +01:00
return this . referenceURL . exists ( urlHash ) ;
}
/** @return the number of URL references currently stored */
public int countURL() {
    return this.referenceURL.size();
}
2008-08-02 14:12:04 +02:00
public Export exportURL ( final File f , final String filter , final int format , final boolean dom ) {
2008-03-26 15:13:05 +01:00
return this . referenceURL . export ( f , filter , format , dom ) ;
}
/** @return the repository's current export handle */
public Export exportURL() {
    return this.referenceURL.export();
}
2008-08-02 14:12:04 +02:00
public kelondroCloneableIterator < indexURLReference > entriesURL ( final boolean up , final String firstHash ) throws IOException {
2008-03-26 15:13:05 +01:00
return this . referenceURL . entries ( up , firstHash ) ;
}
2008-09-04 21:41:57 +02:00
/** Per-host statistics over the URL repository, limited to {@code count} entries. */
public Iterator<indexRepositoryReference.hostStat> statistics(final int count) throws IOException {
    return this.referenceURL.statistics(count);
}
/** Delete URLs selected by the given fragment (presumably matched against the host part — confirm in indexRepositoryReference); returns the deletion count. */
public int deleteDomain(final String urlfragment) throws IOException {
    return this.referenceURL.deleteDomain(urlfragment);
}
2008-08-02 14:12:04 +02:00
public indexRepositoryReference . BlacklistCleaner getURLCleaner ( final indexReferenceBlacklist blacklist ) {
2008-03-26 15:13:05 +01:00
return this . referenceURL . getBlacklistCleaner ( blacklist ) ; // thread is not already started after this is called!
}
/** @return the number of entries in the URL repository's write cache */
public int getURLwriteCacheSize() {
    return this.referenceURL.writeCacheSize();
}
2006-12-22 13:54:56 +01:00
public int minMem ( ) {
2007-08-20 19:36:43 +02:00
return 1024 * 1024 /* indexing overhead */ + dhtOutCache . minMem ( ) + dhtInCache . minMem ( ) + collections . minMem ( ) ;
2006-12-22 13:54:56 +01:00
}
2006-12-22 20:26:01 +01:00
2006-09-14 02:51:02 +02:00
public int maxURLinDHTOutCache ( ) {
return dhtOutCache . maxURLinCache ( ) ;
2005-04-25 01:15:40 +02:00
}
2005-09-20 17:43:31 +02:00
2006-09-14 02:51:02 +02:00
public long minAgeOfDHTOutCache ( ) {
return dhtOutCache . minAgeOfCache ( ) ;
2006-03-09 12:31:17 +01:00
}
2006-09-14 02:51:02 +02:00
public long maxAgeOfDHTOutCache ( ) {
return dhtOutCache . maxAgeOfCache ( ) ;
2006-02-25 22:05:19 +01:00
}
2006-09-14 02:51:02 +02:00
public int maxURLinDHTInCache ( ) {
return dhtInCache . maxURLinCache ( ) ;
2006-03-13 11:43:12 +01:00
}
2006-09-14 02:51:02 +02:00
public long minAgeOfDHTInCache ( ) {
return dhtInCache . minAgeOfCache ( ) ;
2006-03-13 11:43:12 +01:00
}
2006-09-14 02:51:02 +02:00
public long maxAgeOfDHTInCache ( ) {
return dhtInCache . maxAgeOfCache ( ) ;
2006-03-13 11:43:12 +01:00
}
2006-09-14 02:51:02 +02:00
/** Number of words currently buffered in the DHT-out cache. */
public int dhtOutCacheSize() {
    return dhtOutCache.size();
}
public int dhtInCacheSize ( ) {
return dhtInCache . size ( ) ;
2005-04-25 01:15:40 +02:00
}
2007-10-19 23:47:07 +02:00
2008-08-02 14:12:04 +02:00
public long dhtCacheSizeBytes ( final boolean in ) {
2007-10-19 23:47:07 +02:00
// calculate the real size in bytes of DHT-In/Out-Cache
long cacheBytes = 0 ;
2008-08-02 14:12:04 +02:00
final long entryBytes = indexRWIRowEntry . urlEntryRow . objectsize ;
final indexRAMRI cache = ( in ? dhtInCache : dhtOutCache ) ;
2007-10-20 01:36:40 +02:00
synchronized ( cache ) {
2008-08-02 14:12:04 +02:00
final Iterator < indexContainer > it = cache . wordContainers ( null , false ) ;
2008-01-11 01:12:01 +01:00
while ( it . hasNext ( ) ) cacheBytes + = it . next ( ) . size ( ) * entryBytes ;
2007-10-19 23:47:07 +02:00
}
return cacheBytes ;
}
2005-09-20 17:43:31 +02:00
2008-08-02 14:12:04 +02:00
public void setMaxWordCount ( final int maxWords ) {
2006-09-14 02:51:02 +02:00
dhtOutCache . setMaxWordCount ( maxWords ) ;
2006-09-22 12:58:58 +02:00
dhtInCache . setMaxWordCount ( maxWords ) ;
}
2008-08-02 14:12:04 +02:00
public void dhtFlushControl ( final indexRAMRI theCache ) {
2006-02-25 17:10:31 +01:00
// check for forced flush
2008-08-13 12:37:53 +02:00
int cs = cacheSize ( ) ;
if ( cs > 0 ) {
// flush elements that are too big. This flushing depends on the fact that the flush rule
// selects the biggest elements first for flushing. If it does not for any reason, the following
// loop would not terminate.
2008-08-20 10:37:39 +02:00
serverProfiling . update ( " wordcache " , Long . valueOf ( cs ) ) ;
2008-08-13 12:37:53 +02:00
// To ensure termination an additional counter is used
int l = 0 ;
while ( ( l + + < 100 ) & & ( theCache . maxURLinCache ( ) > wCacheMaxChunk ) ) {
flushCache ( theCache , Math . min ( 10 , theCache . size ( ) ) ) ;
}
// next flush more entries if the size exceeds the maximum size of the cache
if ( ( theCache . size ( ) > theCache . getMaxWordCount ( ) ) | |
( serverMemory . available ( ) < collections . minMem ( ) ) ) {
flushCache ( theCache , Math . min ( theCache . size ( ) - theCache . getMaxWordCount ( ) + 1 , theCache . size ( ) ) ) ;
}
2008-08-20 10:37:39 +02:00
if ( cacheSize ( ) ! = cs ) serverProfiling . update ( " wordcache " , Long . valueOf ( cacheSize ( ) ) ) ;
2006-02-14 01:12:07 +01:00
}
2006-02-25 17:10:31 +01:00
}
2006-10-28 02:22:10 +02:00
2008-08-02 14:12:04 +02:00
public long getUpdateTime ( final String wordHash ) {
final indexContainer entries = getContainer ( wordHash , null ) ;
2006-10-28 02:22:10 +02:00
if ( entries = = null ) return 0 ;
return entries . updated ( ) ;
}
2008-08-02 14:12:04 +02:00
public static indexContainer emptyContainer ( final String wordHash , final int elementCount ) {
2007-11-07 23:38:09 +01:00
return new indexContainer ( wordHash , indexRWIRowEntry . urlEntryRow , elementCount ) ;
2006-11-05 20:07:19 +01:00
}
2006-11-19 21:05:25 +01:00
2008-08-02 14:12:04 +02:00
public void addEntry ( final String wordHash , final indexRWIRowEntry entry , final long updateTime , boolean dhtInCase ) {
2006-09-14 02:51:02 +02:00
// set dhtInCase depending on wordHash
2008-05-06 01:13:47 +02:00
if ( ( ! dhtInCase ) & & ( yacyDHTAction . shallBeOwnWord ( seedDB , wordHash ) ) ) dhtInCase = true ;
2006-09-14 02:51:02 +02:00
// add the entry
2007-04-27 14:26:36 +02:00
if ( dhtInCase ) {
2006-09-14 02:51:02 +02:00
dhtInCache . addEntry ( wordHash , entry , updateTime , true ) ;
2007-08-15 13:36:59 +02:00
dhtFlushControl ( this . dhtInCache ) ;
2007-04-27 14:26:36 +02:00
} else {
2006-09-14 02:51:02 +02:00
dhtOutCache . addEntry ( wordHash , entry , updateTime , false ) ;
2007-08-15 13:36:59 +02:00
dhtFlushControl ( this . dhtOutCache ) ;
2006-09-14 02:51:02 +02:00
}
2006-02-25 17:10:31 +01:00
}
2008-08-02 14:12:04 +02:00
public void addEntries ( final indexContainer entries ) {
2008-02-03 00:53:39 +01:00
addEntries ( entries , false ) ;
}
2008-08-02 14:12:04 +02:00
public void addEntries ( final indexContainer entries , boolean dhtInCase ) {
2007-11-07 23:38:09 +01:00
assert ( entries . row ( ) . objectsize = = indexRWIRowEntry . urlEntryRow . objectsize ) ;
2006-11-19 21:05:25 +01:00
2006-09-14 02:51:02 +02:00
// set dhtInCase depending on wordHash
2008-05-06 01:13:47 +02:00
if ( ( ! dhtInCase ) & & ( yacyDHTAction . shallBeOwnWord ( seedDB , entries . getWordHash ( ) ) ) ) dhtInCase = true ;
2006-09-14 02:51:02 +02:00
// add the entry
2007-04-27 14:26:36 +02:00
if ( dhtInCase ) {
2008-02-03 00:53:39 +01:00
dhtInCache . addEntries ( entries ) ;
2007-08-15 13:36:59 +02:00
dhtFlushControl ( this . dhtInCache ) ;
2007-04-27 14:26:36 +02:00
} else {
2008-02-03 00:53:39 +01:00
dhtOutCache . addEntries ( entries ) ;
2007-08-15 13:36:59 +02:00
dhtFlushControl ( this . dhtOutCache ) ;
2006-09-14 02:51:02 +02:00
}
2005-11-07 13:33:02 +01:00
}
2005-09-20 17:43:31 +02:00
2008-03-22 02:28:37 +01:00
public int flushCacheSome ( ) {
2008-08-02 14:12:04 +02:00
final int fo = flushCache ( dhtOutCache , Math . max ( 1 , dhtOutCache . size ( ) / lowcachedivisor ) ) ;
final int fi = flushCache ( dhtInCache , Math . max ( 1 , dhtInCache . size ( ) / lowcachedivisor ) ) ;
2008-03-22 02:28:37 +01:00
return fo + fi ;
2006-02-25 09:42:45 +01:00
}
2008-08-02 14:12:04 +02:00
private int flushCache ( final indexRAMRI ram , int count ) {
2008-03-22 02:28:37 +01:00
if ( count < = 0 ) return 0 ;
2007-11-04 11:36:25 +01:00
2006-08-21 03:29:26 +02:00
String wordHash ;
2008-08-02 14:12:04 +02:00
final ArrayList < indexContainer > containerList = new ArrayList < indexContainer > ( ) ;
2007-11-12 02:14:51 +01:00
count = Math . min ( 5000 , Math . min ( count , ram . size ( ) ) ) ;
boolean collectMax = true ;
indexContainer c ;
while ( collectMax ) {
synchronized ( ram ) {
2007-02-28 12:13:23 +01:00
wordHash = ram . maxScoreWordHash ( ) ;
2007-09-04 01:43:55 +02:00
c = ram . getContainer ( wordHash , null ) ;
2007-03-03 01:55:51 +01:00
if ( ( c ! = null ) & & ( c . size ( ) > wCacheMaxChunk ) ) {
2007-02-28 12:13:23 +01:00
containerList . add ( ram . deleteContainer ( wordHash ) ) ;
2007-03-11 00:22:37 +01:00
if ( serverMemory . available ( ) < collections . minMem ( ) ) break ; // protect memory during flush
2007-02-28 12:13:23 +01:00
} else {
collectMax = false ;
}
}
2007-11-12 02:14:51 +01:00
}
count = count - containerList . size ( ) ;
2008-06-18 01:56:39 +02:00
containerList . addAll ( ram . bestFlushContainers ( count ) ) ;
2007-03-03 01:55:51 +01:00
// flush the containers
2008-08-02 14:12:04 +02:00
for ( final indexContainer container : containerList ) collections . addEntries ( container ) ;
2006-08-23 13:36:09 +02:00
//System.out.println("DEBUG-Finished flush of " + count + " entries from RAM to DB in " + (System.currentTimeMillis() - start) + " milliseconds");
2008-03-22 02:28:37 +01:00
return containerList . size ( ) ;
2006-02-25 09:42:45 +01:00
}
2006-01-19 15:13:39 +01:00
2008-08-02 15:57:00 +02:00
/ * *
* this is called by the switchboard to put in a new page into the index
* use all the words in one condenser object to simultanous create index entries
*
* @param url
* @param urlModified
* @param document
* @param condenser
* @param language
* @param doctype
* @param outlinksSame
* @param outlinksOther
* @return
* /
public int addPageIndex ( final yacyURL url , final Date urlModified , final plasmaParserDocument document , final plasmaCondenser condenser , final String language , final char doctype , final int outlinksSame , final int outlinksOther ) {
2006-12-08 03:14:56 +01:00
int wordCount = 0 ;
2008-08-02 14:12:04 +02:00
final int urlLength = url . toNormalform ( true , true ) . length ( ) ;
final int urlComps = htmlFilterContentScraper . urlComps ( url . toString ( ) ) . length ;
2006-12-08 03:14:56 +01:00
// iterate over all words of context text
2008-08-02 14:12:04 +02:00
final Iterator < Map . Entry < String , indexWord > > i = condenser . words ( ) . entrySet ( ) . iterator ( ) ;
2008-03-26 16:37:49 +01:00
Map . Entry < String , indexWord > wentry ;
2006-01-19 15:13:39 +01:00
String word ;
2008-02-19 15:13:35 +01:00
indexRWIRowEntry ientry ;
2008-03-26 16:37:49 +01:00
indexWord wprop ;
2006-01-19 15:13:39 +01:00
while ( i . hasNext ( ) ) {
2008-01-11 01:12:01 +01:00
wentry = i . next ( ) ;
word = wentry . getKey ( ) ;
wprop = wentry . getValue ( ) ;
2006-12-08 03:14:56 +01:00
assert ( wprop . flags ! = null ) ;
2007-11-07 23:38:09 +01:00
ientry = new indexRWIRowEntry ( url . hash ( ) ,
2008-01-22 12:51:43 +01:00
urlLength , urlComps , ( document = = null ) ? urlLength : document . dc_title ( ) . length ( ) ,
2006-11-19 21:05:25 +01:00
wprop . count ,
2008-03-26 16:37:49 +01:00
condenser . RESULT_NUMB_WORDS ,
condenser . RESULT_NUMB_SENTENCES ,
2006-11-19 21:05:25 +01:00
wprop . posInText ,
wprop . posInPhrase ,
wprop . numOfPhrase ,
urlModified . getTime ( ) ,
System . currentTimeMillis ( ) ,
language ,
doctype ,
outlinksSame , outlinksOther ,
2008-02-19 15:13:35 +01:00
wprop . flags ) ;
2008-03-26 16:37:49 +01:00
addEntry ( indexWord . word2hash ( word ) , ientry , System . currentTimeMillis ( ) , false ) ;
2006-12-08 03:14:56 +01:00
wordCount + + ;
2006-01-19 15:13:39 +01:00
}
2006-12-08 03:14:56 +01:00
return wordCount ;
2005-10-13 02:05:30 +02:00
}
2006-01-30 01:42:38 +01:00
2008-08-02 14:12:04 +02:00
public boolean hasContainer ( final String wordHash ) {
2007-01-08 14:13:30 +01:00
if ( dhtOutCache . hasContainer ( wordHash ) ) return true ;
if ( dhtInCache . hasContainer ( wordHash ) ) return true ;
if ( collections . hasContainer ( wordHash ) ) return true ;
return false ;
}
2008-08-02 14:12:04 +02:00
/**
 * Collects the index container for a word hash from all backing stores
 * (DHT-out cache, DHT-in cache and the collection index) and merges them
 * into a single container. Entries that occur more than once are reduced
 * to the single most recently modified occurrence.
 *
 * @param wordHash the hash of the word; must have the common hash length
 * @param urlselection optional restriction to a set of URL hashes (may be null)
 * @return the merged container, or null for invalid input or when no store
 *         has any entries for this word hash
 */
public indexContainer getContainer(final String wordHash, final Set<String> urlselection) {
    if ((wordHash == null) || (wordHash.length() != yacySeedDB.commonHashLength)) {
        // wrong input
        return null;
    }

    // get from cache: start with the out-cache, then merge in the in-cache
    indexContainer container;
    container = dhtOutCache.getContainer(wordHash, urlselection);
    if (container == null) {
        container = dhtInCache.getContainer(wordHash, urlselection);
    } else {
        container.addAllUnique(dhtInCache.getContainer(wordHash, urlselection));
    }

    // get from collection index and merge it in as well
    if (container == null) {
        container = collections.getContainer(wordHash, urlselection);
    } else {
        container.addAllUnique(collections.getContainer(wordHash, urlselection));
    }

    if (container == null) return null;

    // check doubles: removeDoubles() strips all duplicated entries and
    // returns them grouped; from each group the freshest entry is re-added
    final int beforeDouble = container.size();
    final ArrayList<kelondroRowCollection> d = container.removeDoubles();
    kelondroRowCollection set;
    for (int i = 0; i < d.size(); i++) {
        // for each element in the double-set, take that one that is the most recent one
        set = d.get(i);
        indexRWIRowEntry e, elm = null;
        long lm = 0;
        for (int j = 0; j < set.size(); j++) {
            e = new indexRWIRowEntry(set.get(j, true));
            if ((elm == null) || (e.lastModified() > lm)) {
                elm = e;
                lm = e.lastModified();
            }
        }
        if (elm != null) {
            container.addUnique(elm.toKelondroEntry());
        }
    }
    // NOTE(review): debug output only; container shrank if duplicates were removed
    if (container.size() < beforeDouble) System.out.println("*** DEBUG DOUBLECHECK - removed " + (beforeDouble - container.size()) + " index entries from word container " + container.getWordHash());

    return container;
}
2006-01-30 13:42:06 +01:00
2008-08-02 15:57:00 +02:00
/ * *
* return map of wordhash : indexContainer
*
* @param wordHashes
* @param urlselection
* @param deleteIfEmpty
* @param interruptIfEmpty
* @return
* /
public HashMap < String , indexContainer > getContainers ( final Set < String > wordHashes , final Set < String > urlselection , final boolean interruptIfEmpty ) {
2006-01-30 01:42:38 +01:00
// retrieve entities that belong to the hashes
2008-08-02 14:12:04 +02:00
final HashMap < String , indexContainer > containers = new HashMap < String , indexContainer > ( wordHashes . size ( ) ) ;
2006-01-30 01:42:38 +01:00
String singleHash ;
2006-05-28 03:09:31 +02:00
indexContainer singleContainer ;
2008-08-02 14:12:04 +02:00
final Iterator < String > i = wordHashes . iterator ( ) ;
2006-08-12 00:07:59 +02:00
while ( i . hasNext ( ) ) {
2006-01-30 01:42:38 +01:00
2006-09-11 13:12:42 +02:00
// get next word hash:
2008-06-06 18:01:27 +02:00
singleHash = i . next ( ) ;
2006-01-30 01:42:38 +01:00
2006-08-12 00:07:59 +02:00
// retrieve index
2007-09-04 01:43:55 +02:00
singleContainer = getContainer ( singleHash , urlselection ) ;
2006-01-30 01:42:38 +01:00
2006-08-12 00:07:59 +02:00
// check result
2008-02-27 16:16:47 +01:00
if ( ( ( singleContainer = = null ) | | ( singleContainer . size ( ) = = 0 ) ) & & ( interruptIfEmpty ) ) return new HashMap < String , indexContainer > ( 0 ) ;
2006-01-30 01:42:38 +01:00
2006-09-11 00:36:47 +02:00
containers . put ( singleHash , singleContainer ) ;
2006-08-12 00:07:59 +02:00
}
2006-01-30 01:42:38 +01:00
return containers ;
}
2008-01-19 01:40:19 +01:00
@SuppressWarnings ( " unchecked " )
2008-08-02 14:12:04 +02:00
public HashMap < String , indexContainer > [ ] localSearchContainers ( final plasmaSearchQuery query , final Set < String > urlselection ) {
2007-11-17 02:53:02 +01:00
// search for the set of hashes and return a map of of wordhash:indexContainer containing the seach result
// retrieve entities that belong to the hashes
2008-02-27 16:16:47 +01:00
HashMap < String , indexContainer > inclusionContainers = ( query . queryHashes . size ( ) = = 0 ) ? new HashMap < String , indexContainer > ( 0 ) : getContainers (
2007-11-17 02:53:02 +01:00
query . queryHashes ,
urlselection ,
true ) ;
2008-02-27 16:16:47 +01:00
if ( ( inclusionContainers . size ( ) ! = 0 ) & & ( inclusionContainers . size ( ) < query . queryHashes . size ( ) ) ) inclusionContainers = new HashMap < String , indexContainer > ( 0 ) ; // prevent that only a subset is returned
2008-08-02 14:12:04 +02:00
final HashMap < String , indexContainer > exclusionContainers = ( inclusionContainers . size ( ) = = 0 ) ? new HashMap < String , indexContainer > ( 0 ) : getContainers (
2007-11-17 02:53:02 +01:00
query . excludeHashes ,
urlselection ,
true ) ;
2008-02-27 16:16:47 +01:00
return new HashMap [ ] { inclusionContainers , exclusionContainers } ;
2007-11-17 02:53:02 +01:00
}
2005-05-07 23:11:18 +02:00
public int size ( ) {
2006-12-02 20:34:59 +01:00
return java . lang . Math . max ( collections . size ( ) , java . lang . Math . max ( dhtInCache . size ( ) , dhtOutCache . size ( ) ) ) ;
2005-04-07 21:19:42 +02:00
}
2005-09-20 17:43:31 +02:00
2008-04-08 16:44:39 +02:00
/** Returns the number of entries in the persistent collection index. */
public int collectionsSize() {
    return collections.size();
}
/** Returns the combined number of entries in both RAM caches. */
public int cacheSize() {
    return dhtInCache.size() + dhtOutCache.size();
}
2008-08-02 14:12:04 +02:00
public int indexSize ( final String wordHash ) {
2006-02-14 01:12:07 +01:00
int size = 0 ;
2006-11-19 21:05:25 +01:00
size + = dhtInCache . indexSize ( wordHash ) ;
size + = dhtOutCache . indexSize ( wordHash ) ;
2006-12-02 20:34:59 +01:00
size + = collections . indexSize ( wordHash ) ;
2006-02-14 01:12:07 +01:00
return size ;
2005-04-07 21:19:42 +02:00
}
2005-09-20 17:43:31 +02:00
2006-12-05 03:47:51 +01:00
/**
 * Shuts down the word index and all associated databases.
 * Closes the RAM caches first, then the persistent index structures,
 * and finally the peer-communication components.
 * NOTE(review): the close order is preserved deliberately — components
 * closed later may still be referenced while earlier ones flush.
 */
public void close() {
    dhtInCache.close();
    dhtOutCache.close();
    collections.close();
    referenceURL.close();
    seedDB.close();
    newsPool.close();
    profilesActiveCrawls.close();
    queuePreStack.close();
    peerActions.close();
}
2008-05-06 01:13:47 +02:00
2008-08-02 14:12:04 +02:00
public indexContainer deleteContainer ( final String wordHash ) {
final indexContainer c = new indexContainer (
2007-08-25 01:12:59 +02:00
wordHash ,
2007-11-07 23:38:09 +01:00
indexRWIRowEntry . urlEntryRow ,
2007-08-25 01:12:59 +02:00
dhtInCache . sizeContainer ( wordHash ) + dhtOutCache . sizeContainer ( wordHash ) + collections . indexSize ( wordHash )
) ;
2008-03-15 01:49:16 +01:00
c . addAllUnique ( dhtInCache . deleteContainer ( wordHash ) ) ;
c . addAllUnique ( dhtOutCache . deleteContainer ( wordHash ) ) ;
c . addAllUnique ( collections . deleteContainer ( wordHash ) ) ;
2006-11-19 21:05:25 +01:00
return c ;
2005-04-07 21:19:42 +02:00
}
2006-02-14 01:12:07 +01:00
2008-08-02 14:12:04 +02:00
public boolean removeEntry ( final String wordHash , final String urlHash ) {
2006-09-18 02:37:02 +02:00
boolean removed = false ;
2008-03-15 01:49:16 +01:00
removed = removed | ( dhtInCache . removeEntry ( wordHash , urlHash ) ) ;
removed = removed | ( dhtOutCache . removeEntry ( wordHash , urlHash ) ) ;
removed = removed | ( collections . removeEntry ( wordHash , urlHash ) ) ;
2006-11-19 21:05:25 +01:00
return removed ;
2006-08-01 12:30:55 +02:00
}
2008-08-02 14:12:04 +02:00
public int removeEntryMultiple ( final Set < String > wordHashes , final String urlHash ) {
2007-08-26 20:18:35 +02:00
// remove the same url hashes for multiple words
// this is mainly used when correcting a index after a search
2008-08-02 14:12:04 +02:00
final Iterator < String > i = wordHashes . iterator ( ) ;
2007-08-26 20:18:35 +02:00
int count = 0 ;
while ( i . hasNext ( ) ) {
2008-06-06 18:01:27 +02:00
if ( removeEntry ( i . next ( ) , urlHash ) ) count + + ;
2007-08-26 20:18:35 +02:00
}
return count ;
}
2008-08-02 14:12:04 +02:00
public int removeEntries ( final String wordHash , final Set < String > urlHashes ) {
2006-08-16 21:49:31 +02:00
int removed = 0 ;
2008-03-15 01:49:16 +01:00
removed + = dhtInCache . removeEntries ( wordHash , urlHashes ) ;
removed + = dhtOutCache . removeEntries ( wordHash , urlHashes ) ;
removed + = collections . removeEntries ( wordHash , urlHashes ) ;
2006-10-13 01:14:41 +02:00
return removed ;
}
2008-08-02 14:12:04 +02:00
public String removeEntriesExpl ( final String wordHash , final Set < String > urlHashes ) {
2006-10-13 01:14:41 +02:00
String removed = " " ;
2008-03-15 01:49:16 +01:00
removed + = dhtInCache . removeEntries ( wordHash , urlHashes ) + " , " ;
removed + = dhtOutCache . removeEntries ( wordHash , urlHashes ) + " , " ;
removed + = collections . removeEntries ( wordHash , urlHashes ) ;
2006-02-14 01:12:07 +01:00
return removed ;
}
2008-08-02 14:12:04 +02:00
public void removeEntriesMultiple ( final Set < String > wordHashes , final Set < String > urlHashes ) {
2007-08-26 20:18:35 +02:00
// remove the same url hashes for multiple words
// this is mainly used when correcting a index after a search
2008-08-02 14:12:04 +02:00
final Iterator < String > i = wordHashes . iterator ( ) ;
2007-08-26 20:18:35 +02:00
while ( i . hasNext ( ) ) {
2008-06-06 18:01:27 +02:00
removeEntries ( i . next ( ) , urlHashes ) ;
2007-03-13 23:18:36 +01:00
}
}
2008-08-02 14:12:04 +02:00
public int removeWordReferences ( final Set < String > words , final String urlhash ) {
2007-03-13 23:18:36 +01:00
// sequentially delete all word references
// returns number of deletions
2008-08-02 14:12:04 +02:00
final Iterator < String > iter = words . iterator ( ) ;
2007-03-13 23:18:36 +01:00
int count = 0 ;
while ( iter . hasNext ( ) ) {
// delete the URL reference in this word index
2008-06-06 18:01:27 +02:00
if ( removeEntry ( indexWord . word2hash ( iter . next ( ) ) , urlhash ) ) count + + ;
2006-12-06 13:51:46 +01:00
}
return count ;
}
2006-09-14 02:51:02 +02:00
2008-08-02 14:12:04 +02:00
/**
 * Creates a sorted set of up to {@code count} non-empty index containers,
 * starting at {@code startHash}. The set is ordered by a container order
 * rotated to the start hash. This does not use the dhtInCache.
 *
 * @param startHash word hash to start the iteration at
 * @param ram if true, only the RAM cache (dhtOutCache) is used
 * @param rot if true, the underlying iterator rotates at the end
 * @param count maximum number of containers to collect
 * @return a TreeSet of containers; may contain fewer than {@code count}
 */
public synchronized TreeSet<indexContainer> indexContainerSet(final String startHash, final boolean ram, final boolean rot, int count) {
    // creates a set of indexContainers
    // this does not use the dhtInCache
    final kelondroOrder<indexContainer> containerOrder = new indexContainerOrder(indexOrder.clone());
    containerOrder.rotate(emptyContainer(startHash, 0));
    final TreeSet<indexContainer> containers = new TreeSet<indexContainer>(containerOrder);
    final Iterator<indexContainer> i = wordContainers(startHash, ram, rot);
    // with ram only, never request more than the cache can hold
    if (ram) count = Math.min(dhtOutCache.size(), count);
    indexContainer container;
    // this loop does not terminate using the i.hasNex() predicate when rot == true
    // because then the underlying iterator is a rotating iterator without termination
    // in this case a termination must be ensured with a counter
    // It must also be ensured that the counter is in/decreased every loop
    while ((count > 0) && (i.hasNext())) {
        container = i.next();
        if ((container != null) && (container.size() > 0)) {
            containers.add(container);
        }
        count--; // decrease counter even if the container was null or empty to ensure termination
    }
    return containers; // this may return less containers as demanded
}
2006-12-05 03:47:51 +01:00
2008-08-02 14:12:04 +02:00
/**
 * Stores a fetched and parsed document: writes a new entry to the
 * loaded-URL database and then adds the document's words to the word index.
 *
 * @param entry the indexing queue entry describing the fetched resource
 * @param document the parsed document
 * @param condenser the condenser with the document's word statistics
 * @return the newly created loaded-URL database entry
 * @throws IOException if storing the URL entry fails
 */
public indexURLReference storeDocument(final IndexingStack.QueueEntry entry, final plasmaParserDocument document, final plasmaCondenser condenser) throws IOException {
    final long startTime = System.currentTimeMillis();

    // CREATE INDEX
    final String dc_title = document.dc_title();
    final yacyURL referrerURL = entry.referrerURL();
    final Date docDate = entry.getModificationDate();

    // determine the document language; fall back to the URL's TLD when
    // content-based identification fails
    String language = condenser.language();
    if (language == null) {
        System.out.println("*** DEBUG LANGUAGE: identification of " + entry.url() + " FAILED, taking TLD");
        language = entry.url().language();
    } else {
        System.out.println("*** DEBUG LANGUAGE: identification of " + entry.url() + " SUCCESS: " + language);
        if (language.equals("pl")) language = entry.url().language(); // patch a bug TODO: remove this if bug is fixed
    }

    // create a new loaded URL db entry
    final long ldate = System.currentTimeMillis();
    final indexURLReference newEntry = new indexURLReference(
            entry.url(),                               // URL
            dc_title,                                  // document description
            document.dc_creator(),                     // author
            document.dc_subject(' '),                  // tags
            "",                                        // ETag
            docDate,                                   // modification date
            new Date(),                                // loaded date
            new Date(ldate + Math.max(0, ldate - docDate.getTime()) / 2), // freshdate, computed with Proxy-TTL formula
            (referrerURL == null) ? null : referrerURL.hash(),            // referer hash
            new byte[0],                               // md5
            (int) entry.size(),                        // size
            condenser.RESULT_NUMB_WORDS,               // word count
            httpdProxyCacheEntry.docType(document.dc_format()), // doctype
            condenser.RESULT_FLAGS,                    // flags
            language,                                  // language
            document.inboundLinks(),                   // inbound links
            document.outboundLinks(),                  // outbound links
            document.getAudiolinks().size(),           // laudio
            document.getImages().size(),               // limage
            document.getVideolinks().size(),           // lvideo
            document.getApplinks().size()              // lapp
    );
    // STORE URL TO LOADED-URL-DB
    putURL(newEntry);
    final long storageEndTime = System.currentTimeMillis();

    // STORE PAGE INDEX INTO WORD INDEX DB
    final int words = addPageIndex(
            entry.url(),                               // document url
            docDate,                                   // document mod date
            document,                                  // document content
            condenser,                                 // document condenser
            language,                                  // document language
            httpdProxyCacheEntry.docType(document.dc_format()), // document type
            document.inboundLinks(),                   // inbound links
            document.outboundLinks()                   // outbound links
    );
    final long indexingEndTime = System.currentTimeMillis();

    if (log.isInfo()) {
        // TODO: UTF-8 docDescription seems not to be displayed correctly because
        // of string concatenation
        log.logInfo("*Indexed " + words + " words in URL " + entry.url() +
                " [" + entry.urlHash() + "]" +
                "\n\tDescription: " + dc_title +
                "\n\tMimeType: " + document.dc_format() + " | Charset: " + document.getCharset() + " | " +
                "Size: " + document.getTextLength() + " bytes | " +
                "Anchors: " + ((document.getAnchors() == null) ? 0 : document.getAnchors().size()) +
                "\n\tLinkStorageTime: " + (storageEndTime - startTime) + " ms | " +
                "indexStorageTime: " + (indexingEndTime - storageEndTime) + " ms");
        // announce the indexed page on the local/remote indexing RSS feed
        RSSFeed.channels((entry.initiator().equals(seedDB.mySeed().hash)) ? RSSFeed.LOCALINDEXING : RSSFeed.REMOTEINDEXING).addMessage(new RSSMessage("Indexed web page", dc_title, entry.url().toNormalform(true, false)));
    }
    // finished
    return newEntry;
}
2008-08-02 14:12:04 +02:00
public synchronized kelondroCloneableIterator < indexContainer > wordContainers ( final String startHash , final boolean ram , final boolean rot ) {
final kelondroCloneableIterator < indexContainer > i = wordContainers ( startHash , ram ) ;
2007-03-08 17:15:40 +01:00
if ( rot ) {
2008-01-25 12:44:27 +01:00
return new kelondroRotateIterator < indexContainer > ( i , new String ( kelondroBase64Order . zero ( startHash . length ( ) ) ) , dhtOutCache . size ( ) + ( ( ram ) ? 0 : collections . size ( ) ) ) ;
2007-03-08 17:15:40 +01:00
}
2008-08-02 15:57:00 +02:00
return i ;
2006-02-14 01:12:07 +01:00
}
2008-08-02 14:12:04 +02:00
public synchronized kelondroCloneableIterator < indexContainer > wordContainers ( final String startWordHash , final boolean ram ) {
final kelondroOrder < indexContainer > containerOrder = new indexContainerOrder ( indexOrder . clone ( ) ) ;
2008-01-11 01:12:01 +01:00
containerOrder . rotate ( emptyContainer ( startWordHash , 0 ) ) ;
2006-12-05 03:47:51 +01:00
if ( ram ) {
2006-09-14 02:51:02 +02:00
return dhtOutCache . wordContainers ( startWordHash , false ) ;
2006-12-05 03:47:51 +01:00
}
2008-08-02 15:57:00 +02:00
return new kelondroMergeIterator < indexContainer > (
dhtOutCache . wordContainers ( startWordHash , false ) ,
collections . wordContainers ( startWordHash , false ) ,
containerOrder ,
indexContainer . containerMergeMethod ,
true ) ;
2005-04-07 21:19:42 +02:00
}
2006-02-12 00:19:01 +01:00
2008-03-26 15:13:05 +01:00
2006-03-08 23:06:11 +01:00
// The Cleaner class was provided as "UrldbCleaner" by Hydrox
2008-08-02 14:12:04 +02:00
public synchronized ReferenceCleaner getReferenceCleaner ( final String startHash ) {
2008-03-26 15:13:05 +01:00
return new ReferenceCleaner ( startHash ) ;
2006-03-08 23:06:11 +01:00
}
2008-03-26 15:13:05 +01:00
/**
 * Background thread that sweeps the reverse word index and removes stale
 * URL references: entries whose URL is no longer in the loaded-URL
 * database, or whose URL is blacklisted. Supports pause/resume and abort
 * via the classic wait/notify protocol.
 */
public class ReferenceCleaner extends Thread {

    private final String startHash;       // word hash where the sweep starts
    private boolean run = true;           // cleared by abort() to stop the loop
    private boolean pause = false;        // toggled by pause()/endPause()
    public int rwiCountAtStart = 0;       // index size when the cleaner was created
    public String wordHashNow = "";       // word hash currently being processed
    public String lastWordHash = "";      // last word hash that had deletions
    public int lastDeletionCounter = 0;   // number of URLs deleted for lastWordHash

    public ReferenceCleaner(final String startHash) {
        this.startHash = startHash;
        this.rwiCountAtStart = size();
    }

    public void run() {
        serverLog.logInfo("INDEXCLEANER", "IndexCleaner-Thread started");
        indexContainer container = null;
        indexRWIRowEntry entry = null;
        yacyURL url = null;
        final HashSet<String> urlHashs = new HashSet<String>();
        // fetch containers in chunks of 100 word hashes at a time
        Iterator<indexContainer> indexContainerIterator = indexContainerSet(startHash, false, false, 100).iterator();
        while (indexContainerIterator.hasNext() && run) {
            waiter();
            container = indexContainerIterator.next();
            final Iterator<indexRWIRowEntry> containerIterator = container.entries();
            wordHashNow = container.getWordHash();
            while (containerIterator.hasNext() && run) {
                waiter();
                entry = containerIterator.next();
                // System.out.println("Wordhash: "+wordHash+" UrlHash:
                // "+entry.getUrlHash());
                final indexURLReference ue = referenceURL.load(entry.urlHash(), entry, 0);
                if (ue == null) {
                    // URL no longer known: mark the reference for deletion
                    urlHashs.add(entry.urlHash());
                } else {
                    url = ue.comp().url();
                    // also drop references to unloadable or blacklisted URLs
                    if ((url == null) || (plasmaSwitchboard.urlBlacklist.isListed(indexReferenceBlacklist.BLACKLIST_CRAWLER, url) == true)) {
                        urlHashs.add(entry.urlHash());
                    }
                }
            }
            if (urlHashs.size() > 0) {
                // delete all collected stale references for this word at once
                final int removed = removeEntries(container.getWordHash(), urlHashs);
                serverLog.logFine("INDEXCLEANER", container.getWordHash() + ": " + removed + " of " + container.size() + " URL-entries deleted");
                lastWordHash = container.getWordHash();
                lastDeletionCounter = urlHashs.size();
                urlHashs.clear();
            }
            if (!containerIterator.hasNext()) {
                // We may not be finished yet, try to get the next chunk of wordHashes
                final TreeSet<indexContainer> containers = indexContainerSet(container.getWordHash(), false, false, 100);
                indexContainerIterator = containers.iterator();
                // Make sure we don't get the same wordhash twice, but don't skip a word
                if ((indexContainerIterator.hasNext()) && (!container.getWordHash().equals(indexContainerIterator.next().getWordHash()))) {
                    indexContainerIterator = containers.iterator();
                }
            }
        }
        serverLog.logInfo("INDEXCLEANER", "IndexCleaner-Thread stopped");
    }

    /** Stops the cleaner loop and wakes it up if it is paused. */
    public void abort() {
        synchronized (this) {
            run = false;
            this.notifyAll();
        }
    }

    /** Requests the cleaner to pause at the next waiter() checkpoint. */
    public void pause() {
        synchronized (this) {
            if (!pause) {
                pause = true;
                serverLog.logInfo("INDEXCLEANER", "IndexCleaner-Thread paused");
            }
        }
    }

    /** Resumes a paused cleaner. */
    public void endPause() {
        synchronized (this) {
            if (pause) {
                pause = false;
                this.notifyAll();
                serverLog.logInfo("INDEXCLEANER", "IndexCleaner-Thread resumed");
            }
        }
    }

    /**
     * Checkpoint called inside the sweep loops: blocks while the cleaner
     * is paused; on interruption the thread shuts itself down.
     */
    public void waiter() {
        synchronized (this) {
            if (this.pause) {
                try {
                    this.wait();
                } catch (final InterruptedException e) {
                    this.run = false;
                    return;
                }
            }
        }
    }
}
2007-04-26 17:15:40 +02:00
2005-04-07 21:19:42 +02:00
}