// plasmaWordIndex.java
// (C) 2005, 2006 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 2005 on http://www.anomic.de
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.plasma;

import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;

import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.IndexingStack;
import de.anomic.data.Blacklist;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.http.httpdProxyCacheEntry;
import de.anomic.kelondro.blob.BLOBArray;
import de.anomic.kelondro.order.Base64Order;
import de.anomic.kelondro.order.ByteOrder;
import de.anomic.kelondro.text.BufferedIndex;
import de.anomic.kelondro.text.IndexCell;
import de.anomic.kelondro.text.IndexCollectionMigration;
import de.anomic.kelondro.text.ReferenceContainer;
import de.anomic.kelondro.text.IODispatcher;
import de.anomic.kelondro.text.MetadataRepository;
import de.anomic.kelondro.text.ReferenceFactory;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
import de.anomic.kelondro.text.referencePrototype.WordReference;
import de.anomic.kelondro.text.referencePrototype.WordReferenceFactory;
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
import de.anomic.kelondro.util.FileUtils;
import de.anomic.kelondro.util.kelondroException;
import de.anomic.kelondro.util.Log;
import de.anomic.plasma.parser.Word;
import de.anomic.plasma.parser.Condenser;
import de.anomic.tools.iso639;
import de.anomic.xml.RSSFeed;
import de.anomic.xml.RSSMessage;
import de.anomic.yacy.yacySeedDB;
import de.anomic.yacy.yacyURL;

public final class plasmaWordIndex {

    // environment constants
    public static final long wCacheMaxAge = 1000 * 60 * 30; // milliseconds; 30 minutes
    public static final int wCacheMaxChunk = 800; // maximum number of references for each urlhash
    public static final int lowcachedivisor = 900;
    public static final int maxCollectionPartition = 7; // should be 7
    public static final long targetFileSize = 100 * 1024 * 1024; // 100 MB
    public static final long maxFileSize = BLOBArray.oneGigabyte; // 1 GB

    // the reference factory
    public static final ReferenceFactory<WordReference> wordReferenceFactory = new WordReferenceFactory();

    public static final String CRAWL_PROFILE_PROXY = "proxy";
    public static final String CRAWL_PROFILE_REMOTE = "remote";
    public static final String CRAWL_PROFILE_SNIPPET_LOCAL_TEXT = "snippetLocalText";
    public static final String CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT = "snippetGlobalText";
    public static final String CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA = "snippetLocalMedia";
    public static final String CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA = "snippetGlobalMedia";
    public static final String CRAWL_PROFILE_SURROGATE = "surrogates";
    public static final String DBFILE_ACTIVE_CRAWL_PROFILES = "crawlProfilesActive.heap";
    public static final String DBFILE_PASSIVE_CRAWL_PROFILES = "crawlProfilesPassive.heap";

    public static final long CRAWL_PROFILE_PROXY_RECRAWL_CYCLE = 60L * 24L;
    public static final long CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE = 60L * 24L * 30L;
    public static final long CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE = 60L * 24L * 30L;
    public static final long CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE = 60L * 24L * 30L;
    public static final long CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE = 60L * 24L * 30L;
    public static final long CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE = 60L * 24L * 30L;

    public static final ByteOrder wordOrder = Base64Order.enhancedCoder;

    private final BufferedIndex<WordReference> index;
    private final Log log;
    private MetadataRepository metadata;
    private final yacySeedDB peers;
    private final File primaryRoot, secondaryRoot;
    public IndexingStack queuePreStack;
    public CrawlProfile profilesActiveCrawls, profilesPassiveCrawls;
    public CrawlProfile.entry defaultProxyProfile;
    public CrawlProfile.entry defaultRemoteProfile;
    public CrawlProfile.entry defaultTextSnippetLocalProfile, defaultTextSnippetGlobalProfile;
    public CrawlProfile.entry defaultMediaSnippetLocalProfile, defaultMediaSnippetGlobalProfile;
    public CrawlProfile.entry defaultSurrogateProfile;
    private final File queuesRoot;
    private IODispatcher<WordReference> merger;

    public plasmaWordIndex(
            final String networkName,
            final Log log,
            final File indexPrimaryRoot,
            final File indexSecondaryRoot,
            final int entityCacheMaxSize,
            final boolean useCommons,
            final int redundancy,
            final int partitionExponent) throws IOException {
        log.logInfo("Initializing Word Index for the network '" + networkName + "', word hash cache size is " + Word.hashCacheSize + ".");
        if (networkName == null || networkName.length() == 0) {
            log.logSevere("no network name given - shutting down");
            System.exit(0);
        }
        this.log = log;
        this.primaryRoot = new File(indexPrimaryRoot, networkName);
        this.secondaryRoot = new File(indexSecondaryRoot, networkName);
        File indexPrimaryTextLocation = new File(this.primaryRoot, "TEXT");
        if (!indexPrimaryTextLocation.exists()) {
            // patch old index locations; the secondary path is patched in plasmaCrawlLURL
            final File oldPrimaryPath = new File(new File(indexPrimaryRoot, "PUBLIC"), "TEXT");
            final File oldPrimaryTextLocation = new File(new File(indexPrimaryRoot, "PUBLIC"), "TEXT");
            if (oldPrimaryPath.exists() && oldPrimaryTextLocation.exists()) {
                // move the text folder from the old location to the new location
                assert !indexPrimaryTextLocation.exists();
                indexPrimaryTextLocation.mkdirs();
                if (oldPrimaryTextLocation.renameTo(indexPrimaryTextLocation)) {
                    FileUtils.deletedelete(oldPrimaryPath);
                } else {
                    indexPrimaryTextLocation = oldPrimaryTextLocation; // emergency case: stay with old directory
                }
            }
        }

        // check if the peer has migrated the index
        if (new File(indexPrimaryTextLocation, "RICOLLECTION").exists()) {
            this.merger = new IODispatcher<WordReference>(plasmaWordIndex.wordReferenceFactory, 1, 1);
            if (this.merger != null) this.merger.start();
            this.index = new IndexCollectionMigration<WordReference>(
                    indexPrimaryTextLocation,
                    wordReferenceFactory,
                    wordOrder,
                    WordReferenceRow.urlEntryRow,
                    entityCacheMaxSize,
                    targetFileSize,
                    maxFileSize,
                    this.merger,
                    log);
        } else {
            this.merger = new IODispatcher<WordReference>(plasmaWordIndex.wordReferenceFactory, 1, 1);
            this.merger.start();
            this.index = new IndexCell<WordReference>(
                    new File(indexPrimaryTextLocation, "RICELL"),
                    wordReferenceFactory,
                    wordOrder,
                    WordReferenceRow.urlEntryRow,
                    entityCacheMaxSize,
                    targetFileSize,
                    maxFileSize,
                    this.merger);
        }

        // migrate LURL-db files into the new subdirectory METADATA
        final File textdir = new File(this.secondaryRoot, "TEXT");
        final File metadatadir = new File(textdir, "METADATA");
        if (!metadatadir.exists()) metadatadir.mkdirs();
        final String[] l = textdir.list();
        for (int i = 0; i < l.length; i++) {
            if (l[i].startsWith("urls.")) (new File(textdir, l[i])).renameTo(new File(metadatadir, l[i]));
        }

        // create LURL-db
        metadata = new MetadataRepository(metadatadir);

        // make crawl profiles database and default profiles
        this.queuesRoot = new File(this.primaryRoot, "QUEUES");
        this.queuesRoot.mkdirs();
        this.log.logConfig("Initializing Crawl Profiles");
        final File profilesActiveFile = new File(queuesRoot, DBFILE_ACTIVE_CRAWL_PROFILES);
        if (!profilesActiveFile.exists()) {
            // migrate old file
            final File oldFile = new File(new File(queuesRoot.getParentFile().getParentFile().getParentFile(), "PLASMADB"), "crawlProfilesActive1.db");
            if (oldFile.exists()) oldFile.renameTo(profilesActiveFile);
        }
        try {
            this.profilesActiveCrawls = new CrawlProfile(profilesActiveFile);
        } catch (final IOException e) {
            FileUtils.deletedelete(profilesActiveFile);
            try {
                this.profilesActiveCrawls = new CrawlProfile(profilesActiveFile);
            } catch (final IOException e1) {
                e1.printStackTrace();
                this.profilesActiveCrawls = null;
            }
        }
        initActiveCrawlProfiles();
        log.logInfo("Loaded active crawl profiles from file " + profilesActiveFile.getName() +
                ", " + this.profilesActiveCrawls.size() + " entries" +
                ", " + profilesActiveFile.length() / 1024);
        final File profilesPassiveFile = new File(queuesRoot, DBFILE_PASSIVE_CRAWL_PROFILES);
        if (!profilesPassiveFile.exists()) {
            // migrate old file
            final File oldFile = new File(new File(queuesRoot.getParentFile().getParentFile().getParentFile(), "PLASMADB"), "crawlProfilesPassive1.db");
            if (oldFile.exists()) oldFile.renameTo(profilesPassiveFile);
        }
        try {
            this.profilesPassiveCrawls = new CrawlProfile(profilesPassiveFile);
        } catch (final IOException e) {
            FileUtils.deletedelete(profilesPassiveFile);
            try {
                this.profilesPassiveCrawls = new CrawlProfile(profilesPassiveFile);
            } catch (final IOException e1) {
                e1.printStackTrace();
                this.profilesPassiveCrawls = null;
            }
        }
        log.logInfo("Loaded passive crawl profiles from file " + profilesPassiveFile.getName() +
                ", " + this.profilesPassiveCrawls.size() + " entries" +
                ", " + profilesPassiveFile.length() / 1024);

        // init queues
        final File preStackFile = new File(queuesRoot, "urlNoticePreStack");
        if (!preStackFile.exists()) {
            // migrate old file
            final File oldFile = new File(new File(queuesRoot.getParentFile().getParentFile().getParentFile(), "PLASMADB"), "switchboardQueue.stack");
            if (oldFile.exists()) oldFile.renameTo(preStackFile);
        }
        this.queuePreStack = new IndexingStack(this, preStackFile, this.profilesActiveCrawls);

        // create or init seed cache
        final File networkRoot = new File(this.primaryRoot, "NETWORK");
        networkRoot.mkdirs();
        final File mySeedFile = new File(networkRoot, yacySeedDB.DBFILE_OWN_SEED);
        final File oldSeedFile = new File(new File(indexPrimaryRoot.getParentFile(), "YACYDB"), "mySeed.txt");
        if (oldSeedFile.exists()) oldSeedFile.renameTo(mySeedFile);
        peers = new yacySeedDB(
                networkRoot,
                "seed.new.heap",
                "seed.old.heap",
                "seed.pot.heap",
                mySeedFile,
                redundancy,
                partitionExponent);
    }
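
    /*
     * Construction sketch: a minimal, hypothetical invocation with made-up
     * values (the real caller is the switchboard, which takes these
     * parameters from its configuration):
     *
     *   plasmaWordIndex wordIndex = new plasmaWordIndex(
     *           "freeworld",            // networkName
     *           new Log("PLASMA"),      // log
     *           new File("DATA/INDEX"), // indexPrimaryRoot
     *           new File("DATA/INDEX"), // indexSecondaryRoot
     *           100000,                 // entityCacheMaxSize
     *           false,                  // useCommons
     *           1,                      // redundancy
     *           2);                     // partitionExponent
     */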

    public MetadataRepository metadata() {
        return this.metadata;
    }

    public yacySeedDB peers() {
        return this.peers;
    }

    public BufferedIndex<WordReference> index() {
        return this.index;
    }

    public void clear() {
        try {
            index.clear();
            metadata.clear();
        } catch (final IOException e) {
            e.printStackTrace();
        }
        queuePreStack.clear();
    }

    private void initActiveCrawlProfiles() {
        this.defaultProxyProfile = null;
        this.defaultRemoteProfile = null;
        this.defaultTextSnippetLocalProfile = null;
        this.defaultTextSnippetGlobalProfile = null;
        this.defaultMediaSnippetLocalProfile = null;
        this.defaultMediaSnippetGlobalProfile = null;
        this.defaultSurrogateProfile = null;
        final Iterator<CrawlProfile.entry> i = this.profilesActiveCrawls.profiles(true);
        CrawlProfile.entry profile;
        String name;
        try {
            while (i.hasNext()) {
                profile = i.next();
                name = profile.name();
                if (name.equals(CRAWL_PROFILE_PROXY)) this.defaultProxyProfile = profile;
                if (name.equals(CRAWL_PROFILE_REMOTE)) this.defaultRemoteProfile = profile;
                if (name.equals(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT)) this.defaultTextSnippetLocalProfile = profile;
                if (name.equals(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT)) this.defaultTextSnippetGlobalProfile = profile;
                if (name.equals(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA)) this.defaultMediaSnippetLocalProfile = profile;
                if (name.equals(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA)) this.defaultMediaSnippetGlobalProfile = profile;
                if (name.equals(CRAWL_PROFILE_SURROGATE)) this.defaultSurrogateProfile = profile;
            }
        } catch (final Exception e) {
            this.profilesActiveCrawls.clear();
            this.defaultProxyProfile = null;
            this.defaultRemoteProfile = null;
            this.defaultTextSnippetLocalProfile = null;
            this.defaultTextSnippetGlobalProfile = null;
            this.defaultMediaSnippetLocalProfile = null;
            this.defaultMediaSnippetGlobalProfile = null;
            this.defaultSurrogateProfile = null;
        }

        if (this.defaultProxyProfile == null) {
            // generate new default entry for proxy crawling
            this.defaultProxyProfile = this.profilesActiveCrawls.newEntry("proxy", null, CrawlProfile.KEYWORDS_PROXY, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER,
                    0 /*Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0"))*/,
                    this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, -1, false,
                    true /*getConfigBool(PROXY_INDEXING_LOCAL_TEXT, true)*/,
                    true /*getConfigBool(PROXY_INDEXING_LOCAL_MEDIA, true)*/,
                    true, true,
                    false /*getConfigBool(PROXY_INDEXING_REMOTE, false)*/, true, true, true);
        }
        if (this.defaultRemoteProfile == null) {
            // generate new default entry for remote crawling
            defaultRemoteProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_REMOTE, null, CrawlProfile.KEYWORDS_REMOTE, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
                    -1, -1, -1, true, true, true, false, true, false, true, true, false);
        }
        if (this.defaultTextSnippetLocalProfile == null) {
            // generate new default entry for snippet fetch and optional crawling
            defaultTextSnippetLocalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.KEYWORDS_SNIPPET, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
                    this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, -1, true, false, false, false, false, false, true, true, false);
        }
        if (this.defaultTextSnippetGlobalProfile == null) {
            // generate new default entry for snippet fetch and optional crawling
            defaultTextSnippetGlobalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.KEYWORDS_SNIPPET, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
                    this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, -1, true, true, true, true, true, false, true, true, false);
        }
        if (this.defaultMediaSnippetLocalProfile == null) {
            // generate new default entry for snippet fetch and optional crawling
            defaultMediaSnippetLocalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.KEYWORDS_SNIPPET, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
                    this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, false, false, false, false, true, true, false);
        }
        if (this.defaultMediaSnippetGlobalProfile == null) {
            // generate new default entry for snippet fetch and optional crawling
            defaultMediaSnippetGlobalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.KEYWORDS_SNIPPET, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
                    this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, true, true, true, false, true, true, false);
        }
        if (this.defaultSurrogateProfile == null) {
            // generate new default entry for surrogate parsing
            defaultSurrogateProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.KEYWORDS_SNIPPET, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
                    this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, -1, true, true, false, false, false, false, true, true, false);
        }
    }

    private void resetProfiles() {
        final File pdb = new File(this.queuesRoot, DBFILE_ACTIVE_CRAWL_PROFILES);
        if (pdb.exists()) FileUtils.deletedelete(pdb);
        try {
            profilesActiveCrawls = new CrawlProfile(pdb);
        } catch (final IOException e) {
            e.printStackTrace();
        }
        initActiveCrawlProfiles();
    }

    public boolean cleanProfiles() throws InterruptedException {
        if (queuePreStack.size() > 0) return false;
        final Iterator<CrawlProfile.entry> iter = profilesActiveCrawls.profiles(true);
        CrawlProfile.entry entry;
        boolean hasDoneSomething = false;
        try {
            while (iter.hasNext()) {
                // check for interruption
                if (Thread.currentThread().isInterrupted()) throw new InterruptedException("Shutdown in progress");
                // get next profile
                entry = iter.next();
                if (!((entry.name().equals(CRAWL_PROFILE_PROXY)) ||
                      (entry.name().equals(CRAWL_PROFILE_REMOTE)) ||
                      (entry.name().equals(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT)) ||
                      (entry.name().equals(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT)) ||
                      (entry.name().equals(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA)) ||
                      (entry.name().equals(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA)) ||
                      (entry.name().equals(CRAWL_PROFILE_SURROGATE)))) {
                    // not one of the default profiles: move it to the passive profile DB
                    profilesPassiveCrawls.newEntry(entry.map());
                    iter.remove();
                    hasDoneSomething = true;
                }
            }
        } catch (final kelondroException e) {
            resetProfiles();
            hasDoneSomething = true;
        }
        return hasDoneSomething;
    }

    public File getLocation(final boolean primary) {
        return (primary) ? this.primaryRoot : this.secondaryRoot;
    }

    /**
     * This method is called by the switchboard to put a new page into the index.
     * It uses all the words in one condenser object to create the index entries simultaneously.
     *
     * @param url
     * @param urlModified
     * @param document
     * @param condenser
     * @param language
     * @param doctype
     * @param outlinksSame
     * @param outlinksOther
     * @return the number of words that have been added to the index
     */
    public int addPageIndex(final yacyURL url, final Date urlModified, final plasmaParserDocument document, final Condenser condenser, final String language, final char doctype, final int outlinksSame, final int outlinksOther) {
        int wordCount = 0;
        final int urlLength = url.toNormalform(true, true).length();
        final int urlComps = htmlFilterContentScraper.urlComps(url.toString()).length;

        // iterate over all words of the content text
        final Iterator<Map.Entry<String, Word>> i = condenser.words().entrySet().iterator();
        Map.Entry<String, Word> wentry;
        String word;
        final int len = (document == null) ? urlLength : document.dc_title().length();
        final WordReferenceRow ientry = new WordReferenceRow(url.hash(),
                urlLength, urlComps, len,
                condenser.RESULT_NUMB_WORDS,
                condenser.RESULT_NUMB_SENTENCES,
                urlModified.getTime(),
                System.currentTimeMillis(),
                language,
                doctype,
                outlinksSame, outlinksOther);
        Word wprop;
        while (i.hasNext()) {
            wentry = i.next();
            word = wentry.getKey();
            wprop = wentry.getValue();
            assert (wprop.flags != null);
            ientry.setWord(wprop);
            try {
                this.index.add(Word.word2hash(word), ientry);
            } catch (final IOException e) {
                e.printStackTrace();
            }
            wordCount++;
        }
        return wordCount;
    }

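    /*
     * Indexing sketch (hypothetical call; storeDocument() below issues exactly
     * this kind of call after a document has been parsed and condensed):
     *
     *   final int words = wordIndex.addPageIndex(
     *           entry.url(), docDate, document, condenser,
     *           language,
     *           httpdProxyCacheEntry.docType(document.dc_format()),
     *           document.inboundLinks(), document.outboundLinks());
     */
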
    public void close() {
        if (this.merger != null) this.merger.terminate();
        index.close();
        metadata.close();
        peers.close();
        profilesActiveCrawls.close();
        queuePreStack.close();
    }

    public URLMetadataRow storeDocument(final IndexingStack.QueueEntry entry, final plasmaParserDocument document, final Condenser condenser) throws IOException {
        final long startTime = System.currentTimeMillis();

        // CREATE INDEX

        // load some document metadata
        final String dc_title = document.dc_title();
        final yacyURL referrerURL = entry.referrerURL();
        final Date docDate = entry.getModificationDate();

        // identify the document language
        String language = condenser.language(); // a statistical analysis of the content; it will be compared with the other attributes
        final String bymetadata = document.dc_language(); // the language by metadata may be null if there was no declaration
        if (language == null) {
            // no statistics available, we take either the metadata (if given) or the TLD
            language = (bymetadata == null) ? entry.url().language() : bymetadata;
            System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " FAILED, taking " + ((bymetadata == null) ? "TLD" : "metadata") + ": " + language);
        } else {
            if (bymetadata == null) {
                // two possible results: compare and report conflicts
                if (language.equals(entry.url().language())) {
                    System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - TLD IDENTICAL: " + language);
                } else {
                    final String error = "*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: " + language + " (the language given by the TLD is " + entry.url().language() + ")";
                    // see if we have a hint in the url that the statistic was right
                    final String u = entry.url().toNormalform(true, false).toLowerCase();
                    if (!u.contains("/" + language + "/") && !u.contains("/" + iso639.country(language).toLowerCase() + "/")) {
                        // no confirmation from the url, use the TLD
                        language = entry.url().language();
                        System.out.println(error + ", corrected using the TLD");
                    } else {
                        // this is a strong hint that the statistic was in fact correct
                        System.out.println(error + ", but the url proves that the statistic is correct");
                    }
                }
            } else {
                // here we have three results: we can do a voting
                if (language.equals(bymetadata)) {
                    System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - METADATA IDENTICAL: " + language);
                } else if (language.equals(entry.url().language())) {
                    System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - TLD IS IDENTICAL: " + language);
                } else if (bymetadata.equals(entry.url().language())) {
                    System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: " + language + " BUT METADATA AND TLD ARE IDENTICAL: " + bymetadata);
                    language = bymetadata;
                } else {
                    System.out.println("*** DEBUG LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: ALL DIFFERENT! statistic: " + language + ", metadata: " + bymetadata + ", TLD: " + entry.url().language() + ". taking metadata.");
                    language = bymetadata;
                }
            }
        }

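        // In summary, the voting above resolves the language as follows
        // ("statistic" = language guessed from the content):
        //   statistic missing                -> metadata if present, else TLD
        //   metadata missing, TLD agrees     -> statistic
        //   metadata missing, TLD disagrees  -> TLD, unless the URL path mentions the statistic language
        //   statistic == metadata or == TLD  -> statistic
        //   metadata == TLD != statistic     -> metadata
        //   all three different              -> metadata
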
        // create a new loaded URL db entry
        final long ldate = System.currentTimeMillis();
        final URLMetadataRow newEntry = new URLMetadataRow(
                entry.url(),                                  // URL
                dc_title,                                     // document description
                document.dc_creator(),                        // author
                document.dc_subject(' '),                     // tags
                "",                                           // ETag
                docDate,                                      // modification date
                new Date(),                                   // loaded date
                new Date(ldate + Math.max(0, ldate - docDate.getTime()) / 2), // freshdate, computed with Proxy-TTL formula
                (referrerURL == null) ? null : referrerURL.hash(), // referer hash
                new byte[0],                                  // md5
                (int) entry.size(),                           // size
                condenser.RESULT_NUMB_WORDS,                  // word count
                httpdProxyCacheEntry.docType(document.dc_format()), // doctype
                condenser.RESULT_FLAGS,                       // flags
                language,                                     // language
                document.inboundLinks(),                      // inbound links
                document.outboundLinks(),                     // outbound links
                document.getAudiolinks().size(),              // laudio
                document.getImages().size(),                  // limage
                document.getVideolinks().size(),              // lvideo
                document.getApplinks().size()                 // lapp
        );

        // STORE URL TO LOADED-URL-DB
        metadata.store(newEntry); // TODO: should be serialized; integrated in IODispatcher

        final long storageEndTime = System.currentTimeMillis();

        // STORE PAGE INDEX INTO WORD INDEX DB
        final int words = addPageIndex(
                entry.url(),                                  // document url
                docDate,                                      // document mod date
                document,                                     // document content
                condenser,                                    // document condenser
                language,                                     // document language
                httpdProxyCacheEntry.docType(document.dc_format()), // document type
                document.inboundLinks(),                      // inbound links
                document.outboundLinks()                      // outbound links
        );

        final long indexingEndTime = System.currentTimeMillis();

        if (log.isInfo()) {
            // TODO: UTF-8 docDescription seems not to be displayed correctly because
            // of string concatenation
            log.logInfo("*Indexed " + words + " words in URL " + entry.url() +
                    " [" + entry.urlHash() + "]" +
                    "\n\tDescription: " + dc_title +
                    "\n\tMimeType: " + document.dc_format() + " | Charset: " + document.getCharset() + " | " +
                    "Size: " + document.getTextLength() + " bytes | " +
                    "Anchors: " + ((document.getAnchors() == null) ? 0 : document.getAnchors().size()) +
                    "\n\tLinkStorageTime: " + (storageEndTime - startTime) + " ms | " +
                    "indexStorageTime: " + (indexingEndTime - storageEndTime) + " ms");
            RSSFeed.channels((entry.initiator().equals(peers.mySeed().hash)) ? RSSFeed.LOCALINDEXING : RSSFeed.REMOTEINDEXING).addMessage(new RSSMessage("Indexed web page", dc_title, entry.url().toNormalform(true, false)));
        }

        // finished
        return newEntry;
    }

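    /*
     * storeDocument() is a three-step pipeline: (1) resolve the document
     * language (statistics vs. metadata vs. TLD, see above), (2) store the
     * URLMetadataRow in the metadata repository, and (3) feed every word of
     * the condenser into the reverse word index via addPageIndex(). The two
     * storage steps are timed separately and reported as LinkStorageTime and
     * indexStorageTime in the log line above.
     */
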
    @SuppressWarnings("unchecked")
    public HashMap<byte[], ReferenceContainer<WordReference>>[] localSearchContainers(
            final TreeSet<byte[]> queryHashes,
            final TreeSet<byte[]> excludeHashes,
            final Set<String> urlselection) {
        // search for the set of hashes and return a map of wordhash:indexContainer containing the search result
        // retrieve entities that belong to the hashes
        HashMap<byte[], ReferenceContainer<WordReference>> inclusionContainers =
            (queryHashes.size() == 0) ?
                new HashMap<byte[], ReferenceContainer<WordReference>>(0) :
                getContainers(queryHashes, urlselection);
        if ((inclusionContainers.size() != 0) && (inclusionContainers.size() < queryHashes.size())) inclusionContainers = new HashMap<byte[], ReferenceContainer<WordReference>>(0); // prevent that only a subset is returned
        final HashMap<byte[], ReferenceContainer<WordReference>> exclusionContainers =
            (inclusionContainers.size() == 0) ?
                new HashMap<byte[], ReferenceContainer<WordReference>>(0) :
                getContainers(excludeHashes, urlselection);
        return new HashMap[]{inclusionContainers, exclusionContainers};
    }

    /**
     * Collect containers for the given word hashes. The collection stops if a single container
     * does not contain any references; in that case only an empty result is returned.
     * @param wordHashes
     * @param urlselection
     * @return a map of wordhash:indexContainer
     */
    private HashMap<byte[], ReferenceContainer<WordReference>> getContainers(final TreeSet<byte[]> wordHashes, final Set<String> urlselection) {
        // retrieve entities that belong to the hashes
        final HashMap<byte[], ReferenceContainer<WordReference>> containers = new HashMap<byte[], ReferenceContainer<WordReference>>(wordHashes.size());
        byte[] singleHash;
        ReferenceContainer<WordReference> singleContainer;
        final Iterator<byte[]> i = wordHashes.iterator();
        while (i.hasNext()) {
            // get next word hash
            singleHash = i.next();
            // retrieve index
            try {
                singleContainer = index.get(singleHash, urlselection);
            } catch (final IOException e) {
                e.printStackTrace();
                continue;
            }
            // check result
            if ((singleContainer == null || singleContainer.size() == 0)) return new HashMap<byte[], ReferenceContainer<WordReference>>(0);
            containers.put(singleHash, singleContainer);
        }
        return containers;
    }

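    /*
     * Search sketch (hypothetical; queryHashes and excludeHashes are
     * TreeSet<byte[]> word hashes as produced by Word.word2hash(), and
     * urlselection may be null to accept all URLs):
     *
     *   final HashMap<byte[], ReferenceContainer<WordReference>>[] result =
     *           wordIndex.localSearchContainers(queryHashes, excludeHashes, null);
     *   // result[0]: inclusion containers, result[1]: exclusion containers;
     *   // result[0] is empty unless every query hash produced references.
     */
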
    // The Cleaner class was provided as "UrldbCleaner" by Hydrox
    public synchronized ReferenceCleaner getReferenceCleaner(final byte[] startHash) {
        return new ReferenceCleaner(startHash);
    }

    public class ReferenceCleaner extends Thread {

        private final byte[] startHash;
        private boolean run = true;
        private boolean pause = false;
        public int rwiCountAtStart = 0;
        public byte[] wordHashNow = null;
        public byte[] lastWordHash = null;
        public int lastDeletionCounter = 0;

        public ReferenceCleaner(final byte[] startHash) {
            this.startHash = startHash;
            this.rwiCountAtStart = index().size();
        }

        public void run() {
            Log.logInfo("INDEXCLEANER", "IndexCleaner-Thread started");
            ReferenceContainer<WordReference> container = null;
            WordReference entry = null;
            yacyURL url = null;
            final HashSet<String> urlHashs = new HashSet<String>();
            try {
                Iterator<ReferenceContainer<WordReference>> indexContainerIterator = index.references(startHash, false, 100, false).iterator();
                while (indexContainerIterator.hasNext() && run) {
                    waiter();
                    container = indexContainerIterator.next();
                    final Iterator<WordReference> containerIterator = container.entries();
                    wordHashNow = container.getTermHash();
                    while (containerIterator.hasNext() && run) {
                        waiter();
                        entry = containerIterator.next();
                        // collect the url hash if the metadata entry is missing or the url is blacklisted
                        final URLMetadataRow ue = metadata.load(entry.metadataHash(), entry, 0);
                        if (ue == null) {
                            urlHashs.add(entry.metadataHash());
                        } else {
                            url = ue.metadata().url();
                            if ((url == null) || (plasmaSwitchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, url))) {
                                urlHashs.add(entry.metadataHash());
                            }
                        }
                    }
                    if (urlHashs.size() > 0) try {
                        final int removed = index.remove(container.getTermHash(), urlHashs);
                        Log.logFine("INDEXCLEANER", container.getTermHashAsString() + ": " + removed + " of " + container.size() + " URL-entries deleted");
                        lastWordHash = container.getTermHash();
                        lastDeletionCounter = urlHashs.size();
                        urlHashs.clear();
                    } catch (final IOException e) {
                        e.printStackTrace();
                    }
                    if (!containerIterator.hasNext()) {
                        // We may not be finished yet, try to get the next chunk of wordHashes
                        final TreeSet<ReferenceContainer<WordReference>> containers = index.references(container.getTermHash(), false, 100, false);
                        indexContainerIterator = containers.iterator();
                        // Make sure we don't get the same wordhash twice, but don't skip a word
                        // (Arrays.equals compares the hash content; byte[].equals would only compare references)
                        if ((indexContainerIterator.hasNext()) && (!Arrays.equals(container.getTermHash(), indexContainerIterator.next().getTermHash()))) {
                            indexContainerIterator = containers.iterator();
                        }
                    }
                }
            } catch (final IOException e) {
                e.printStackTrace();
            }
            Log.logInfo("INDEXCLEANER", "IndexCleaner-Thread stopped");
        }

        public void abort() {
            synchronized (this) {
                run = false;
                this.notifyAll();
            }
        }

        public void pause() {
            synchronized (this) {
                if (!pause) {
                    pause = true;
                    Log.logInfo("INDEXCLEANER", "IndexCleaner-Thread paused");
                }
            }
        }

        public void endPause() {
            synchronized (this) {
                if (pause) {
                    pause = false;
                    this.notifyAll();
                    Log.logInfo("INDEXCLEANER", "IndexCleaner-Thread resumed");
                }
            }
        }

        public void waiter() {
            synchronized (this) {
                if (this.pause) {
                    try {
                        this.wait();
                    } catch (final InterruptedException e) {
                        this.run = false;
                        return;
                    }
                }
            }
        }
    }
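
    /*
     * Cleaner usage sketch (hypothetical): the cleaner runs as its own thread
     * and can be paused and resumed while it walks the reverse word index:
     *
     *   final ReferenceCleaner cleaner = wordIndex.getReferenceCleaner(startHash);
     *   cleaner.start();     // scan containers beginning at startHash
     *   cleaner.pause();     // suspend at the current word hash
     *   cleaner.endPause();  // resume
     *   cleaner.abort();     // terminate the scan
     */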
}