2012-08-17 15:33:02 +02:00
// Fulltext.java
2008-03-26 15:13:05 +01:00
// (C) 2006 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 2006 as part of 'plasmaCrawlLURL.java' on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// LICENSE
2011-07-15 10:38:10 +02:00
//
2008-03-26 15:13:05 +01:00
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
2011-09-25 18:59:06 +02:00
package net.yacy.search.index ;
2008-03-26 15:13:05 +01:00
import java.io.BufferedOutputStream ;
import java.io.File ;
import java.io.FileOutputStream ;
import java.io.IOException ;
import java.io.PrintWriter ;
2012-11-02 01:22:31 +01:00
import java.net.MalformedURLException ;
2008-09-04 21:41:57 +02:00
import java.util.ArrayList ;
2013-02-22 15:45:15 +01:00
import java.util.Collection ;
2012-09-26 16:56:33 +02:00
import java.util.Date ;
2013-05-17 13:59:37 +02:00
import java.util.HashSet ;
2008-03-26 15:13:05 +01:00
import java.util.Iterator ;
2012-09-24 17:05:28 +02:00
import java.util.List ;
2008-09-04 21:41:57 +02:00
import java.util.Map ;
2013-05-17 13:59:37 +02:00
import java.util.Set ;
2012-11-02 01:22:31 +01:00
import java.util.concurrent.BlockingQueue ;
2012-11-13 16:54:28 +01:00
import java.util.concurrent.atomic.AtomicInteger ;
2013-01-24 12:39:19 +01:00
import java.util.regex.Pattern ;
2008-03-26 15:13:05 +01:00
2012-09-24 17:05:28 +02:00
import net.yacy.cora.date.GenericFormatter ;
2012-11-13 16:54:28 +01:00
import net.yacy.cora.date.ISO8601Formatter ;
2013-09-15 00:30:23 +02:00
import net.yacy.cora.document.encoding.ASCII ;
import net.yacy.cora.document.id.DigestURL ;
import net.yacy.cora.document.id.MultiProtocolURL ;
2012-11-02 01:22:31 +01:00
import net.yacy.cora.federate.solr.connector.AbstractSolrConnector ;
2012-09-25 21:20:03 +02:00
import net.yacy.cora.federate.solr.connector.EmbeddedSolrConnector ;
2013-02-21 13:23:55 +01:00
import net.yacy.cora.federate.solr.connector.RemoteSolrConnector ;
import net.yacy.cora.federate.solr.connector.ShardSelection ;
2012-09-25 21:20:03 +02:00
import net.yacy.cora.federate.solr.connector.SolrConnector ;
2013-02-21 13:23:55 +01:00
import net.yacy.cora.federate.solr.instance.EmbeddedInstance ;
import net.yacy.cora.federate.solr.instance.InstanceMirror ;
import net.yacy.cora.federate.solr.instance.RemoteInstance ;
import net.yacy.cora.federate.solr.instance.ShardInstance ;
2011-12-16 23:59:29 +01:00
import net.yacy.cora.order.CloneableIterator ;
2013-05-09 00:22:45 +02:00
import net.yacy.cora.protocol.HeaderFramework ;
2013-01-24 12:39:19 +01:00
import net.yacy.cora.sorting.ReversibleScoreMap ;
2011-12-16 23:59:29 +01:00
import net.yacy.cora.sorting.ScoreMap ;
2013-02-27 22:40:23 +01:00
import net.yacy.cora.sorting.WeakPriorityBlockingQueue ;
2012-09-24 17:05:28 +02:00
import net.yacy.cora.storage.ZIPReader ;
import net.yacy.cora.storage.ZIPWriter ;
2013-07-09 14:28:25 +02:00
import net.yacy.cora.util.ConcurrentLog ;
2009-10-18 02:53:43 +02:00
import net.yacy.document.parser.html.CharacterCoding ;
2012-08-10 13:26:51 +02:00
import net.yacy.kelondro.data.meta.URIMetadataNode ;
2009-10-11 02:12:19 +02:00
import net.yacy.kelondro.data.meta.URIMetadataRow ;
2013-02-27 22:40:23 +01:00
import net.yacy.kelondro.data.word.WordReferenceVars ;
2009-10-10 01:32:08 +02:00
import net.yacy.kelondro.index.Cache ;
2010-08-04 15:33:12 +02:00
import net.yacy.kelondro.index.Index ;
2009-10-10 01:32:08 +02:00
import net.yacy.kelondro.index.Row ;
2009-10-10 03:14:19 +02:00
import net.yacy.kelondro.table.SplitTable ;
2011-05-26 16:35:32 +02:00
import net.yacy.kelondro.util.MemoryControl ;
2012-06-25 11:34:38 +02:00
import net.yacy.search.Switchboard ;
2013-02-21 13:23:55 +01:00
import net.yacy.search.schema.CollectionConfiguration ;
import net.yacy.search.schema.CollectionSchema ;
import net.yacy.search.schema.WebgraphConfiguration ;
import net.yacy.search.schema.WebgraphSchema ;
2012-07-19 11:34:05 +02:00
2012-08-10 13:26:51 +02:00
import org.apache.solr.common.SolrDocument ;
2012-08-10 15:39:10 +02:00
import org.apache.solr.common.SolrException ;
2012-08-17 15:46:26 +02:00
import org.apache.solr.common.SolrInputDocument ;
2013-02-04 10:55:49 +01:00
import org.apache.lucene.util.Version ;
2012-07-23 16:28:39 +02:00
2013-01-24 12:39:19 +01:00
public final class Fulltext {
2008-03-26 15:13:05 +01:00
2013-10-07 17:09:40 +02:00
    // Name of the Solr data directory below the segment path; the numeric suffix must be
    // identical to the number in the property luceneMatchVersion in solrconfig.xml
    private static final String SOLR_PATH = "solr_45";
    // data directories of earlier Solr versions; if one exists it is renamed to
    // SOLR_PATH when the local solr is connected (index migration)
    private static final String SOLR_OLD_PATH[] = new String[]{"solr_36", "solr_40", "solr_44"};

    // class objects
    private final File segmentPath;               // root directory of this index segment
    private Index urlIndexFile;                   // legacy (pre-solr) url metadata table, kept only for migration
    private Export exportthread;                  // will have a export thread assigned if exporter is running
    private String tablename;                     // name of the legacy url table
    private ArrayList<HostStat> statsDump;        // cached host statistics; set to null whenever the index changes
    private InstanceMirror solrInstances;         // mirror of the embedded (0) and remote (1) solr instances
    private final CollectionConfiguration collectionConfiguration;
    private final WebgraphConfiguration webgraphConfiguration;
    private boolean writeWebgraph;                // if false, the webgraph core is not written
2013-02-21 13:23:55 +01:00
protected Fulltext ( final File segmentPath , final CollectionConfiguration collectionConfiguration , final WebgraphConfiguration webgraphConfiguration ) {
this . segmentPath = segmentPath ;
2012-07-22 13:18:45 +02:00
this . tablename = null ;
this . urlIndexFile = null ;
2009-03-10 14:38:40 +01:00
this . exportthread = null ; // will have a export thread assigned if exporter is running
this . statsDump = null ;
2013-02-21 13:23:55 +01:00
this . solrInstances = new InstanceMirror ( ) ;
this . collectionConfiguration = collectionConfiguration ;
this . webgraphConfiguration = webgraphConfiguration ;
2013-07-23 16:46:44 +02:00
this . writeWebgraph = false ;
}
public void writeWebgraph ( boolean check ) {
this . writeWebgraph = check ;
}
public boolean writeToWebgraph ( ) {
return this . writeWebgraph ;
2011-11-08 12:49:04 +01:00
}
2012-07-23 16:28:39 +02:00
2013-01-14 03:06:24 +01:00
    /**
     * Get the connected legacy url metadata database, which is kept only to
     * migrate old entries into solr.
     * @deprecated
     * used only for migration
     * @return the connected URLDb, or null if no legacy table is connected
     */
    @Deprecated
    public Index getURLDb() {
        return this.urlIndexFile;
    }
2012-10-18 15:09:04 +02:00
protected void connectUrlDb ( final String tablename , final boolean useTailCache , final boolean exceed134217727 ) {
2012-07-22 13:18:45 +02:00
if ( this . urlIndexFile ! = null ) return ;
this . tablename = tablename ;
2013-02-21 13:23:55 +01:00
this . urlIndexFile = new SplitTable ( new File ( this . segmentPath , " default " ) , tablename , URIMetadataRow . rowdef , useTailCache , exceed134217727 ) ;
2013-01-12 15:20:23 +01:00
// SplitTable always returns != null, even if no file exists.
// as old UrlDb should be null if not exist, check and close if empty
// TODO: check if a SplitTable.open() returning null or error status on not existing file is preferable
if ( this . urlIndexFile . isEmpty ( ) ) {
disconnectUrlDb ( ) ;
}
2011-11-08 12:49:04 +01:00
}
2012-07-22 13:18:45 +02:00
public void disconnectUrlDb ( ) {
if ( this . urlIndexFile = = null ) return ;
this . urlIndexFile . close ( ) ;
this . urlIndexFile = null ;
2012-07-20 11:40:33 +02:00
}
2013-02-21 13:23:55 +01:00
    /** @return the schema configuration of the default (collection) core */
    public CollectionConfiguration getDefaultConfiguration() {
        return this.collectionConfiguration;
    }

    /** @return the schema configuration of the webgraph core */
    public WebgraphConfiguration getWebgraphConfiguration() {
        return this.webgraphConfiguration;
    }

    /** @return true if an embedded (local) solr instance is connected */
    public boolean connectedLocalSolr() {
        return this.solrInstances.isConnected0();
    }
2013-02-13 02:29:47 +01:00
public void connectLocalSolr ( ) throws IOException {
2013-02-21 13:23:55 +01:00
File solrLocation = new File ( this . segmentPath , SOLR_PATH ) ;
2012-11-02 12:29:48 +01:00
// migrate old solr to new
for ( String oldVersion : SOLR_OLD_PATH ) {
2013-02-21 13:23:55 +01:00
File oldLocation = new File ( this . segmentPath , oldVersion ) ;
2012-11-02 12:29:48 +01:00
if ( oldLocation . exists ( ) ) oldLocation . renameTo ( solrLocation ) ;
}
2013-02-21 13:23:55 +01:00
EmbeddedInstance localCollectionInstance = new EmbeddedInstance ( new File ( new File ( Switchboard . getSwitchboard ( ) . appPath , " defaults " ) , " solr " ) , solrLocation , CollectionSchema . CORE_NAME , new String [ ] { CollectionSchema . CORE_NAME , WebgraphSchema . CORE_NAME } ) ;
EmbeddedSolrConnector localCollectionConnector = new EmbeddedSolrConnector ( localCollectionInstance ) ;
Version luceneVersion = localCollectionConnector . getConfig ( ) . getLuceneVersion ( " luceneMatchVersion " ) ;
2012-07-19 11:34:05 +02:00
String lvn = luceneVersion . name ( ) ;
2013-07-09 14:28:25 +02:00
ConcurrentLog . info ( " Fulltext " , " using lucene version " + lvn ) ;
2012-07-19 11:34:05 +02:00
int p = lvn . indexOf ( '_' ) ;
2012-11-02 12:29:48 +01:00
assert SOLR_PATH . endsWith ( lvn . substring ( p ) ) : " luceneVersion = " + lvn + " , solrPath = " + SOLR_PATH + " , p = " + p + " , check defaults/solr/solrconfig.xml " ;
2013-07-09 14:28:25 +02:00
ConcurrentLog . info ( " Fulltext " , " connected solr in " + solrLocation . toString ( ) + " , lucene version " + lvn + " , default core size: " + localCollectionConnector . getSize ( ) ) ;
2013-02-21 13:23:55 +01:00
this . solrInstances . connect0 ( localCollectionInstance ) ;
2012-06-25 11:34:38 +02:00
}
2012-07-20 11:40:33 +02:00
    /** Shut down the embedded (local) solr instance. */
    public void disconnectLocalSolr() {
        this.solrInstances.disconnect0();
    }

    /** @return true if a remote solr instance is connected */
    public boolean connectedRemoteSolr() {
        return this.solrInstances.isConnected1();
    }

    /**
     * Connect a set of remote solr instances as a shard; documents are
     * distributed over the shards using an MD5 hash of the host name.
     * @param instances the remote solr instances to connect
     */
    public void connectRemoteSolr(final ArrayList<RemoteInstance> instances) {
        this.solrInstances.connect1(new ShardInstance(instances, ShardSelection.Method.MODULO_HOST_MD5));
    }

    /** Shut down the remote solr connection. */
    public void disconnectRemoteSolr() {
        this.solrInstances.disconnect1();
    }
2012-07-23 16:28:39 +02:00
2013-02-22 15:45:15 +01:00
    /** @return the embedded connector of the default (collection) core */
    public EmbeddedSolrConnector getDefaultEmbeddedConnector() {
        return this.solrInstances.getDefaultEmbeddedConnector();
    }

    /**
     * @param corename name of the solr core
     * @return the embedded connector for the given core
     */
    public EmbeddedSolrConnector getEmbeddedConnector(String corename) {
        return this.solrInstances.getEmbeddedConnector(corename);
    }
2013-02-21 13:23:55 +01:00
public RemoteSolrConnector getDefaultRemoteSolrConnector ( ) {
if ( this . solrInstances . getSolr1 ( ) = = null ) return null ;
try {
return new RemoteSolrConnector ( this . solrInstances . getSolr1 ( ) ) ;
2013-07-17 18:31:30 +02:00
} catch ( final IOException e ) {
2013-02-21 13:23:55 +01:00
return null ;
}
2012-07-24 17:23:29 +02:00
}
2013-02-21 13:23:55 +01:00
    /** @return the mirror connector (embedded and/or remote) of the default core */
    public SolrConnector getDefaultConnector() {
        synchronized (this.solrInstances) {
            return this.solrInstances.getDefaultMirrorConnector();
        }
    }

    /**
     * @return the mirror connector of the webgraph core, or null if webgraph
     *         writing is switched off
     */
    public SolrConnector getWebgraphConnector() {
        if (!this.writeWebgraph) return null;
        synchronized (this.solrInstances) {
            return this.solrInstances.getMirrorConnector(WebgraphSchema.CORE_NAME);
        }
    }
2013-11-07 10:01:44 +01:00
public void clearCaches ( ) {
2012-07-22 13:18:45 +02:00
if ( this . urlIndexFile ! = null & & this . urlIndexFile instanceof Cache ) ( ( Cache ) this . urlIndexFile ) . clearCache ( ) ;
2012-06-08 12:48:25 +02:00
if ( this . statsDump ! = null ) this . statsDump . clear ( ) ;
2013-11-07 10:01:44 +01:00
this . solrInstances . clearCaches ( ) ;
2011-07-15 10:38:10 +02:00
this . statsDump = null ;
2008-06-16 23:39:58 +02:00
}
2011-07-15 10:38:10 +02:00
2013-01-04 16:39:34 +01:00
public void clearURLIndex ( ) throws IOException {
2011-07-15 10:38:10 +02:00
if ( this . exportthread ! = null ) this . exportthread . interrupt ( ) ;
2011-09-21 17:08:05 +02:00
if ( this . urlIndexFile = = null ) {
2013-02-21 13:23:55 +01:00
SplitTable . delete ( new File ( this . segmentPath , " default " ) , this . tablename ) ;
2011-09-21 17:08:05 +02:00
} else {
this . urlIndexFile . clear ( ) ;
}
2011-07-15 10:38:10 +02:00
this . statsDump = null ;
2013-02-22 15:45:15 +01:00
this . commit ( true ) ;
2008-05-24 14:30:50 +02:00
}
2011-07-15 10:38:10 +02:00
2013-01-04 16:39:34 +01:00
    /**
     * Delete all documents from all cores of the embedded (local) solr.
     * A no-op if no embedded instance is connected.
     * @throws IOException if clearing a core fails
     */
    public void clearLocalSolr() throws IOException {
        synchronized (this.solrInstances) {
            EmbeddedInstance instance = this.solrInstances.getSolr0();
            if (instance != null) {
                for (String name: instance.getCoreNames()) new EmbeddedSolrConnector(instance, name).clear();
            }
            this.commit(false);
            this.solrInstances.clearCaches();
        }
    }

    /**
     * Delete all documents from all cores of the remote solr.
     * A no-op if no remote instance is connected.
     * @throws IOException if clearing a core fails
     */
    public void clearRemoteSolr() throws IOException {
        synchronized (this.solrInstances) {
            ShardInstance instance = this.solrInstances.getSolr1();
            if (instance != null) {
                for (String name: instance.getCoreNames()) new RemoteSolrConnector(instance, name).clear();
            }
            this.solrInstances.clearCaches();
        }
    }
2013-02-21 13:23:55 +01:00
    /**
     * get the size of the default index
     * @return the number of documents in the default core plus the legacy table
     */
    // 1-second cache for the (expensive) size computation; the fields are not
    // synchronized — concurrent callers may recompute, which is harmless.
    private long collectionSizeLastAccess = 0;
    private long collectionSizeLastValue = 0;
    public long collectionSize() {
        long t = System.currentTimeMillis();
        if (t - this.collectionSizeLastAccess < 1000) return this.collectionSizeLastValue;
        long size = this.urlIndexFile == null ? 0 : this.urlIndexFile.size();
        size += this.solrInstances.getDefaultMirrorConnector().getSize();
        this.collectionSizeLastAccess = t;
        this.collectionSizeLastValue = size;
        return size;
    }
2013-02-22 15:45:15 +01:00
/ * *
* get the size of the webgraph index
* @return
* /
public long webgraphSize ( ) {
2013-07-23 16:46:44 +02:00
return this . writeWebgraph ? this . getWebgraphConnector ( ) . getSize ( ) : 0 ;
2013-02-22 15:45:15 +01:00
}
2008-03-26 15:13:05 +01:00
public void close ( ) {
2011-07-15 10:38:10 +02:00
this . statsDump = null ;
if ( this . urlIndexFile ! = null ) {
this . urlIndexFile . close ( ) ;
this . urlIndexFile = null ;
2008-03-26 15:13:05 +01:00
}
2013-02-21 13:23:55 +01:00
this . solrInstances . close ( ) ;
2008-03-26 15:13:05 +01:00
}
2012-10-26 07:39:07 +02:00
2013-05-29 18:27:27 +02:00
    // time of the last commit; used to throttle the commit frequency
    private long lastCommit = 0;

    /**
     * Commit pending changes of the default core and - if webgraph writing is
     * enabled - the webgraph core. Calls within 10 seconds of the previous
     * commit are silently dropped to limit commit load.
     * NOTE(review): a dropped commit is never re-scheduled here — confirm that
     * callers tolerate changes staying uncommitted until the next call.
     * @param softCommit if true, perform a solr soft commit
     */
    public void commit(boolean softCommit) {
        long t = System.currentTimeMillis();
        if (lastCommit + 10000 > t) return;
        lastCommit = t;
        getDefaultConnector().commit(softCommit);
        if (this.writeWebgraph) getWebgraphConnector().commit(softCommit);
    }
2012-11-19 17:24:34 +01:00
public Date getLoadDate ( final String urlHash ) {
if ( urlHash = = null ) return null ;
2012-11-24 22:30:05 +01:00
Date x ;
2012-11-19 17:24:34 +01:00
try {
2013-11-01 17:24:36 +01:00
String d = this . getDefaultConnector ( ) . getFieldById ( urlHash , CollectionSchema . load_date_dt . getSolrFieldName ( ) ) ;
if ( d = = null ) return null ;
x = new Date ( Long . parseLong ( d ) ) ;
2013-07-17 18:31:30 +02:00
} catch ( final IOException e ) {
2012-11-19 17:24:34 +01:00
return null ;
}
2012-11-24 22:30:05 +01:00
return x ;
2012-11-19 17:24:34 +01:00
}
2012-11-23 01:35:28 +01:00
2013-09-15 00:30:23 +02:00
public DigestURL getURL ( final byte [ ] urlHash ) {
2013-05-29 18:27:27 +02:00
if ( urlHash = = null | | this . getDefaultConnector ( ) = = null ) return null ;
2013-03-04 01:13:17 +01:00
2012-11-24 22:30:05 +01:00
String x ;
2012-11-23 01:35:28 +01:00
try {
2013-11-01 17:24:36 +01:00
x = this . getDefaultConnector ( ) . getFieldById ( ASCII . String ( urlHash ) , CollectionSchema . sku . getSolrFieldName ( ) ) ;
2013-07-17 18:31:30 +02:00
} catch ( final IOException e ) {
2012-11-23 01:35:28 +01:00
return null ;
}
if ( x = = null ) return null ;
try {
2013-09-15 00:30:23 +02:00
DigestURL uri = new DigestURL ( x , urlHash ) ;
2012-11-23 01:35:28 +01:00
return uri ;
2013-07-17 18:31:30 +02:00
} catch ( final MalformedURLException e ) {
2012-11-23 01:35:28 +01:00
return null ;
}
}
2012-11-19 17:24:34 +01:00
2013-07-15 18:22:35 +02:00
public URIMetadataNode getMetadata ( final WeakPriorityBlockingQueue . Element < WordReferenceVars > element ) {
2013-02-27 22:40:23 +01:00
if ( element = = null ) return null ;
2013-06-01 05:43:08 +02:00
WordReferenceVars wre = element . getElement ( ) ;
2012-07-25 14:31:54 +02:00
if ( wre = = null ) return null ; // all time was already wasted in takeRWI to get another element
2013-06-01 05:43:08 +02:00
long weight = element . getWeight ( ) ;
2013-02-27 22:40:23 +01:00
URIMetadataNode node = getMetadata ( wre . urlhash ( ) , wre , weight ) ;
return node ;
2011-05-13 08:21:40 +02:00
}
2011-07-15 10:38:10 +02:00
2012-10-16 18:11:57 +02:00
public URIMetadataNode getMetadata ( final byte [ ] urlHash ) {
2011-05-13 08:21:40 +02:00
if ( urlHash = = null ) return null ;
2012-08-17 01:34:38 +02:00
return getMetadata ( urlHash , null , 0 ) ;
2012-08-10 13:26:51 +02:00
}
2012-11-23 01:35:28 +01:00
2013-07-15 18:22:35 +02:00
    /**
     * Read the metadata of one document: first from solr, then - as a
     * fallback - from the old metadata index. An entry found in the old index
     * is removed there and written to solr on the fly (slow migration).
     * @param urlHash hash of the document url
     * @param wre word reference to attach to the result, may be null
     * @param weight ranking weight of the word reference
     * @return the metadata node, or null if the document is unknown
     */
    private URIMetadataNode getMetadata(final byte[] urlHash, final WordReferenceVars wre, final long weight) {
        String u = ASCII.String(urlHash);
        // get the metadata from Solr
        try {
            SolrDocument doc = this.getDefaultConnector().getDocumentById(u);
            if (doc != null) {
                if (this.urlIndexFile != null) this.urlIndexFile.remove(urlHash); // migration
                return new URIMetadataNode(doc, wre, weight);
            }
        } catch (final IOException e) {
            ConcurrentLog.logException(e);
        }
        // get the metadata from the old metadata index
        if (this.urlIndexFile != null) try {
            // slow migration to solr: remove the entry from the old index and push it to solr
            final Row.Entry entry = this.urlIndexFile.remove(urlHash);
            if (entry == null) return null;
            URIMetadataRow row = new URIMetadataRow(entry, wre);
            SolrInputDocument solrInput = this.collectionConfiguration.metadata2solr(row);
            this.putDocument(solrInput);
            SolrDocument sd = this.collectionConfiguration.toSolrDocument(solrInput);
            return new URIMetadataNode(sd, wre, weight);
        } catch (final IOException e) {
            ConcurrentLog.logException(e);
        }
        return null;
    }
2012-08-17 15:46:26 +02:00
public void putDocument ( final SolrInputDocument doc ) throws IOException {
2013-02-25 00:09:41 +01:00
SolrConnector connector = this . getDefaultConnector ( ) ;
if ( connector = = null ) return ;
2013-02-21 13:23:55 +01:00
String id = ( String ) doc . getFieldValue ( CollectionSchema . id . getSolrFieldName ( ) ) ;
2013-05-30 12:34:53 +02:00
String url = ( String ) doc . getFieldValue ( CollectionSchema . sku . getSolrFieldName ( ) ) ;
2013-07-09 14:28:25 +02:00
ConcurrentLog . info ( " Fulltext " , " indexing: " + id + " " + url ) ;
2012-08-18 13:05:27 +02:00
byte [ ] idb = ASCII . getBytes ( id ) ;
2012-09-26 16:05:11 +02:00
try {
2013-02-22 15:45:15 +01:00
if ( this . urlIndexFile ! = null ) this . urlIndexFile . remove ( idb ) ;
2013-02-26 17:16:31 +01:00
//Date sdDate = (Date) connector.getFieldById(id, CollectionSchema.last_modified.getSolrFieldName());
//Date docDate = null;
//if (sdDate == null || (docDate = SchemaConfiguration.getDate(doc, CollectionSchema.last_modified)) == null || sdDate.before(docDate)) {
2013-05-11 10:53:12 +02:00
connector . add ( doc ) ;
2013-02-26 17:16:31 +01:00
//}
2013-07-17 18:31:30 +02:00
} catch ( final SolrException e ) {
2013-02-22 15:45:15 +01:00
throw new IOException ( e . getMessage ( ) , e ) ;
}
this . statsDump = null ;
2013-11-07 10:01:44 +01:00
if ( MemoryControl . shortStatus ( ) ) clearCaches ( ) ;
2013-02-22 15:45:15 +01:00
}
2013-07-15 18:22:35 +02:00
2013-02-22 15:45:15 +01:00
public void putEdges ( final Collection < SolrInputDocument > edges ) throws IOException {
2013-07-23 16:46:44 +02:00
if ( ! this . writeToWebgraph ( ) ) return ;
2013-03-05 12:24:01 +01:00
if ( edges = = null | | edges . size ( ) = = 0 ) return ;
2013-02-22 15:45:15 +01:00
try {
this . getWebgraphConnector ( ) . add ( edges ) ;
2013-07-17 18:31:30 +02:00
} catch ( final SolrException e ) {
2012-09-26 16:05:11 +02:00
throw new IOException ( e . getMessage ( ) , e ) ;
2012-08-18 13:05:27 +02:00
}
this . statsDump = null ;
2013-11-07 10:01:44 +01:00
if ( MemoryControl . shortStatus ( ) ) clearCaches ( ) ;
2012-08-17 15:46:26 +02:00
}
2013-07-15 18:22:35 +02:00
2013-08-20 15:46:04 +02:00
/ * *
* deprecated method to store document metadata , use Solr documents wherever possible
* /
2012-10-18 14:29:11 +02:00
public void putMetadata ( final URIMetadataRow entry ) throws IOException {
2013-02-26 17:16:31 +01:00
byte [ ] idb = entry . hash ( ) ;
2013-03-02 10:25:52 +01:00
String id = ASCII . String ( idb ) ;
2012-09-26 16:05:11 +02:00
try {
2013-02-26 17:16:31 +01:00
if ( this . urlIndexFile ! = null ) this . urlIndexFile . remove ( idb ) ;
2013-03-02 10:25:52 +01:00
// because node entries are richer than metadata entries we must check if they exist to prevent that they are overwritten
2013-04-27 01:32:18 +02:00
SolrDocument sd = this . getDefaultConnector ( ) . getDocumentById ( id ) ;
2013-03-02 10:25:52 +01:00
if ( sd = = null | | ( new URIMetadataNode ( sd ) ) . isOlder ( entry ) ) {
2013-03-03 22:38:50 +01:00
putDocument ( getDefaultConfiguration ( ) . metadata2solr ( entry ) ) ;
2013-07-15 18:22:35 +02:00
}
2013-07-17 18:31:30 +02:00
} catch ( final SolrException e ) {
2012-09-26 16:05:11 +02:00
throw new IOException ( e . getMessage ( ) , e ) ;
2008-03-26 15:13:05 +01:00
}
2012-08-10 15:39:10 +02:00
this . statsDump = null ;
2013-11-07 10:01:44 +01:00
if ( MemoryControl . shortStatus ( ) ) clearCaches ( ) ;
2008-03-26 15:13:05 +01:00
}
2012-08-22 16:30:33 +02:00
2012-11-13 16:54:28 +01:00
/ * *
* using a fragment of the url hash ( 6 bytes : bytes 6 to 11 ) it is possible to address all urls from a specific domain
* here such a fragment can be used to delete all these domains at once
* @param hosthash the hash of the host to be deleted
* @param freshdate either NULL or a date in the past which is the limit for deletion . Only documents older than this date are deleted
* @throws IOException
* /
2013-10-24 16:20:20 +02:00
public void deleteStaleDomainHashes ( final Set < String > hosthashes , Date freshdate ) {
2013-05-11 10:53:12 +02:00
// delete in solr
2013-10-24 16:20:20 +02:00
Date now = new Date ( ) ;
deleteDomainWithConstraint ( this . getDefaultConnector ( ) , CollectionSchema . host_id_s . getSolrFieldName ( ) , hosthashes ,
( freshdate = = null | | freshdate . after ( now ) ) ? null :
( CollectionSchema . load_date_dt . getSolrFieldName ( ) + " :[* TO " + ISO8601Formatter . FORMATTER . format ( freshdate ) + " ] " ) ) ;
if ( this . writeWebgraph ) deleteDomainWithConstraint ( this . getWebgraphConnector ( ) , WebgraphSchema . source_host_id_s . getSolrFieldName ( ) , hosthashes ,
( freshdate = = null | | freshdate . after ( now ) ) ? null :
( WebgraphSchema . load_date_dt . getSolrFieldName ( ) + " :[* TO " + ISO8601Formatter . FORMATTER . format ( freshdate ) + " ] " ) ) ;
2013-05-11 10:53:12 +02:00
// delete in old metadata structure
if ( Fulltext . this . urlIndexFile ! = null ) {
final ArrayList < String > l = new ArrayList < String > ( ) ;
CloneableIterator < byte [ ] > i ;
try {
i = Fulltext . this . urlIndexFile . keys ( true , null ) ;
String hash ;
while ( i ! = null & & i . hasNext ( ) ) {
hash = ASCII . String ( i . next ( ) ) ;
2013-10-24 16:20:20 +02:00
if ( hosthashes . contains ( hash . substring ( 6 ) ) ) l . add ( hash ) ;
2012-11-13 16:54:28 +01:00
}
2013-05-11 10:53:12 +02:00
// then delete the urls using this list
for ( final String h : l ) Fulltext . this . urlIndexFile . delete ( ASCII . getBytes ( h ) ) ;
2013-07-17 18:31:30 +02:00
} catch ( final IOException e ) { }
2013-05-11 10:53:12 +02:00
}
// finally remove the line with statistics
if ( Fulltext . this . statsDump ! = null ) {
final Iterator < HostStat > hsi = Fulltext . this . statsDump . iterator ( ) ;
HostStat hs ;
while ( hsi . hasNext ( ) ) {
hs = hsi . next ( ) ;
2013-10-24 16:20:20 +02:00
if ( hosthashes . contains ( hs . hosthash ) ) hsi . remove ( ) ;
2012-11-13 16:54:28 +01:00
}
2013-01-31 13:15:28 +01:00
}
2012-11-13 16:54:28 +01:00
}
2013-10-24 16:20:20 +02:00
public void deleteStaleDomainNames ( final Set < String > hostnames , Date freshdate ) {
Date now = new Date ( ) ;
deleteDomainWithConstraint ( this . getDefaultConnector ( ) , CollectionSchema . host_s . getSolrFieldName ( ) , hostnames ,
( freshdate = = null | | freshdate . after ( now ) ) ? null :
( CollectionSchema . load_date_dt . getSolrFieldName ( ) + " :[* TO " + ISO8601Formatter . FORMATTER . format ( freshdate ) + " ] " ) ) ;
if ( this . writeWebgraph ) deleteDomainWithConstraint ( this . getWebgraphConnector ( ) , WebgraphSchema . source_host_s . getSolrFieldName ( ) , hostnames ,
( freshdate = = null | | freshdate . after ( now ) ) ? null :
( WebgraphSchema . load_date_dt . getSolrFieldName ( ) + " :[* TO " + ISO8601Formatter . FORMATTER . format ( freshdate ) + " ] " ) ) ;
2013-05-11 10:53:12 +02:00
// finally remove the line with statistics
if ( Fulltext . this . statsDump ! = null ) {
final Iterator < HostStat > hsi = Fulltext . this . statsDump . iterator ( ) ;
HostStat hs ;
while ( hsi . hasNext ( ) ) {
hs = hsi . next ( ) ;
2013-10-24 16:20:20 +02:00
if ( hostnames . contains ( hs . hostname ) ) hsi . remove ( ) ;
}
}
}
    /**
     * delete all documents within a domain that are registered as error document
     * @param hosthashes set of host hashes of the domains to be cleaned
     */
    public void deleteDomainErrors(final Set<String> hosthashes) {
        // a document counts as error document if its failreason_s field carries any value
        deleteDomainWithConstraint(this.getDefaultConnector(), CollectionSchema.host_id_s.getSolrFieldName(), hosthashes, CollectionSchema.failreason_s.getSolrFieldName() + ":[* TO *]");
    }
private static void deleteDomainWithConstraint ( SolrConnector connector , String fieldname , final Set < String > hosthashes , String constraintQuery ) {
if ( hosthashes = = null | | hosthashes . size ( ) = = 0 ) return ;
int subsetscount = 1 + ( hosthashes . size ( ) / 255 ) ; // if the list is too large, we get a "too many boolean clauses" exception
int c = 0 ;
@SuppressWarnings ( " unchecked " )
List < String > [ ] subsets = new ArrayList [ subsetscount ] ;
for ( int i = 0 ; i < subsetscount ; i + + ) subsets [ i ] = new ArrayList < String > ( ) ;
for ( String hosthash : hosthashes ) subsets [ c + + % subsetscount ] . add ( hosthash ) ;
for ( List < String > subset : subsets ) {
try {
StringBuilder query = new StringBuilder ( ) ;
for ( String hosthash : subset ) {
if ( query . length ( ) > 0 ) query . append ( " OR " ) ;
//query.append(CollectionSchema.host_id_s.getSolrFieldName()).append(":\"").append(hosthash).append(":\"");
query . append ( " ({!raw f= " ) . append ( fieldname ) . append ( '}' ) . append ( hosthash ) . append ( " ) " ) ;
2013-01-24 12:39:19 +01:00
}
2013-10-24 16:20:20 +02:00
if ( constraintQuery = = null ) connector . deleteByQuery ( query . toString ( ) ) ; else connector . deleteByQuery ( " ( " + query . toString ( ) + " ) AND " + constraintQuery ) ;
} catch ( final IOException e ) {
2013-01-24 12:39:19 +01:00
}
2013-01-31 13:15:28 +01:00
}
2013-01-24 12:39:19 +01:00
}
2012-11-02 01:22:31 +01:00
/ * *
* remove a full subpath from the index
* @param subpath the left path of the url ; at least until the end of the host
2012-11-13 16:54:28 +01:00
* @param freshdate either NULL or a date in the past which is the limit for deletion . Only documents older than this date are deleted
2012-11-02 01:22:31 +01:00
* @param concurrently if true , then the method returnes immediately and runs concurrently
* /
2013-05-11 10:53:12 +02:00
public int remove ( final String basepath , Date freshdate ) {
2013-09-15 00:30:23 +02:00
DigestURL uri ;
try { uri = new DigestURL ( basepath ) ; } catch ( final MalformedURLException e ) { return 0 ; }
2012-11-02 01:22:31 +01:00
final String host = uri . getHost ( ) ;
2013-02-22 15:45:15 +01:00
final String collectionQuery = CollectionSchema . host_s . getSolrFieldName ( ) + " : \" " + host + " \" " +
2013-02-21 13:23:55 +01:00
( ( freshdate ! = null & & freshdate . before ( new Date ( ) ) ) ? ( " AND " + CollectionSchema . load_date_dt . getSolrFieldName ( ) + " :[* TO " + ISO8601Formatter . FORMATTER . format ( freshdate ) + " ] " ) : " " ) ;
2012-11-13 16:54:28 +01:00
final AtomicInteger count = new AtomicInteger ( 0 ) ;
2013-05-11 10:53:12 +02:00
final BlockingQueue < SolrDocument > docs = Fulltext . this . getDefaultConnector ( ) . concurrentDocumentsByQuery ( collectionQuery , 0 , 1000000 , 600000 , - 1 , CollectionSchema . id . getSolrFieldName ( ) , CollectionSchema . sku . getSolrFieldName ( ) ) ;
try {
SolrDocument doc ;
while ( ( doc = docs . take ( ) ) ! = AbstractSolrConnector . POISON_DOCUMENT ) {
String u = ( String ) doc . getFieldValue ( CollectionSchema . sku . getSolrFieldName ( ) ) ;
if ( u . startsWith ( basepath ) ) {
remove ( ASCII . getBytes ( ( String ) doc . getFieldValue ( CollectionSchema . id . getSolrFieldName ( ) ) ) ) ;
count . incrementAndGet ( ) ;
}
2012-11-02 01:22:31 +01:00
}
2013-05-11 10:53:12 +02:00
if ( count . get ( ) > 0 ) Fulltext . this . commit ( true ) ;
2013-07-17 18:31:30 +02:00
} catch ( final InterruptedException e ) { }
2012-11-13 16:54:28 +01:00
return count . get ( ) ;
2012-11-02 01:22:31 +01:00
}
    /**
     * remove a list of id's from the index (blocking, not concurrent)
     * @param deleteIDs a list of urlhashes; each denoting a document
     */
    public void remove(final Collection<String> deleteIDs) {
        if (deleteIDs == null || deleteIDs.size() == 0) return;
        try {
            this.getDefaultConnector().deleteByIds(deleteIDs);
            if (this.writeWebgraph) this.getWebgraphConnector().deleteByIds(deleteIDs);
        } catch (final Throwable e) {
            ConcurrentLog.logException(e);
        }
        // also remove the ids from the legacy metadata index; best-effort,
        // IO errors are deliberately ignored here
        if (Fulltext.this.urlIndexFile != null) try {
            for (String id: deleteIDs) {
                final Row.Entry r = Fulltext.this.urlIndexFile.remove(ASCII.getBytes(id));
                if (r != null) Fulltext.this.statsDump = null;
            }
        } catch (final IOException e) {}
    }
2012-06-26 13:54:48 +02:00
public boolean remove ( final byte [ ] urlHash ) {
if ( urlHash = = null ) return false ;
2012-07-24 17:23:29 +02:00
try {
2013-05-08 13:26:25 +02:00
String id = ASCII . String ( urlHash ) ;
2013-05-11 10:53:12 +02:00
this . getDefaultConnector ( ) . deleteById ( id ) ;
2013-07-23 16:46:44 +02:00
if ( this . writeWebgraph ) this . getWebgraphConnector ( ) . deleteById ( id ) ;
2012-07-24 17:23:29 +02:00
} catch ( final Throwable e ) {
2013-07-09 14:28:25 +02:00
ConcurrentLog . logException ( e ) ;
2012-06-26 13:54:48 +02:00
}
2012-07-22 13:18:45 +02:00
if ( this . urlIndexFile ! = null ) try {
2012-06-26 13:54:48 +02:00
final Row . Entry r = this . urlIndexFile . remove ( urlHash ) ;
2011-07-15 10:38:10 +02:00
if ( r ! = null ) this . statsDump = null ;
2008-03-26 15:13:05 +01:00
return r ! = null ;
2008-08-02 14:12:04 +02:00
} catch ( final IOException e ) {
2008-03-26 15:13:05 +01:00
return false ;
}
2012-07-22 13:18:45 +02:00
return false ;
2008-03-26 15:13:05 +01:00
}
2013-05-17 13:59:37 +02:00
@Deprecated
2013-02-22 15:45:15 +01:00
public boolean exists ( final String urlHash ) {
2012-05-15 12:25:14 +02:00
if ( urlHash = = null ) return false ;
2013-06-02 13:50:12 +02:00
if ( this . urlIndexFile ! = null & & this . urlIndexFile . has ( ASCII . getBytes ( urlHash ) ) ) return true ;
2012-07-24 17:23:29 +02:00
try {
2013-04-28 21:20:14 +02:00
if ( this . getDefaultConnector ( ) . existsById ( urlHash ) ) return true ;
2012-07-24 17:23:29 +02:00
} catch ( final Throwable e ) {
2013-07-09 14:28:25 +02:00
ConcurrentLog . logException ( e ) ;
2011-11-08 12:49:04 +01:00
}
2012-07-25 14:31:54 +02:00
return false ;
2008-03-26 15:13:05 +01:00
}
2013-05-17 13:59:37 +02:00
2013-05-27 13:45:09 +02:00
/ * *
* Multiple - test for existing url hashes in the search index .
* All given ids are tested and a subset of the given ids are returned .
* @param ids
* @return a set of ids which exist in the database
* /
2013-11-04 09:37:31 +01:00
public Set < String > exists ( Set < String > ids ) {
2013-05-17 13:59:37 +02:00
HashSet < String > e = new HashSet < String > ( ) ;
if ( ids = = null | | ids . size ( ) = = 0 ) return e ;
2013-11-04 09:37:31 +01:00
if ( ids . size ( ) = = 1 ) return exists ( ids . iterator ( ) . next ( ) ) ? ids : e ;
2013-11-03 18:31:50 +01:00
Set < String > idsC = new HashSet < String > ( ) ;
2013-06-02 13:50:12 +02:00
idsC . addAll ( ids ) ;
if ( this . urlIndexFile ! = null ) {
Iterator < String > idsi = idsC . iterator ( ) ;
String h ;
while ( idsi . hasNext ( ) ) {
h = idsi . next ( ) ;
if ( this . urlIndexFile . has ( ASCII . getBytes ( h ) ) ) {
idsi . remove ( ) ;
e . add ( h ) ;
}
}
}
2013-05-17 13:59:37 +02:00
try {
Set < String > e1 = this . getDefaultConnector ( ) . existsByIds ( idsC ) ;
e . addAll ( e1 ) ;
} catch ( final Throwable ee ) {
2013-07-09 14:28:25 +02:00
ConcurrentLog . logException ( ee ) ;
2013-05-17 13:59:37 +02:00
}
return e ;
}
2008-03-26 15:13:05 +01:00
2012-10-02 11:13:06 +02:00
public String failReason ( final String urlHash ) throws IOException {
if ( urlHash = = null ) return null ;
2013-11-01 17:24:36 +01:00
String reason = this . getDefaultConnector ( ) . getFieldById ( urlHash , CollectionSchema . failreason_s . getSolrFieldName ( ) ) ;
2012-11-24 22:30:05 +01:00
if ( reason = = null ) return null ;
2013-06-01 05:43:08 +02:00
return reason . length ( ) = = 0 ? null : reason ;
2012-10-02 11:13:06 +02:00
}
2012-09-24 17:05:28 +02:00
public List < File > dumpFiles ( ) {
2013-02-21 13:23:55 +01:00
EmbeddedInstance esc = this . solrInstances . getSolr0 ( ) ;
2012-09-24 17:05:28 +02:00
ArrayList < File > zips = new ArrayList < File > ( ) ;
2012-11-07 02:04:08 +01:00
if ( esc = = null ) {
2013-07-09 14:28:25 +02:00
ConcurrentLog . warn ( " Fulltext " , " HOT DUMP selected solr0 == NULL, no dump list! " ) ;
2012-11-07 02:04:08 +01:00
return zips ;
}
2013-02-21 13:23:55 +01:00
if ( esc . getContainerPath ( ) = = null ) {
2013-07-09 14:28:25 +02:00
ConcurrentLog . warn ( " Fulltext " , " HOT DUMP selected solr0.getStoragePath() == NULL, no dump list! " ) ;
2012-11-07 02:04:08 +01:00
return zips ;
}
2013-02-21 13:23:55 +01:00
File storagePath = esc . getContainerPath ( ) . getParentFile ( ) ;
2012-11-07 02:04:08 +01:00
if ( storagePath = = null ) {
2013-07-09 14:28:25 +02:00
ConcurrentLog . warn ( " Fulltext " , " HOT DUMP selected esc.getStoragePath().getParentFile() == NULL, no dump list! " ) ;
2012-11-07 02:04:08 +01:00
return zips ;
}
2013-07-09 14:28:25 +02:00
ConcurrentLog . info ( " Fulltext " , " HOT DUMP dump path = " + storagePath . toString ( ) ) ;
2012-09-24 17:05:28 +02:00
for ( String p : storagePath . list ( ) ) {
if ( p . endsWith ( " zip " ) ) zips . add ( new File ( storagePath , p ) ) ;
}
return zips ;
}
    /**
     * Create a dump file from the current solr directory.
     * The embedded solr is disconnected while the directory is zipped and
     * reconnected afterwards; the whole cycle is serialized on solrInstances,
     * so searches are unavailable during the dump.
     * @return the zip file that was written (container path plus date suffix)
     */
    public File dumpSolr() {
        EmbeddedInstance esc = this.solrInstances.getSolr0();
        File storagePath = esc.getContainerPath();
        File zipOut = new File(storagePath.toString() + "_" + GenericFormatter.SHORT_DAY_FORMATTER.format() + ".zip");
        synchronized (this.solrInstances) {
            // shut down the embedded solr so its index files are closed before zipping
            this.disconnectLocalSolr();
            this.solrInstances.close();
            try {
                ZIPWriter.zip(storagePath, zipOut);
            } catch (final IOException e) {
                ConcurrentLog.logException(e);
            } finally {
                // re-create the instance mirror and reconnect in any case,
                // even if zipping failed, so the index stays usable
                this.solrInstances = new InstanceMirror();
                try {
                    this.connectLocalSolr();
                } catch (final IOException e) {
                    ConcurrentLog.logException(e);
                }
            }
        }
        return zipOut;
    }
    /**
     * Restore a solr dump to the current solr directory.
     * The embedded solr is disconnected while the dump is unzipped over the
     * storage path and reconnected afterwards; serialized on solrInstances.
     * @param solrDumpZipFile the zip file to restore, as produced by dumpSolr()
     */
    public void restoreSolr(File solrDumpZipFile) {
        EmbeddedInstance esc = this.solrInstances.getSolr0();
        File storagePath = esc.getContainerPath();
        synchronized (this.solrInstances) {
            // shut down the embedded solr so its index files are closed before overwriting
            this.disconnectLocalSolr();
            this.solrInstances.close();
            try {
                ZIPReader.unzip(solrDumpZipFile, storagePath);
            } catch (final IOException e) {
                ConcurrentLog.logException(e);
            } finally {
                // re-create the instance mirror and reconnect in any case,
                // even if unzipping failed, so the index stays usable
                this.solrInstances = new InstanceMirror();
                try {
                    this.connectLocalSolr();
                } catch (final IOException e) {
                    ConcurrentLog.logException(e);
                }
            }
        }
    }
2013-06-28 14:51:37 +02:00
/ * *
* optimize solr ( experimental to check resource management )
* @param size
* /
public void optimize ( final int size ) {
if ( size < 1 ) return ;
getDefaultConnector ( ) . optimize ( size ) ;
2013-07-23 16:46:44 +02:00
if ( this . writeWebgraph ) getWebgraphConnector ( ) . optimize ( size ) ;
2013-06-28 14:51:37 +02:00
}
2012-09-24 17:05:28 +02:00
2013-05-29 13:09:34 +02:00
    /**
     * Reboot solr (experimental to check resource management).
     * Disconnects the local solr, closes and replaces the instance mirror,
     * then reconnects; the whole cycle is serialized on solrInstances.
     */
    public void rebootSolr() {
        synchronized (this.solrInstances) {
            this.disconnectLocalSolr();
            this.solrInstances.close();
            this.solrInstances = new InstanceMirror();
            try {
                this.connectLocalSolr();
            } catch (final IOException e) {
                ConcurrentLog.logException(e);
            }
        }
    }
2008-03-26 15:13:05 +01:00
// export methods
2013-11-06 19:22:26 +01:00
public Export export ( final File f , final String filter , final String query , final int format , final boolean dom ) {
2011-07-15 10:38:10 +02:00
if ( ( this . exportthread ! = null ) & & ( this . exportthread . isAlive ( ) ) ) {
2013-07-09 14:28:25 +02:00
ConcurrentLog . warn ( " LURL-EXPORT " , " cannot start another export thread, already one running " ) ;
2011-07-15 10:38:10 +02:00
return this . exportthread ;
2008-03-26 15:13:05 +01:00
}
2013-11-06 19:22:26 +01:00
this . exportthread = new Export ( f , filter , query , format , dom ) ;
2008-03-26 15:13:05 +01:00
this . exportthread . start ( ) ;
2011-07-15 10:38:10 +02:00
return this . exportthread ;
2008-03-26 15:13:05 +01:00
}
2011-07-15 10:38:10 +02:00
2008-03-26 15:13:05 +01:00
    /**
     * Get the most recently started export thread, or null if none was started.
     * @return the current Export thread (it may have finished already)
     */
    public Export export() {
        return this.exportthread;
    }
2011-07-15 10:38:10 +02:00
2008-03-26 15:13:05 +01:00
public class Export extends Thread {
2008-08-02 14:12:04 +02:00
private final File f ;
2013-01-24 12:39:19 +01:00
private final Pattern pattern ;
2008-03-26 15:13:05 +01:00
private int count ;
2013-11-06 19:22:26 +01:00
private String failure , query ;
2008-08-02 14:12:04 +02:00
private final int format ;
private final boolean dom ;
2011-07-15 10:38:10 +02:00
2013-11-06 19:22:26 +01:00
private Export ( final File f , final String filter , final String query , final int format , boolean dom ) {
2008-03-26 15:13:05 +01:00
// format: 0=text, 1=html, 2=rss/xml
this . f = f ;
2013-01-24 12:39:19 +01:00
this . pattern = filter = = null ? null : Pattern . compile ( filter ) ;
2013-11-06 19:22:26 +01:00
this . query = query = = null ? " *:* " : query ;
2008-03-26 15:13:05 +01:00
this . count = 0 ;
this . failure = null ;
this . format = format ;
this . dom = dom ;
2013-04-22 22:33:13 +02:00
//if ((dom) && (format == 2)) dom = false;
2008-03-26 15:13:05 +01:00
}
2011-07-15 10:38:10 +02:00
2011-12-17 01:27:08 +01:00
@Override
2008-03-26 15:13:05 +01:00
public void run ( ) {
try {
2011-07-15 10:38:10 +02:00
final File parentf = this . f . getParentFile ( ) ;
2009-03-10 21:52:10 +01:00
if ( parentf ! = null ) parentf . mkdirs ( ) ;
2011-07-15 10:38:10 +02:00
final PrintWriter pw = new PrintWriter ( new BufferedOutputStream ( new FileOutputStream ( this . f ) ) ) ;
if ( this . format = = 1 ) {
2008-03-26 15:13:05 +01:00
pw . println ( " <html><head></head><body> " ) ;
}
2011-07-15 10:38:10 +02:00
if ( this . format = = 2 ) {
2008-03-26 15:13:05 +01:00
pw . println ( " <?xml version= \" 1.0 \" encoding= \" UTF-8 \" ?> " ) ;
pw . println ( " <?xml-stylesheet type='text/xsl' href='/yacysearch.xsl' version='1.0'?> " ) ;
2009-02-23 12:39:20 +01:00
pw . println ( " <rss version= \" 2.0 \" xmlns:yacy= \" http://www.yacy.net/ \" xmlns:opensearch= \" http://a9.com/-/spec/opensearch/1.1/ \" xmlns:atom= \" http://www.w3.org/2005/Atom \" > " ) ;
2008-03-26 15:13:05 +01:00
pw . println ( " <channel> " ) ;
2013-01-24 12:39:19 +01:00
pw . println ( " <title>YaCy Peer-to-Peer - Web-Search URL Export</title> " ) ;
2008-03-26 15:13:05 +01:00
pw . println ( " <description></description> " ) ;
pw . println ( " <link>http://yacy.net</link> " ) ;
}
2013-01-24 12:39:19 +01:00
2011-07-15 10:38:10 +02:00
if ( this . dom ) {
2013-11-06 19:22:26 +01:00
Map < String , ReversibleScoreMap < String > > scores = Fulltext . this . getDefaultConnector ( ) . getFacets ( this . query + " AND " + CollectionSchema . httpstatus_i . getSolrFieldName ( ) + " :200 " , 100000000 , CollectionSchema . host_s . getSolrFieldName ( ) ) ;
2013-02-21 13:23:55 +01:00
ReversibleScoreMap < String > stats = scores . get ( CollectionSchema . host_s . getSolrFieldName ( ) ) ;
2013-01-24 12:39:19 +01:00
for ( final String host : stats ) {
if ( this . pattern ! = null & & ! this . pattern . matcher ( host ) . matches ( ) ) continue ;
2011-07-15 10:38:10 +02:00
if ( this . format = = 0 ) pw . println ( host ) ;
if ( this . format = = 1 ) pw . println ( " <a href= \" http:// " + host + " \" > " + host + " </a><br> " ) ;
this . count + + ;
2008-09-04 22:28:36 +02:00
}
} else {
2013-11-06 19:22:26 +01:00
BlockingQueue < SolrDocument > docs = Fulltext . this . getDefaultConnector ( ) . concurrentDocumentsByQuery ( this . query + " AND " + CollectionSchema . httpstatus_i . getSolrFieldName ( ) + " :200 " , 0 , 100000000 , 10 * 60 * 60 * 1000 , 100 ,
2013-02-21 13:23:55 +01:00
CollectionSchema . id . getSolrFieldName ( ) , CollectionSchema . sku . getSolrFieldName ( ) , CollectionSchema . title . getSolrFieldName ( ) ,
2013-07-30 12:48:57 +02:00
CollectionSchema . author . getSolrFieldName ( ) , CollectionSchema . description_txt . getSolrFieldName ( ) , CollectionSchema . size_i . getSolrFieldName ( ) , CollectionSchema . last_modified . getSolrFieldName ( ) ) ;
2013-01-24 12:39:19 +01:00
SolrDocument doc ;
2013-11-06 19:22:26 +01:00
String url , hash , title , author , description ;
2013-01-24 12:39:19 +01:00
Integer size ;
Date date ;
while ( ( doc = docs . take ( ) ) ! = AbstractSolrConnector . POISON_DOCUMENT ) {
2013-11-06 19:22:26 +01:00
hash = getStringFrom ( doc . getFieldValue ( CollectionSchema . id . getSolrFieldName ( ) ) ) ;
url = getStringFrom ( doc . getFieldValue ( CollectionSchema . sku . getSolrFieldName ( ) ) ) ;
title = getStringFrom ( doc . getFieldValue ( CollectionSchema . title . getSolrFieldName ( ) ) ) ;
author = getStringFrom ( doc . getFieldValue ( CollectionSchema . author . getSolrFieldName ( ) ) ) ;
description = getStringFrom ( doc . getFieldValue ( CollectionSchema . description_txt . getSolrFieldName ( ) ) ) ;
2013-02-21 13:23:55 +01:00
size = ( Integer ) doc . getFieldValue ( CollectionSchema . size_i . getSolrFieldName ( ) ) ;
date = ( Date ) doc . getFieldValue ( CollectionSchema . last_modified . getSolrFieldName ( ) ) ;
2013-01-24 12:39:19 +01:00
if ( this . pattern ! = null & & ! this . pattern . matcher ( url ) . matches ( ) ) continue ;
2011-07-15 10:38:10 +02:00
if ( this . format = = 0 ) {
2008-03-26 15:13:05 +01:00
pw . println ( url ) ;
}
2011-07-15 10:38:10 +02:00
if ( this . format = = 1 ) {
2013-11-06 19:22:26 +01:00
if ( title ! = null ) pw . println ( " <a href= \" " + MultiProtocolURL . escape ( url ) + " \" > " + CharacterCoding . unicode2xml ( title , true ) + " </a> " ) ;
2008-03-26 15:13:05 +01:00
}
2011-07-15 10:38:10 +02:00
if ( this . format = = 2 ) {
2008-03-26 15:13:05 +01:00
pw . println ( " <item> " ) ;
2013-11-06 19:22:26 +01:00
if ( title ! = null ) pw . println ( " <title> " + CharacterCoding . unicode2xml ( title , true ) + " </title> " ) ;
2013-09-15 00:30:23 +02:00
pw . println ( " <link> " + MultiProtocolURL . escape ( url ) + " </link> " ) ;
2013-01-24 12:39:19 +01:00
if ( author ! = null & & ! author . isEmpty ( ) ) pw . println ( " <author> " + CharacterCoding . unicode2xml ( author , true ) + " </author> " ) ;
2013-11-06 19:22:26 +01:00
if ( description ! = null & & ! description . isEmpty ( ) ) pw . println ( " <description> " + CharacterCoding . unicode2xml ( description , true ) + " </description> " ) ;
2013-05-09 00:22:45 +02:00
if ( date ! = null ) pw . println ( " <pubDate> " + HeaderFramework . formatRFC1123 ( date ) + " </pubDate> " ) ;
2013-01-24 12:39:19 +01:00
if ( size ! = null ) pw . println ( " <yacy:size> " + size . intValue ( ) + " </yacy:size> " ) ;
pw . println ( " <guid isPermaLink= \" false \" > " + hash + " </guid> " ) ;
2008-03-26 15:13:05 +01:00
pw . println ( " </item> " ) ;
}
2011-07-15 10:38:10 +02:00
this . count + + ;
2008-03-26 15:13:05 +01:00
}
}
2011-07-15 10:38:10 +02:00
if ( this . format = = 1 ) {
2008-03-26 15:13:05 +01:00
pw . println ( " </body></html> " ) ;
}
2011-07-15 10:38:10 +02:00
if ( this . format = = 2 ) {
2008-03-26 15:13:05 +01:00
pw . println ( " </channel> " ) ;
pw . println ( " </rss> " ) ;
}
pw . close ( ) ;
2008-08-02 14:12:04 +02:00
} catch ( final IOException e ) {
2013-07-09 14:28:25 +02:00
ConcurrentLog . logException ( e ) ;
2008-03-26 15:13:05 +01:00
this . failure = e . getMessage ( ) ;
2009-05-05 08:31:35 +02:00
} catch ( final Exception e ) {
2013-07-09 14:28:25 +02:00
ConcurrentLog . logException ( e ) ;
2009-05-05 08:31:35 +02:00
this . failure = e . getMessage ( ) ;
2008-03-26 15:13:05 +01:00
}
// terminate process
}
2011-07-15 10:38:10 +02:00
2008-03-26 15:13:05 +01:00
public File file ( ) {
return this . f ;
}
2011-07-15 10:38:10 +02:00
2008-03-26 15:13:05 +01:00
public String failed ( ) {
return this . failure ;
}
2011-07-15 10:38:10 +02:00
2008-03-26 15:13:05 +01:00
public int count ( ) {
return this . count ;
}
2013-11-06 19:22:26 +01:00
@SuppressWarnings ( " unchecked " )
private String getStringFrom ( final Object o ) {
if ( o = = null ) return " " ;
if ( o instanceof ArrayList ) return ( ( ArrayList < String > ) o ) . get ( 0 ) ;
return ( String ) o ;
}
2011-07-15 10:38:10 +02:00
2008-03-26 15:13:05 +01:00
}
2013-01-31 13:15:28 +01:00
2011-07-15 10:38:10 +02:00
    /**
     * Compute clear-text host statistics for the highest-scored entries of a
     * domain score map. The result is cached in statsDump; other operations
     * (removals) may reset statsDump to null concurrently, which is why it is
     * re-checked inside the loop.
     * @param count the number of host entries wanted
     * @param domainScore a score map keyed by urlhash examples
     * @return an iterator over HostStat entries (possibly empty)
     */
    public Iterator<HostStat> statistics(int count, final ScoreMap<String> domainScore) {
        // prevent too heavy IO.
        if (this.statsDump != null && count <= this.statsDump.size()) return this.statsDump.iterator();

        // fetch urls from the database to determine the host in clear text
        final Iterator<String> j = domainScore.keys(false); // iterate urlhash-examples in reverse order (biggest first)
        String urlhash;
        count += 10; // make some more to prevent that we have to do this again after deletions too soon.
        if (count < 0 || domainScore.sizeSmaller(count)) count = domainScore.size();
        this.statsDump = new ArrayList<HostStat>();
        DigestURL url;
        while (j.hasNext()) {
            urlhash = j.next();
            if (urlhash == null) continue;
            url = this.getURL(ASCII.getBytes(urlhash));
            if (url == null || url.getHost() == null) continue;
            if (this.statsDump == null) return new ArrayList<HostStat>().iterator(); // some other operation has destroyed the object
            // urlhash.substring(6) presumably carries the 6-character host hash — see the assert in HostStat
            this.statsDump.add(new HostStat(url.getHost(), url.getPort(), urlhash.substring(6), domainScore.get(urlhash)));
            count--;
            if (count == 0) break;
        }
        // finally return an iterator for the result array
        return (this.statsDump == null) ? new ArrayList<HostStat>().iterator() : this.statsDump.iterator();
    }
2011-07-15 10:38:10 +02:00
2011-05-26 12:57:02 +02:00
    /**
     * Simple value holder for per-host statistics as produced by statistics().
     */
    public static class HostStat {
        public String hostname, hosthash; // hosthash is a 6-character fragment of a url hash
        public int port;
        public int count; // score value taken from the domain score map
        private HostStat(final String host, final int port, final String urlhashfragment, final int count) {
            assert urlhashfragment.length() == 6;
            this.hostname = host;
            this.port = port;
            this.hosthash = urlhashfragment;
            this.count = count;
        }
    }
2008-03-26 15:13:05 +01:00
}