2009-05-28 16:26:05 +02:00
// Segment.java
2009-09-14 23:17:42 +02:00
// (C) 2005-2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
2009-05-28 16:26:05 +02:00
// first published 2005 on http://yacy.net; full redesign for segments 28.5.2009
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
2011-03-08 02:51:51 +01:00
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
2009-05-28 16:26:05 +02:00
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
2011-09-25 18:59:06 +02:00
package net.yacy.search.index ;
2009-05-28 16:26:05 +02:00
import java.io.File ;
import java.io.IOException ;
2013-01-29 18:14:14 +01:00
import java.net.MalformedURLException ;
2009-05-28 16:26:05 +02:00
import java.util.Date ;
import java.util.Iterator ;
import java.util.Map ;
2012-03-29 17:20:14 +02:00
import java.util.Properties ;
2009-10-23 00:38:04 +02:00
import java.util.Set ;
2012-08-24 14:13:42 +02:00
import java.util.concurrent.BlockingQueue ;
2009-05-28 16:26:05 +02:00
2013-01-23 14:40:58 +01:00
import org.apache.solr.common.SolrDocument ;
import org.apache.solr.common.SolrDocumentList ;
2012-10-18 14:29:11 +02:00
import org.apache.solr.common.SolrInputDocument ;
2011-05-27 10:24:54 +02:00
import net.yacy.cora.document.ASCII ;
2010-05-25 14:54:57 +02:00
import net.yacy.cora.document.MultiProtocolURI ;
2011-03-07 21:36:40 +01:00
import net.yacy.cora.document.UTF8 ;
2012-09-25 21:20:03 +02:00
import net.yacy.cora.federate.solr.connector.AbstractSolrConnector ;
import net.yacy.cora.federate.yacy.CacheStrategy ;
2012-09-21 16:46:57 +02:00
import net.yacy.cora.order.Base64Order ;
2011-12-16 23:59:29 +01:00
import net.yacy.cora.order.ByteOrder ;
2012-07-25 01:53:47 +02:00
import net.yacy.cora.protocol.ResponseHeader ;
2012-07-27 12:13:53 +02:00
import net.yacy.cora.storage.HandleSet ;
2012-09-21 11:02:36 +02:00
import net.yacy.cora.util.LookAheadIterator ;
2012-07-27 12:13:53 +02:00
import net.yacy.cora.util.SpaceExceededException ;
2012-09-21 15:48:16 +02:00
import net.yacy.crawler.data.CrawlQueues ;
import net.yacy.crawler.retrieval.Response ;
2009-10-18 02:53:43 +02:00
import net.yacy.document.Condenser ;
import net.yacy.document.Document ;
2010-06-29 21:20:45 +02:00
import net.yacy.document.Parser ;
2012-02-25 12:42:13 +01:00
import net.yacy.kelondro.data.citation.CitationReference ;
import net.yacy.kelondro.data.citation.CitationReferenceFactory ;
2009-10-11 02:12:19 +02:00
import net.yacy.kelondro.data.meta.DigestURI ;
2013-03-14 12:13:02 +01:00
import net.yacy.kelondro.data.meta.URIMetadataRow ;
2009-10-11 02:12:19 +02:00
import net.yacy.kelondro.data.word.Word ;
import net.yacy.kelondro.data.word.WordReference ;
import net.yacy.kelondro.data.word.WordReferenceFactory ;
import net.yacy.kelondro.data.word.WordReferenceRow ;
2013-03-14 12:13:02 +01:00
import net.yacy.kelondro.index.RowHandleSet ;
2009-10-10 01:13:30 +02:00
import net.yacy.kelondro.logging.Log ;
2009-10-10 02:39:15 +02:00
import net.yacy.kelondro.rwi.IndexCell ;
import net.yacy.kelondro.rwi.ReferenceContainer ;
import net.yacy.kelondro.rwi.ReferenceFactory ;
2012-09-21 16:46:57 +02:00
import net.yacy.kelondro.util.Bitfield ;
2013-03-14 12:13:02 +01:00
import net.yacy.kelondro.util.ByteBuffer ;
2009-10-11 02:12:19 +02:00
import net.yacy.kelondro.util.ISO639 ;
2013-01-23 14:40:58 +01:00
import net.yacy.kelondro.util.MemoryControl ;
2009-10-23 00:38:04 +02:00
import net.yacy.repository.LoaderDispatcher ;
2011-09-25 18:59:06 +02:00
import net.yacy.search.query.SearchEvent ;
2013-02-21 13:23:55 +01:00
import net.yacy.search.schema.CollectionConfiguration ;
import net.yacy.search.schema.CollectionSchema ;
import net.yacy.search.schema.WebgraphConfiguration ;
2009-05-28 16:26:05 +02:00
2009-09-14 23:17:42 +02:00
public class Segment {
    // catchall word
    /** A word that is always in all indexes; can be used for zero-word searches to find ALL documents. */
    public final static String catchallString = "yacyall";
    public final static byte[] catchallHash;
    final static Word catchallWord = new Word(0, 0, 0);
    static {
        catchallHash = Word.word2hash(catchallString); // "KZzU-Vf6h5k-"
        catchallWord.flags = new Bitfield(4);
        // set all flags so the catchall word matches every flag-constrained search
        for (int i = 0; i < catchallWord.flags.length(); i++) catchallWord.flags.set(i, true);
    }

    // environment constants
    public static final long wCacheMaxAge    = 1000 * 60 * 30; // milliseconds; 30 minutes
    public static final int  wCacheMaxChunk  = 800;            // maximum number of references for each urlhash
    public static final int  lowcachedivisor = 900;
    public static final long targetFileSize  = 64 * 1024 * 1024; // 64 MB target size for a single index file
    public static final int  writeBufferSize = 4 * 1024 * 1024;
    public static final String UrlDbName         = "text.urlmd";
    public static final String termIndexName     = "text.index";
    public static final String citationIndexName = "citation.index";

    // the reference factories
    public static final ReferenceFactory<WordReference>     wordReferenceFactory     = new WordReferenceFactory();
    public static final ReferenceFactory<CitationReference> citationReferenceFactory = new CitationReferenceFactory();
    public static final ByteOrder wordOrder = Base64Order.enhancedCoder;

    private final Log  log;
    private final File segmentPath;
    // the fulltext (solr + url metadata) index; always present
    protected final Fulltext fulltext;
    // RWI term index and citation index; null until attached via connectRWI()/connectCitation()
    protected IndexCell<WordReference>     termIndex;
    protected IndexCell<CitationReference> urlCitationIndex;
    // if true, webgraph (edge) documents are written along with collection documents
    protected boolean writeWebgraph;
2013-02-21 13:23:55 +01:00
/ * *
* create a new Segment
* @param log
* @param segmentPath that should be the path ponting to the directory " SEGMENT "
* @param collectionSchema
* /
public Segment ( final Log log , final File segmentPath ,
final CollectionConfiguration collectionConfiguration , final WebgraphConfiguration webgraphConfiguration ) {
2009-11-02 12:53:14 +01:00
log . logInfo ( " Initializing Segment ' " + segmentPath + " . " ) ;
2009-05-28 16:26:05 +02:00
this . log = log ;
this . segmentPath = segmentPath ;
2011-06-13 23:44:03 +02:00
2009-05-28 16:26:05 +02:00
// create LURL-db
2013-02-21 13:23:55 +01:00
this . fulltext = new Fulltext ( segmentPath , collectionConfiguration , webgraphConfiguration ) ;
2013-02-24 18:09:34 +01:00
this . termIndex = null ;
this . urlCitationIndex = null ;
this . writeWebgraph = false ;
2009-10-09 16:44:20 +02:00
}
2011-06-13 23:44:03 +02:00
2013-02-24 18:09:34 +01:00
    /**
     * Switch the writing of webgraph (edge) documents on or off.
     * @param check true if webgraph documents shall be written along with collection documents
     */
    public void writeWebgraph(boolean check) {
        this.writeWebgraph = check;
    }

    /**
     * @return true if this segment writes webgraph documents for each stored document
     */
    public boolean writeToWebgraph() {
        return this.writeWebgraph;
    }
2012-07-23 16:28:39 +02:00
    /**
     * @return true if the RWI (reverse word index) term index is attached to this segment
     */
    public boolean connectedRWI() {
        return this.termIndex != null;
    }

    /**
     * Attach the RWI (reverse word index) term index to this segment.
     * Does nothing if the index is already connected.
     * @param entityCacheMaxSize maximum number of entries in the index cache
     * @param maxFileSize maximum size of a single index file
     * @throws IOException if the index files cannot be opened
     */
    public void connectRWI(final int entityCacheMaxSize, final long maxFileSize) throws IOException {
        if (this.termIndex != null) return;
        this.termIndex = new IndexCell<WordReference>(
                        new File(this.segmentPath, "default"),
                        termIndexName,
                        wordReferenceFactory,
                        wordOrder,
                        Word.commonHashLength,
                        entityCacheMaxSize,
                        targetFileSize,
                        maxFileSize,
                        writeBufferSize);
    }

    /**
     * Close and detach the RWI term index. Does nothing if not connected.
     */
    public void disconnectRWI() {
        if (this.termIndex == null) return;
        this.termIndex.close();
        this.termIndex = null;
    }
    /**
     * @return true if the url citation index is attached to this segment
     */
    public boolean connectedCitation() {
        return this.urlCitationIndex != null;
    }

    /**
     * Attach the url citation index to this segment.
     * Does nothing if the index is already connected.
     * @param entityCacheMaxSize maximum number of entries in the index cache
     * @param maxFileSize maximum size of a single index file
     * @throws IOException if the index files cannot be opened
     */
    public void connectCitation(final int entityCacheMaxSize, final long maxFileSize) throws IOException {
        if (this.urlCitationIndex != null) return;
        this.urlCitationIndex = new IndexCell<CitationReference>(
                        new File(this.segmentPath, "default"),
                        citationIndexName,
                        citationReferenceFactory,
                        wordOrder,
                        Word.commonHashLength,
                        entityCacheMaxSize,
                        targetFileSize,
                        maxFileSize,
                        writeBufferSize);
    }

    /**
     * Close and detach the url citation index. Does nothing if not connected.
     */
    public void disconnectCitation() {
        if (this.urlCitationIndex == null) return;
        this.urlCitationIndex.close();
        this.urlCitationIndex = null;
    }
    /**
     * Attach the url metadata database to the fulltext index.
     * @param useTailCache passed through to the Fulltext url-db setup
     * @param exceed134217727 presumably allows table files larger than 134217727 bytes (2^27 - 1) — confirm in Fulltext
     */
    public void connectUrlDb(final boolean useTailCache, final boolean exceed134217727) {
        this.fulltext.connectUrlDb(UrlDbName, useTailCache, exceed134217727);
    }
2012-08-17 15:52:33 +02:00
    /** @return the fulltext (solr + url metadata) index of this segment */
    public Fulltext fulltext() {
        return this.fulltext;
    }

    /** @return the RWI term index, or null if not connected */
    public IndexCell<WordReference> termIndex() {
        return this.termIndex;
    }

    /** @return the url citation index, or null if not connected */
    public IndexCell<CitationReference> urlCitation() {
        return this.urlCitationIndex;
    }
2013-03-14 12:13:02 +01:00
/ * *
* compute the click level using the citation reference database
* @param citations the citation database
* @param searchhash the hash of the url to be checked
* @return the clickdepth level or 999 if the root url cannot be found or a recursion limit is reached
* @throws IOException
* /
public int getClickDepth ( final DigestURI url ) throws IOException {
final byte [ ] searchhash = url . hash ( ) ;
RowHandleSet rootCandidates = url . getPossibleRootHashes ( ) ;
RowHandleSet ignore = new RowHandleSet ( URIMetadataRow . rowdef . primaryKeyLength , URIMetadataRow . rowdef . objectOrder , 100 ) ; // a set of urlhashes to be ignored. This is generated from all hashes that are seen during recursion to prevent enless loops
RowHandleSet levelhashes = new RowHandleSet ( URIMetadataRow . rowdef . primaryKeyLength , URIMetadataRow . rowdef . objectOrder , 1 ) ; // all hashes of a clickdepth. The first call contains the target hash only and therefore just one entry
try { levelhashes . put ( searchhash ) ; } catch ( SpaceExceededException e ) { throw new IOException ( e ) ; }
int leveldepth = 0 ; // the recursion depth and therefore the result depth-1. Shall be 0 for the first call
final byte [ ] hosthash = new byte [ 6 ] ; // the host of the url to be checked
System . arraycopy ( searchhash , 6 , hosthash , 0 , 6 ) ;
long timeout = System . currentTimeMillis ( ) + 10000 ;
for ( int maxdepth = 0 ; maxdepth < 10 & & System . currentTimeMillis ( ) < timeout ; maxdepth + + ) {
RowHandleSet checknext = new RowHandleSet ( URIMetadataRow . rowdef . primaryKeyLength , URIMetadataRow . rowdef . objectOrder , 100 ) ;
// loop over all hashes at this clickdepth; the first call to this loop should contain only one hash and a leveldepth = 0
checkloop : for ( byte [ ] urlhash : levelhashes ) {
// get all the citations for this url and iterate
ReferenceContainer < CitationReference > references = this . urlCitationIndex . get ( urlhash , null ) ;
if ( references = = null | | references . size ( ) = = 0 ) continue checkloop ; // don't know
Iterator < CitationReference > i = references . entries ( ) ;
nextloop : while ( i . hasNext ( ) ) {
CitationReference ref = i . next ( ) ;
if ( ref = = null ) continue nextloop ;
byte [ ] u = ref . urlhash ( ) ;
// check ignore
if ( ignore . has ( u ) ) continue nextloop ;
// check if this is from the same host
if ( ! ByteBuffer . equals ( u , 6 , hosthash , 0 , 6 ) ) continue nextloop ;
// check if the url is a root url
if ( rootCandidates . has ( u ) ) {
return leveldepth + 1 ;
}
// step to next depth level
try { checknext . put ( u ) ; } catch ( SpaceExceededException e ) { }
try { ignore . put ( u ) ; } catch ( SpaceExceededException e ) { }
}
}
leveldepth + + ;
levelhashes = checknext ;
}
return 999 ;
}
2012-07-23 16:28:39 +02:00
    /**
     * @return the maximum number of RWI entries, or 0 if no term index is connected
     */
    public long RWICount() {
        if (this.termIndex == null) return 0;
        return this.termIndex.sizesMax();
    }

    /**
     * @return the number of RWI entries held in the write buffer, or 0 if no term index is connected
     */
    public int RWIBufferCount() {
        if (this.termIndex == null) return 0;
        return this.termIndex.getBufferSize();
    }
2013-02-26 17:16:31 +01:00
    /**
     * Get a guess about the word count. This is only a guess because it uses the term index if present
     * and this index may be influenced by index transmission processes in its statistic word distribution.
     * However, it can be a hint for heuristics which use the word count. Please do NOT use this if the
     * termIndex is not present because it otherwise uses the solr index which makes it painfully slow.
     * @param word
     * @return the number of references for this word
     */
    public int getWordCountGuess(String word) {
        // words containing separator characters would break the solr query syntax below; report 0
        if (word == null || word.indexOf(':') >= 0 || word.indexOf(' ') >= 0 || word.indexOf('/') >= 0) return 0;
        if (this.termIndex != null) {
            int count = this.termIndex.count(Word.word2hash(word));
            if (count > 0) return count;
        }
        try {
            // fallback: ask the solr fulltext index (slow)
            return (int) this.fulltext.getDefaultConnector().getCountByQuery(CollectionSchema.text_t.getSolrFieldName() + ":\"" + word + "\"");
        } catch (Throwable e) {
            // boundary catch: any solr failure degrades to "no guess"
            Log.logException(e);
            return 0;
        }
    }
2013-02-22 15:45:15 +01:00
    /**
     * Check if a document with the given url hash exists in the fulltext index.
     * @param urlhash the url hash of the document
     * @return true if the document exists
     */
    public boolean exists(final String urlhash) {
        return this.fulltext.exists(urlhash);
    }
2011-06-13 23:44:03 +02:00
2012-06-13 15:53:18 +02:00
    /**
     * Discover all urls that start with a given url stub.
     * If the stub is null, ALL documents of the index are iterated.
     * @param stub the url prefix to match, or null for all documents
     * @param maxtime maximum time for the underlying solr query
     * @param maxcount maximum number of documents to fetch
     * @return an iterator for all matching urls
     */
    public Iterator<DigestURI> urlSelector(final MultiProtocolURI stub, final long maxtime, final int maxcount) {
        final BlockingQueue<SolrDocument> docQueue;
        final String urlstub;
        if (stub == null) {
            // no stub: stream all documents (catchall query), fetching only id and sku fields
            docQueue = this.fulltext.getDefaultConnector().concurrentDocumentsByQuery(AbstractSolrConnector.CATCHALL_TERM, 0, Integer.MAX_VALUE, maxtime, maxcount, CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName());
            urlstub = null;
        } else {
            // pre-filter by host hash on the solr side; the exact prefix match is done client-side below
            final String host = stub.getHost();
            String hh = DigestURI.hosthash(host);
            docQueue = this.fulltext.getDefaultConnector().concurrentDocumentsByQuery(CollectionSchema.host_id_s + ":\"" + hh + "\"", 0, Integer.MAX_VALUE, maxtime, maxcount, CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName());
            urlstub = stub.toNormalform(true);
        }

        // now filter the stub from the iterated urls
        return new LookAheadIterator<DigestURI>() {
            @Override
            protected DigestURI next0() {
                while (true) {
                    SolrDocument doc;
                    try {
                        // blocks until the producer delivers a document or the poison pill
                        doc = docQueue.take();
                    } catch (InterruptedException e) {
                        Log.logException(e);
                        return null;
                    }
                    // poison document signals end of the queue
                    if (doc == null || doc == AbstractSolrConnector.POISON_DOCUMENT) return null;
                    String u = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName());
                    String id = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName());
                    DigestURI url;
                    try {
                        url = new DigestURI(u, ASCII.getBytes(id));
                    } catch (MalformedURLException e) {
                        continue; // skip malformed entries silently
                    }
                    if (urlstub == null || u.startsWith(urlstub)) return url;
                }
            }
        };
    }
2009-05-28 16:26:05 +02:00
public void clear ( ) {
try {
2012-07-22 13:18:45 +02:00
if ( this . termIndex ! = null ) this . termIndex . clear ( ) ;
2013-01-04 16:39:34 +01:00
if ( this . fulltext ! = null ) this . fulltext . clearURLIndex ( ) ;
if ( this . fulltext ! = null ) this . fulltext . clearLocalSolr ( ) ;
if ( this . fulltext ! = null ) this . fulltext . clearRemoteSolr ( ) ;
2012-07-22 13:18:45 +02:00
if ( this . urlCitationIndex ! = null ) this . urlCitationIndex . clear ( ) ;
2009-05-28 16:26:05 +02:00
} catch ( final IOException e ) {
2009-11-05 21:28:37 +01:00
Log . logException ( e ) ;
2009-05-28 16:26:05 +02:00
}
}
2013-03-13 17:55:37 +01:00
    /**
     * Flush the caches of all connected indexes.
     */
    public void clearCache() {
        if (this.urlCitationIndex != null) this.urlCitationIndex.clearCache();
        if (this.termIndex != null) this.termIndex.clearCache();
        this.fulltext.clearCache();
    }
2011-06-13 23:44:03 +02:00
2009-05-28 16:26:05 +02:00
    /**
     * @return the path pointing to the segment directory
     */
    public File getLocation() {
        return this.segmentPath;
    }
2013-02-22 15:45:15 +01:00
private int addCitationIndex ( final DigestURI url , final Date urlModified , final Map < DigestURI , Properties > anchors ) {
2012-04-27 14:18:02 +02:00
if ( anchors = = null ) return 0 ;
2012-03-29 17:20:14 +02:00
int refCount = 0 ;
// iterate over all outgoing links, this will create a context for those links
final byte [ ] urlhash = url . hash ( ) ;
final long urldate = urlModified . getTime ( ) ;
2013-02-22 15:45:15 +01:00
for ( Map . Entry < DigestURI , Properties > anchorEntry : anchors . entrySet ( ) ) {
DigestURI anchor = anchorEntry . getKey ( ) ;
byte [ ] refhash = anchor . hash ( ) ;
2012-03-29 17:20:14 +02:00
//System.out.println("*** addCitationIndex: urlhash = " + ASCII.String(urlhash) + ", refhash = " + ASCII.String(refhash) + ", urldate = " + urlModified.toString());
2012-07-23 16:28:39 +02:00
if ( this . urlCitationIndex ! = null ) try {
2012-04-13 11:52:59 +02:00
this . urlCitationIndex . add ( refhash , new CitationReference ( urlhash , urldate ) ) ;
2012-03-29 17:20:14 +02:00
} catch ( final Exception e ) {
Log . logException ( e ) ;
}
refCount + + ;
}
return refCount ;
}
2012-04-27 14:18:02 +02:00
2012-05-14 07:41:55 +02:00
    /**
     * Close all connected indexes of this segment.
     */
    public synchronized void close() {
        if (this.termIndex != null) this.termIndex.close();
        if (this.fulltext != null) this.fulltext.close();
        if (this.urlCitationIndex != null) this.urlCitationIndex.close();
    }
2012-10-18 14:29:11 +02:00
private static String votedLanguage (
2012-07-25 01:53:47 +02:00
final DigestURI url ,
final String urlNormalform ,
final Document document ,
final Condenser condenser ) {
// do a identification of the language
2009-05-28 16:26:05 +02:00
String language = condenser . language ( ) ; // this is a statistical analysation of the content: will be compared with other attributes
2011-06-13 23:44:03 +02:00
final String bymetadata = document . dc_language ( ) ; // the languageByMetadata may return null if there was no declaration
2009-05-28 16:26:05 +02:00
if ( language = = null ) {
// no statistics available, we take either the metadata (if given) or the TLD
2009-05-29 12:03:35 +02:00
language = ( bymetadata = = null ) ? url . language ( ) : bymetadata ;
2009-05-28 16:26:05 +02:00
} else {
if ( bymetadata = = null ) {
// two possible results: compare and report conflicts
2012-10-18 14:29:11 +02:00
if ( ! language . equals ( url . language ( ) ) ) {
2009-05-28 16:26:05 +02:00
// see if we have a hint in the url that the statistic was right
2012-07-25 01:53:47 +02:00
final String u = urlNormalform . toLowerCase ( ) ;
2009-10-11 02:12:19 +02:00
if ( ! u . contains ( " / " + language + " / " ) & & ! u . contains ( " / " + ISO639 . country ( language ) . toLowerCase ( ) + " / " ) ) {
2009-05-28 16:26:05 +02:00
// no confirmation using the url, use the TLD
2009-05-29 12:03:35 +02:00
language = url . language ( ) ;
2009-05-28 16:26:05 +02:00
} else {
// this is a strong hint that the statistics was in fact correct
}
}
} else {
// here we have three results: we can do a voting
if ( language . equals ( bymetadata ) ) {
2009-09-14 23:17:42 +02:00
//if (log.isFine()) log.logFine("LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - METADATA IDENTICAL: " + language);
2009-05-29 12:03:35 +02:00
} else if ( language . equals ( url . language ( ) ) ) {
2009-09-14 23:17:42 +02:00
//if (log.isFine()) log.logFine("LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFIRMED - TLD IS IDENTICAL: " + language);
2009-05-29 12:03:35 +02:00
} else if ( bymetadata . equals ( url . language ( ) ) ) {
2009-09-14 23:17:42 +02:00
//if (log.isFine()) log.logFine("LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: " + language + " BUT METADATA AND TLD ARE IDENTICAL: " + bymetadata + ")");
2009-05-28 16:26:05 +02:00
language = bymetadata ;
} else {
2009-09-14 23:17:42 +02:00
//if (log.isFine()) log.logFine("LANGUAGE-BY-STATISTICS: " + entry.url() + " CONFLICTING: ALL DIFFERENT! statistic: " + language + ", metadata: " + bymetadata + ", TLD: + " + entry.url().language() + ". taking metadata.");
2009-05-28 16:26:05 +02:00
language = bymetadata ;
}
}
}
2012-07-25 01:53:47 +02:00
return language ;
}
2011-06-13 23:44:03 +02:00
2012-08-31 14:35:56 +02:00
    /**
     * Store a whole reference container to the RWI term index; does nothing if no term index is connected.
     * @param wordContainer the references to be stored
     * @throws IOException
     * @throws SpaceExceededException
     */
    public void storeRWI(final ReferenceContainer<WordReference> wordContainer) throws IOException, SpaceExceededException {
        if (this.termIndex != null) this.termIndex.add(wordContainer);
    }

    /**
     * Store a single word reference to the RWI term index; does nothing if no term index is connected.
     * @param termHash the hash of the word
     * @param entry the reference to be stored
     * @throws IOException
     * @throws SpaceExceededException
     */
    public void storeRWI(final byte[] termHash, final WordReference entry) throws IOException, SpaceExceededException {
        if (this.termIndex != null) this.termIndex.add(termHash, entry);
    }
2012-10-18 14:29:11 +02:00
public SolrInputDocument storeDocument (
2012-07-25 01:53:47 +02:00
final DigestURI url ,
final DigestURI referrerURL ,
2013-04-24 01:14:35 +02:00
final String [ ] collections ,
2012-07-25 01:53:47 +02:00
final ResponseHeader responseHeader ,
final Document document ,
final Condenser condenser ,
final SearchEvent searchEvent ,
2012-10-09 11:48:55 +02:00
final String sourceName ,
final boolean storeToRWI
2012-09-26 16:05:11 +02:00
) {
2012-07-25 01:53:47 +02:00
final long startTime = System . currentTimeMillis ( ) ;
2013-01-23 14:40:58 +01:00
// DO A SOFT/HARD COMMIT IF NEEDED
if ( MemoryControl . shortStatus ( ) ) {
// do a 'hard' commit to flush index caches
2013-02-22 15:45:15 +01:00
this . fulltext . commit ( false ) ;
2013-01-23 14:40:58 +01:00
} else {
if (
2013-02-21 13:23:55 +01:00
( this . fulltext . getDefaultConfiguration ( ) . contains ( CollectionSchema . exact_signature_l ) & & this . fulltext . getDefaultConfiguration ( ) . contains ( CollectionSchema . exact_signature_unique_b ) ) | |
( this . fulltext . getDefaultConfiguration ( ) . contains ( CollectionSchema . fuzzy_signature_l ) & & this . fulltext . getDefaultConfiguration ( ) . contains ( CollectionSchema . fuzzy_signature_unique_b ) ) | |
this . fulltext . getDefaultConfiguration ( ) . contains ( CollectionSchema . title_unique_b ) | |
this . fulltext . getDefaultConfiguration ( ) . contains ( CollectionSchema . description_unique_b )
2013-01-23 14:40:58 +01:00
) {
2013-02-21 13:23:55 +01:00
this . fulltext . getDefaultConnector ( ) . commit ( true ) ; // make sure that we have latest information for the postprocessing steps
2013-01-23 14:40:58 +01:00
}
}
2012-07-25 01:53:47 +02:00
// CREATE INDEX
// load some document metadata
2012-10-18 14:29:11 +02:00
final Date loadDate = new Date ( ) ;
2012-07-25 01:53:47 +02:00
final String id = ASCII . String ( url . hash ( ) ) ;
final String dc_title = document . dc_title ( ) ;
2012-10-10 11:46:22 +02:00
final String urlNormalform = url . toNormalform ( true ) ;
2012-07-25 01:53:47 +02:00
final String language = votedLanguage ( url , urlNormalform , document , condenser ) ; // identification of the language
// STORE URL TO LOADED-URL-DB
2012-11-07 02:46:51 +01:00
Date modDate = responseHeader = = null ? new Date ( ) : responseHeader . lastModified ( ) ;
2012-10-18 14:29:11 +02:00
if ( modDate . getTime ( ) > loadDate . getTime ( ) ) modDate = loadDate ;
2012-07-25 01:53:47 +02:00
char docType = Response . docType ( document . dc_format ( ) ) ;
2012-10-18 14:29:11 +02:00
2012-11-21 18:46:49 +01:00
// CREATE SOLR DOCUMENT
2013-04-24 01:14:35 +02:00
final CollectionConfiguration . SolrVector vector = this . fulltext . getDefaultConfiguration ( ) . yacy2solr ( id , collections , responseHeader , document , condenser , referrerURL , language , urlCitationIndex , this . fulltext . getWebgraphConfiguration ( ) ) ;
2012-11-21 18:46:49 +01:00
// FIND OUT IF THIS IS A DOUBLE DOCUMENT
2013-04-17 16:15:27 +02:00
String hostid = url . hosthash ( ) ;
2013-02-21 13:23:55 +01:00
for ( CollectionSchema [ ] checkfields : new CollectionSchema [ ] [ ] {
{ CollectionSchema . exact_signature_l , CollectionSchema . exact_signature_unique_b } ,
{ CollectionSchema . fuzzy_signature_l , CollectionSchema . fuzzy_signature_unique_b } } ) {
CollectionSchema checkfield = checkfields [ 0 ] ;
CollectionSchema uniquefield = checkfields [ 1 ] ;
if ( this . fulltext . getDefaultConfiguration ( ) . contains ( checkfield ) & & this . fulltext . getDefaultConfiguration ( ) . contains ( uniquefield ) ) {
2012-11-21 18:46:49 +01:00
// lookup the document with the same signature
2013-02-22 15:45:15 +01:00
long signature = ( ( Long ) vector . getField ( checkfield . getSolrFieldName ( ) ) . getValue ( ) ) . longValue ( ) ;
2012-11-21 18:46:49 +01:00
try {
2013-04-17 16:15:27 +02:00
if ( this . fulltext . getDefaultConnector ( ) . existsByQuery ( CollectionSchema . host_id_s + " : \" " + hostid + " \" AND " + checkfield . getSolrFieldName ( ) + " : \" " + Long . toString ( signature ) + " \" " ) ) {
2012-11-21 18:46:49 +01:00
// change unique attribut in content
2013-02-22 15:45:15 +01:00
vector . setField ( uniquefield . getSolrFieldName ( ) , false ) ;
2012-11-21 18:46:49 +01:00
}
} catch ( IOException e ) { }
}
}
2013-01-23 14:40:58 +01:00
// CHECK IF TITLE AND DESCRIPTION IS UNIQUE (this is by default not switched on)
2013-04-16 01:35:15 +02:00
if ( this . fulltext . getDefaultConfiguration ( ) . contains ( CollectionSchema . host_id_s ) ) {
uniquecheck : for ( CollectionSchema [ ] checkfields : new CollectionSchema [ ] [ ] {
{ CollectionSchema . title , CollectionSchema . title_exact_signature_l , CollectionSchema . title_unique_b } ,
{ CollectionSchema . description , CollectionSchema . description_exact_signature_l , CollectionSchema . description_unique_b } } ) {
CollectionSchema checkfield = checkfields [ 0 ] ;
CollectionSchema signaturefield = checkfields [ 1 ] ;
CollectionSchema uniquefield = checkfields [ 2 ] ;
if ( this . fulltext . getDefaultConfiguration ( ) . contains ( checkfield ) & & this . fulltext . getDefaultConfiguration ( ) . contains ( signaturefield ) & & this . fulltext . getDefaultConfiguration ( ) . contains ( uniquefield ) ) {
// lookup in the index within the same hosts for the same title or description
//String checkstring = checkfield == CollectionSchema.title ? document.dc_title() : document.dc_description();
Long checkhash = ( Long ) vector . getFieldValue ( signaturefield . getSolrFieldName ( ) ) ;
if ( checkhash = = null ) {
2013-02-22 15:45:15 +01:00
vector . setField ( uniquefield . getSolrFieldName ( ) , false ) ;
2013-04-16 01:35:15 +02:00
continue uniquecheck ;
2013-01-23 14:40:58 +01:00
}
2013-04-16 01:35:15 +02:00
try {
if ( this . fulltext . getDefaultConnector ( ) . existsByQuery ( CollectionSchema . host_id_s + " : \" " + hostid + " \" AND " + signaturefield . getSolrFieldName ( ) + " : \" " + checkhash . toString ( ) + " \" " ) ) {
// switch unique attribute in new document
vector . setField ( uniquefield . getSolrFieldName ( ) , false ) ;
// switch attribute also in all existing documents (which should be exactly only one!)
2013-04-27 01:32:18 +02:00
SolrDocumentList docs = this . fulltext . getDefaultConnector ( ) . getDocumentListByQuery ( CollectionSchema . host_id_s + " : \" " + hostid + " \" AND " + signaturefield . getSolrFieldName ( ) + " : \" " + checkhash . toString ( ) + " \" AND " + uniquefield . getSolrFieldName ( ) + " :true " , 0 , 1000 ) ;
2013-04-16 01:35:15 +02:00
for ( SolrDocument doc : docs ) {
SolrInputDocument sid = this . fulltext . getDefaultConfiguration ( ) . toSolrInputDocument ( doc ) ;
sid . setField ( uniquefield . getSolrFieldName ( ) , false ) ;
this . fulltext . getDefaultConnector ( ) . add ( sid ) ;
}
} else {
vector . setField ( uniquefield . getSolrFieldName ( ) , true ) ;
}
} catch ( IOException e ) { }
}
2013-01-23 14:40:58 +01:00
}
}
2012-12-18 14:42:35 +01:00
// ENRICH DOCUMENT WITH RANKING INFORMATION
2013-04-14 02:01:27 +02:00
if ( this . connectedCitation ( ) ) {
2013-04-14 20:52:40 +02:00
this . fulltext . getDefaultConfiguration ( ) . postprocessing_references ( this , null , vector , url , null ) ;
2013-04-14 02:01:27 +02:00
}
2012-11-21 18:46:49 +01:00
// STORE TO SOLR
2012-11-13 16:54:28 +01:00
String error = null ;
tryloop : for ( int i = 0 ; i < 20 ; i + + ) {
2012-11-07 15:37:14 +01:00
try {
2012-11-13 16:54:28 +01:00
error = null ;
2013-02-22 15:45:15 +01:00
this . fulltext . putDocument ( vector ) ;
break tryloop ;
} catch ( final IOException e ) {
2013-05-03 00:24:39 +02:00
error = " failed to send " + urlNormalform + " to solr: " + e . getMessage ( ) ;
Log . logWarning ( " SOLR " , error ) ;
2013-02-22 15:45:15 +01:00
if ( i = = 10 ) this . fulltext . commit ( false ) ;
try { Thread . sleep ( 1000 ) ; } catch ( InterruptedException e1 ) { }
continue tryloop ;
}
}
2013-02-24 18:09:34 +01:00
if ( this . writeWebgraph ) {
tryloop : for ( int i = 0 ; i < 20 ; i + + ) {
try {
error = null ;
this . fulltext . putEdges ( vector . getWebgraphDocuments ( ) ) ;
break tryloop ;
} catch ( final IOException e ) {
2013-05-03 00:24:39 +02:00
error = " failed to send " + urlNormalform + " to solr: " + e . getMessage ( ) ;
Log . logWarning ( " SOLR " , error ) ;
2013-02-24 18:09:34 +01:00
if ( i = = 10 ) this . fulltext . commit ( false ) ;
try { Thread . sleep ( 1000 ) ; } catch ( InterruptedException e1 ) { }
continue tryloop ;
}
2012-11-07 15:37:14 +01:00
}
2012-11-13 16:54:28 +01:00
}
if ( error ! = null ) {
2013-05-03 00:24:39 +02:00
Log . logSevere ( " SOLR " , error + " , PLEASE REPORT TO bugs.yacy.net " ) ;
//Switchboard.getSwitchboard().pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL, error);
//Switchboard.getSwitchboard().pauseCrawlJob(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL, error);
2012-08-05 15:49:27 +02:00
}
2012-08-10 15:39:10 +02:00
final long storageEndTime = System . currentTimeMillis ( ) ;
2009-05-28 16:26:05 +02:00
// STORE PAGE INDEX INTO WORD INDEX DB
2012-07-25 01:53:47 +02:00
int outlinksSame = document . inboundLinks ( ) . size ( ) ;
int outlinksOther = document . outboundLinks ( ) . size ( ) ;
final int urlLength = urlNormalform . length ( ) ;
final int urlComps = MultiProtocolURI . urlComps ( url . toString ( ) ) . length ;
// create a word prototype which is re-used for all entries
2012-10-09 11:48:55 +02:00
if ( ( this . termIndex ! = null & & storeToRWI ) | | searchEvent ! = null ) {
final int len = ( document = = null ) ? urlLength : document . dc_title ( ) . length ( ) ;
final WordReferenceRow ientry = new WordReferenceRow (
url . hash ( ) ,
urlLength , urlComps , len ,
condenser . RESULT_NUMB_WORDS ,
condenser . RESULT_NUMB_SENTENCES ,
modDate . getTime ( ) ,
System . currentTimeMillis ( ) ,
UTF8 . getBytes ( language ) ,
docType ,
outlinksSame , outlinksOther ) ;
// iterate over all words of content text
Word wprop = null ;
byte [ ] wordhash ;
String word ;
for ( Map . Entry < String , Word > wentry : condenser . words ( ) . entrySet ( ) ) {
word = wentry . getKey ( ) ;
wprop = wentry . getValue ( ) ;
assert ( wprop . flags ! = null ) ;
ientry . setWord ( wprop ) ;
wordhash = Word . word2hash ( word ) ;
if ( this . termIndex ! = null & & storeToRWI ) try {
this . termIndex . add ( wordhash , ientry ) ;
} catch ( final Exception e ) {
Log . logException ( e ) ;
}
// during a search event it is possible that a heuristic is used which aquires index
// data during search-time. To transfer indexed data directly to the search process
// the following lines push the index data additionally to the search process
// this is done only for searched words
2012-11-18 01:22:41 +01:00
if ( searchEvent ! = null & & ! searchEvent . query . getQueryGoal ( ) . getExcludeHashes ( ) . has ( wordhash ) & & searchEvent . query . getQueryGoal ( ) . getIncludeHashes ( ) . has ( wordhash ) ) {
2012-10-09 11:48:55 +02:00
// if the page was added in the context of a heuristic this shall ensure that findings will fire directly into the search result
ReferenceContainer < WordReference > container ;
try {
container = ReferenceContainer . emptyContainer ( Segment . wordReferenceFactory , wordhash , 1 ) ;
container . add ( ientry ) ;
2013-03-05 21:28:22 +01:00
searchEvent . addRWIs ( container , true , sourceName , 1 , 5000 ) ;
2012-10-09 11:48:55 +02:00
} catch ( final SpaceExceededException e ) {
continue ;
}
}
}
2013-02-26 17:16:31 +01:00
if ( searchEvent ! = null ) searchEvent . addFinalize ( ) ;
2012-10-09 11:48:55 +02:00
// assign the catchall word
ientry . setWord ( wprop = = null ? catchallWord : wprop ) ; // we use one of the word properties as template to get the document characteristics
2012-07-25 01:53:47 +02:00
if ( this . termIndex ! = null ) try {
2012-10-09 11:48:55 +02:00
this . termIndex . add ( catchallHash , ientry ) ;
2012-07-25 01:53:47 +02:00
} catch ( final Exception e ) {
Log . logException ( e ) ;
}
}
2012-04-27 14:18:02 +02:00
2012-03-29 17:20:14 +02:00
// STORE PAGE REFERENCES INTO CITATION INDEX
2012-04-27 14:18:02 +02:00
final int refs = addCitationIndex ( url , modDate , document . getAnchors ( ) ) ;
2012-03-29 17:20:14 +02:00
// finish index time
2009-05-28 16:26:05 +02:00
final long indexingEndTime = System . currentTimeMillis ( ) ;
2011-06-13 23:44:03 +02:00
if ( this . log . isInfo ( ) ) {
2012-10-29 21:42:31 +01:00
this . log . logInfo ( " *Indexed " + condenser . words ( ) . size ( ) + " words in URL " + url +
2012-07-25 01:53:47 +02:00
" [ " + id + " ] " +
2009-05-28 16:26:05 +02:00
" \ n \ tDescription: " + dc_title +
" \ n \ tMimeType: " + document . dc_format ( ) + " | Charset: " + document . getCharset ( ) + " | " +
" Size: " + document . getTextLength ( ) + " bytes | " +
2012-03-29 17:20:14 +02:00
" Anchors: " + refs +
2009-05-28 16:26:05 +02:00
" \ n \ tLinkStorageTime: " + ( storageEndTime - startTime ) + " ms | " +
" indexStorageTime: " + ( indexingEndTime - storageEndTime ) + " ms " ) ;
}
2011-06-13 23:44:03 +02:00
2009-05-28 16:26:05 +02:00
// finished
2013-02-22 15:45:15 +01:00
return vector ;
2009-05-28 16:26:05 +02:00
}
2011-06-13 23:44:03 +02:00
public void removeAllUrlReferences ( final HandleSet urls , final LoaderDispatcher loader , final CacheStrategy cacheStrategy ) {
for ( final byte [ ] urlhash : urls ) removeAllUrlReferences ( urlhash , loader , cacheStrategy ) ;
2010-04-15 15:22:59 +02:00
}
2011-06-13 23:44:03 +02:00
2010-06-21 16:54:54 +02:00
/ * *
* find all the words in a specific resource and remove the url reference from every word index
* finally , delete the url entry
* @param urlhash the hash of the url that shall be removed
* @param loader
* @param cacheStrategy
* @return number of removed words
* /
2011-06-13 23:44:03 +02:00
public int removeAllUrlReferences ( final byte [ ] urlhash , final LoaderDispatcher loader , final CacheStrategy cacheStrategy ) {
2010-06-21 16:54:54 +02:00
2009-10-23 00:38:04 +02:00
if ( urlhash = = null ) return 0 ;
// determine the url string
2012-11-23 01:35:28 +01:00
final DigestURI url = fulltext ( ) . getURL ( urlhash ) ;
if ( url = = null ) return 0 ;
2011-06-13 23:44:03 +02:00
2009-10-23 00:38:04 +02:00
try {
2010-06-22 14:28:53 +02:00
// parse the resource
2012-11-23 01:35:28 +01:00
final Document document = Document . mergeDocuments ( url , null , loader . loadDocuments ( loader . request ( url , true , false ) , cacheStrategy , Integer . MAX_VALUE , null , CrawlQueues . queuedMinLoadDelay ) ) ;
2010-06-22 14:28:53 +02:00
if ( document = = null ) {
2009-10-23 00:38:04 +02:00
// delete just the url entry
2012-08-17 15:52:33 +02:00
fulltext ( ) . remove ( urlhash ) ;
2009-10-23 00:38:04 +02:00
return 0 ;
}
2010-06-22 14:28:53 +02:00
// get the word set
Set < String > words = null ;
2012-10-02 00:02:50 +02:00
words = new Condenser ( document , true , true , null , null , false ) . words ( ) . keySet ( ) ;
2011-06-13 23:44:03 +02:00
2010-06-22 14:28:53 +02:00
// delete all word references
int count = 0 ;
if ( words ! = null ) count = termIndex ( ) . remove ( Word . words2hashesHandles ( words ) , urlhash ) ;
2011-06-13 23:44:03 +02:00
2010-06-22 14:28:53 +02:00
// finally delete the url entry itself
2012-08-17 15:52:33 +02:00
fulltext ( ) . remove ( urlhash ) ;
2010-06-22 14:28:53 +02:00
return count ;
2010-06-29 21:20:45 +02:00
} catch ( final Parser . Failure e ) {
2009-10-23 00:38:04 +02:00
return 0 ;
2011-06-13 23:44:03 +02:00
} catch ( final IOException e ) {
2009-11-05 21:28:37 +01:00
Log . logException ( e ) ;
2009-10-23 00:38:04 +02:00
return 0 ;
}
}
2009-05-28 16:26:05 +02:00
}