2010-03-05 22:25:49 +01:00
// RankingProcess.java
2007-11-07 23:38:09 +01:00
// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 07.11.2007 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
2009-09-05 22:41:21 +02:00
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
2007-11-07 23:38:09 +01:00
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
2009-07-09 00:14:57 +02:00
package de.anomic.search ;
2007-11-07 23:38:09 +01:00
import java.io.File ;
import java.io.IOException ;
2008-02-07 23:16:36 +01:00
import java.util.ArrayList ;
2009-05-27 00:30:20 +02:00
import java.util.Comparator ;
2009-11-12 00:31:12 +01:00
import java.util.ConcurrentModificationException ;
2007-11-07 23:38:09 +01:00
import java.util.Iterator ;
2010-03-05 22:25:49 +01:00
import java.util.List ;
2007-11-07 23:38:09 +01:00
import java.util.Map ;
2010-04-25 23:37:36 +02:00
import java.util.TreeMap ;
2009-08-28 00:54:32 +02:00
import java.util.concurrent.BlockingQueue ;
2008-02-27 16:16:47 +01:00
import java.util.concurrent.ConcurrentHashMap ;
2009-09-24 21:14:35 +02:00
import java.util.concurrent.TimeUnit ;
2007-11-07 23:38:09 +01:00
2010-05-25 14:54:57 +02:00
import net.yacy.cora.document.MultiProtocolURI ;
2010-09-09 17:30:25 +02:00
import net.yacy.cora.storage.WeakPriorityBlockingQueue ;
import net.yacy.cora.storage.WeakPriorityBlockingQueue.ReverseElement ;
2009-10-18 02:53:43 +02:00
import net.yacy.document.Condenser ;
2009-10-11 02:12:19 +02:00
import net.yacy.kelondro.data.meta.DigestURI ;
import net.yacy.kelondro.data.meta.URIMetadataRow ;
2010-01-29 16:59:24 +01:00
import net.yacy.kelondro.data.meta.URIMetadataRow.Components ;
2009-10-11 02:12:19 +02:00
import net.yacy.kelondro.data.word.Word ;
import net.yacy.kelondro.data.word.WordReference ;
import net.yacy.kelondro.data.word.WordReferenceVars ;
2009-10-10 01:32:08 +02:00
import net.yacy.kelondro.index.BinSearch ;
2010-04-15 15:22:59 +02:00
import net.yacy.kelondro.index.HandleSet ;
import net.yacy.kelondro.index.RowSpaceExceededException ;
2009-11-05 21:28:37 +01:00
import net.yacy.kelondro.logging.Log ;
2009-10-10 01:22:22 +02:00
import net.yacy.kelondro.order.Digest ;
2009-10-10 02:39:15 +02:00
import net.yacy.kelondro.rwi.ReferenceContainer ;
import net.yacy.kelondro.rwi.TermSearch ;
2009-12-08 15:25:51 +01:00
import net.yacy.kelondro.util.EventTracker ;
2009-10-10 03:14:19 +02:00
import net.yacy.kelondro.util.FileUtils ;
2009-10-10 01:22:22 +02:00
2009-10-20 00:34:44 +02:00
import de.anomic.yacy.graphics.ProfilingGraph ;
2007-11-07 23:38:09 +01:00
2009-08-24 17:24:02 +02:00
/**
 * A ranking process collects word references from the local index and from remote
 * peers, filters them against the query constraints, and keeps the best entries in
 * a bounded priority queue from which result URLs are taken.
 * The process itself is a Thread that performs the local index search when run.
 */
public final class RankingProcess extends Thread {
    
    public static BinSearch[] ybrTables = null; // block-rank tables
    private static final int maxYBR = 3; // the lower this value, the faster the search
    private static boolean useYBR = true; // global switch for block-rank usage
    // capacity limits of the per-domain overflow queues (larger for privileged queries)
    private static final int maxDoubleDomAll = 100, maxDoubleDomSpecial = 10000;
    
    private final QueryParams query; // the query this process ranks results for
    private final HandleSet urlhashes; // map for double-check; String/Long relation, addresses ranking number (backreference for deletion)
    private final int[] flagcount; // flag counter; one slot per bit of the reference flags
    private final HandleSet misses; // contains url-hashes that could not been found in the LURL-DB
    //private final int[] domZones;
    // inclusion map of the local term search; set by run(), read for abstract generation
    private TreeMap<byte[], ReferenceContainer<WordReference>> localSearchInclusion;
    
    // statistics counters for remote and local contributions
    private int remote_resourceSize, remote_indexCount, remote_peerCount;
    private int local_resourceSize, local_indexCount;
    // bounded priority queue holding the currently best-ranked references
    private final WeakPriorityBlockingQueue<ReverseElement<WordReferenceVars>> stack;
    // number of feeder threads still expected to deliver results
    // NOTE(review): mutated from several threads without synchronization — see feeder methods
    private int feeders;
    private final ConcurrentHashMap<String, WeakPriorityBlockingQueue<ReverseElement<WordReferenceVars>>> doubleDomCache; // key = domhash (6 bytes); value = like stack
    //private final HandleSet handover; // key = urlhash; used for double-check of urls that had been handed over to search process
    
    private final Navigator ref; // reference score computation for the commonSense heuristic
    private final Navigator hostNavigator; // host facet counter
    private final Navigator authorNavigator; // author facet counter
    private final Navigator namespaceNavigator; // wiki-style namespace facet counter
    private final ReferenceOrder order; // ranking order used to normalize and weigh references
    /**
     * Creates a ranking process for the given query.
     * We collect the urlhashes and construct a list with urlEntry objects.
     * Attention: if minEntries is too high, this method will not terminate within the maxTime.
     * Sortorder: 0 = hash, 1 = url, 2 = ranking.
     * @param query the search query parameters
     * @param order the reference order used for ranking
     * @param maxentries capacity of the ranking queue
     * @param concurrency initial number of feeder threads that will deliver results
     */
    public RankingProcess(final QueryParams query, final ReferenceOrder order, final int maxentries, final int concurrency) {
        this.localSearchInclusion = null;
        this.stack = new WeakPriorityBlockingQueue<ReverseElement<WordReferenceVars>>(maxentries);
        this.doubleDomCache = new ConcurrentHashMap<String, WeakPriorityBlockingQueue<ReverseElement<WordReferenceVars>>>();
        this.query = query;
        this.order = order;
        this.remote_peerCount = 0;
        this.remote_resourceSize = 0;
        this.remote_indexCount = 0;
        this.local_resourceSize = 0;
        this.local_indexCount = 0;
        this.urlhashes = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
        this.misses = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
        this.flagcount = new int[32];
        for (int i = 0; i < 32; i++) { this.flagcount[i] = 0; }
        this.hostNavigator = new Navigator();
        this.authorNavigator = new Navigator();
        this.namespaceNavigator = new Navigator();
        this.ref = new Navigator();
        this.feeders = concurrency;
        assert this.feeders >= 1;
    }
    /** @return the query parameters this ranking process was created for */
    public QueryParams getQuery() {
        return this.query;
    }
    
    /** @return the reference order used for normalization and ranking weights */
    public ReferenceOrder getOrder() {
        return this.order;
    }
2009-08-24 17:24:02 +02:00
public void run ( ) {
2009-08-27 17:19:48 +02:00
// do a search
2009-08-24 17:24:02 +02:00
// sort the local containers and truncate it to a limited count,
// so following sortings together with the global results will be fast
try {
2009-08-27 17:19:48 +02:00
long timer = System . currentTimeMillis ( ) ;
2009-11-24 12:13:11 +01:00
final TermSearch < WordReference > search = this . query . getSegment ( ) . termIndex ( ) . query (
2009-08-27 17:19:48 +02:00
query . queryHashes ,
query . excludeHashes ,
null ,
Segment . wordReferenceFactory ,
query . maxDistance ) ;
this . localSearchInclusion = search . inclusion ( ) ;
final ReferenceContainer < WordReference > index = search . joined ( ) ;
2010-09-13 11:33:04 +02:00
EventTracker . update ( EventTracker . EClass . SEARCH , new ProfilingGraph . searchEvent ( query . id ( true ) , SearchEvent . Type . JOIN , query . queryString , index . size ( ) , System . currentTimeMillis ( ) - timer ) , false ) ;
2009-12-02 01:37:59 +01:00
if ( index . isEmpty ( ) ) {
2009-08-27 17:19:48 +02:00
return ;
}
2010-09-13 11:33:04 +02:00
add ( index , true , " local index: " + this . query . getSegment ( ) . getLocation ( ) , - 1 ) ;
2009-08-24 17:24:02 +02:00
} catch ( final Exception e ) {
2009-11-05 21:28:37 +01:00
Log . logException ( e ) ;
2009-08-24 17:24:02 +02:00
}
2009-09-04 01:09:53 +02:00
oneFeederTerminated ( ) ;
2009-08-24 17:24:02 +02:00
}
    /**
     * Feeds one reference container into the ranking. The entries are normalized,
     * filtered against all query constraints (flags, content domain, TLD zone,
     * site hash) and the survivors are pushed into the bounded ranking queue.
     * We collect the urlhashes and construct a list with urlEntry objects.
     * Attention: if minEntries is too high, this method will not terminate within the maxTime.
     * @param index the joined container of word references; must not be null
     * @param local true if the container comes from the local index, false for a remote peer
     * @param resourceName label used for profiling events
     * @param fullResource total resource size reported by the remote peer (ignored when local)
     */
    public void add(final ReferenceContainer<WordReference> index, final boolean local, String resourceName, final int fullResource) {
        assert (index != null);
        if (index.isEmpty()) return;
        
        // update resource statistics
        if (local) {
            this.local_resourceSize += index.size();
        } else {
            this.remote_resourceSize += fullResource;
            this.remote_peerCount++;
        }
        
        long timer = System.currentTimeMillis();
        
        // normalize entries: decoding happens concurrently, results arrive on a queue
        final BlockingQueue<WordReferenceVars> decodedEntries = this.order.normalizeWith(index);
        EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.searchEvent(query.id(true), SearchEvent.Type.NORMALIZING, resourceName, index.size(), System.currentTimeMillis() - timer), false);
        
        // iterate over normalized entries and select some that are better than currently stored
        timer = System.currentTimeMillis();
        String domhash;
        boolean nav_hosts = this.query.navigators.equals("all") || this.query.navigators.indexOf("hosts") >= 0;
        final ArrayList<WordReferenceVars> filteredEntries = new ArrayList<WordReferenceVars>();
        
        // apply all constraints
        try {
            WordReferenceVars iEntry;
            while (true) {
                // poll with timeout so a stalled normalizer cannot block forever
                iEntry = decodedEntries.poll(1, TimeUnit.SECONDS);
                if (iEntry == null || iEntry == WordReferenceVars.poison) break;
                assert (iEntry.metadataHash().length == index.row().primaryKeyLength);
                //if (iEntry.urlHash().length() != index.row().primaryKeyLength) continue;
                
                // increase flag counts
                for (int j = 0; j < 32; j++) {
                    if (iEntry.flags().get(j)) { flagcount[j]++; }
                }
                // check constraints
                if (!testFlags(iEntry)) continue;
                // check document domain: non-text queries require the matching media flag
                if (query.contentdom != ContentDomain.TEXT) {
                    if ((query.contentdom == ContentDomain.AUDIO) && (!(iEntry.flags().get(Condenser.flag_cat_hasaudio)))) continue;
                    if ((query.contentdom == ContentDomain.VIDEO) && (!(iEntry.flags().get(Condenser.flag_cat_hasvideo)))) continue;
                    if ((query.contentdom == ContentDomain.IMAGE) && (!(iEntry.flags().get(Condenser.flag_cat_hasimage)))) continue;
                    if ((query.contentdom == ContentDomain.APP) && (!(iEntry.flags().get(Condenser.flag_cat_hasapp)))) continue;
                }
                
                // check tld domain
                if (!DigestURI.matchesAnyDomDomain(iEntry.metadataHash(), this.query.zonecode)) {
                    // filter out all tld that do not match with wanted tld domain
                    continue;
                }
                // check site constraints
                // NOTE(review): bytes 6..11 of the url hash are treated as the domain part; confirm against DigestURI hash layout
                if (query.sitehash != null && !new String(iEntry.metadataHash(), 6, 6).equals(query.sitehash)) {
                    // filter out all domains that do not match with the site constraint
                    continue;
                }
                // count domZones
                //this.domZones[DigestURI.domDomain(iEntry.metadataHash())]++;
                
                // get statistics for host navigator
                if (nav_hosts && query.urlMask_isCatchall) {
                    String uhb = new String(iEntry.urlHash);
                    domhash = uhb.substring(6);
                    this.hostNavigator.inc(domhash, uhb);
                }
                // accept
                filteredEntries.add(iEntry);
                // increase counter for statistics
                if (local) this.local_indexCount++; else this.remote_indexCount++;
            }
            
            // do the ranking
            for (WordReferenceVars fEntry : filteredEntries) {
                // insert with double-check: only hashes not seen before enter the stack
                try {
                    if (!urlhashes.put(fEntry.metadataHash())) {
                        stack.put(new ReverseElement<WordReferenceVars>(fEntry, this.order.cardinal(fEntry))); // inserts the element and removes the worst (which is smallest)
                    }
                } catch (RowSpaceExceededException e) {
                    Log.logException(e);
                }
            }
            
        } catch (InterruptedException e) { } // NOTE(review): interrupt is swallowed and the flag not restored — verify callers never rely on it
        //if ((query.neededResults() > 0) && (container.size() > query.neededResults())) remove(true, true);
        EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.searchEvent(query.id(true), SearchEvent.Type.PRESORT, resourceName, index.size(), System.currentTimeMillis() - timer), false);
    }
2009-05-26 00:27:34 +02:00
2009-09-04 01:09:53 +02:00
/ * *
* method to signal the incoming stack that one feeder has terminated
* /
public void oneFeederTerminated ( ) {
this . feeders - - ;
assert this . feeders > = 0 : " feeders = " + this . feeders ;
}
2010-04-19 18:42:37 +02:00
protected void moreFeeders ( final int countMoreFeeders ) {
2009-09-04 01:09:53 +02:00
this . feeders + = countMoreFeeders ;
}
2010-04-19 18:42:37 +02:00
private boolean feedingIsFinished ( ) {
2009-09-04 01:09:53 +02:00
return this . feeders = = 0 ;
}
2009-04-07 11:34:41 +02:00
private boolean testFlags ( final WordReference ientry ) {
2007-11-22 00:14:57 +01:00
if ( query . constraint = = null ) return true ;
// test if ientry matches with filter
// if all = true: let only entries pass that has all matching bits
// if all = false: let all entries pass that has at least one matching bit
if ( query . allofconstraint ) {
for ( int i = 0 ; i < 32 ; i + + ) {
if ( ( query . constraint . get ( i ) ) & & ( ! ientry . flags ( ) . get ( i ) ) ) return false ;
}
return true ;
}
for ( int i = 0 ; i < 32 ; i + + ) {
if ( ( query . constraint . get ( i ) ) & & ( ientry . flags ( ) . get ( i ) ) ) return true ;
}
return false ;
2007-11-07 23:38:09 +01:00
}
    /**
     * Direct access to the result maps, needed for abstract generation.
     * This is only available if execQuery() was called before; otherwise null.
     * @return the inclusion map of the local term search, or null if no search ran yet
     */
    protected Map<byte[], ReferenceContainer<WordReference>> searchContainerMap() {
        return localSearchInclusion;
    }
2010-09-10 00:42:54 +02:00
private ReverseElement < WordReferenceVars > takeRWI ( final boolean skipDoubleDom , long timeout ) {
2008-04-24 17:09:06 +02:00
// returns from the current RWI list the best entry and removes this entry from the list
2010-09-09 17:30:25 +02:00
WeakPriorityBlockingQueue < ReverseElement < WordReferenceVars > > m ;
ReverseElement < WordReferenceVars > rwi ;
2010-09-10 00:42:54 +02:00
try {
while ( ( rwi = stack . poll ( timeout ) ) ! = null ) {
if ( ! skipDoubleDom ) return rwi ;
// check doubledom
2010-09-14 16:32:24 +02:00
final String domhash = new String ( rwi . getElement ( ) . metadataHash ( ) , 6 , 6 ) ;
2010-09-10 00:42:54 +02:00
m = this . doubleDomCache . get ( domhash ) ;
if ( m = = null ) {
// first appearance of dom
m = new WeakPriorityBlockingQueue < ReverseElement < WordReferenceVars > > ( ( query . specialRights ) ? maxDoubleDomSpecial : maxDoubleDomAll ) ;
this . doubleDomCache . put ( domhash , m ) ;
return rwi ;
}
// second appearances of dom
m . put ( rwi ) ;
2007-11-22 00:14:57 +01:00
}
2010-09-10 00:42:54 +02:00
} catch ( InterruptedException e1 ) {
2007-11-22 00:14:57 +01:00
}
2010-09-10 00:42:54 +02:00
2007-11-22 00:14:57 +01:00
// no more entries in sorted RWI entries. Now take Elements from the doubleDomCache
// find best entry from all caches
2010-09-09 17:30:25 +02:00
ReverseElement < WordReferenceVars > bestEntry = null ;
ReverseElement < WordReferenceVars > o ;
2009-11-12 00:31:12 +01:00
synchronized ( this . doubleDomCache ) {
2010-09-09 17:30:25 +02:00
final Iterator < WeakPriorityBlockingQueue < ReverseElement < WordReferenceVars > > > i = this . doubleDomCache . values ( ) . iterator ( ) ;
2009-11-12 00:31:12 +01:00
while ( i . hasNext ( ) ) {
try {
m = i . next ( ) ;
} catch ( ConcurrentModificationException e ) {
Log . logException ( e ) ;
break ; // not the best solution...
}
if ( m = = null ) continue ;
2009-12-02 01:37:59 +01:00
if ( m . isEmpty ( ) ) continue ;
2009-11-12 00:31:12 +01:00
if ( bestEntry = = null ) {
2010-09-09 17:30:25 +02:00
bestEntry = m . peek ( ) ;
2009-11-12 00:31:12 +01:00
continue ;
}
2010-09-09 17:30:25 +02:00
o = m . peek ( ) ;
2010-09-10 00:42:54 +02:00
if ( o = = null ) continue ;
2010-09-09 17:30:25 +02:00
if ( o . getWeight ( ) < bestEntry . getWeight ( ) ) {
2009-11-12 00:31:12 +01:00
bestEntry = o ;
}
2007-11-22 00:14:57 +01:00
}
}
2008-02-21 11:06:57 +01:00
if ( bestEntry = = null ) return null ;
2010-09-10 00:42:54 +02:00
2007-11-22 00:14:57 +01:00
// finally remove the best entry from the doubledom cache
2010-09-09 17:30:25 +02:00
m = this . doubleDomCache . get ( new String ( bestEntry . getElement ( ) . metadataHash ( ) ) . substring ( 6 ) ) ;
o = m . poll ( ) ;
2009-06-19 01:24:23 +02:00
//assert o == null || o.element.metadataHash().equals(bestEntry.element.metadataHash()) : "bestEntry.element.metadataHash() = " + bestEntry.element.metadataHash() + ", o.element.metadataHash() = " + o.element.metadataHash();
2008-02-21 11:06:57 +01:00
return bestEntry ;
2007-11-22 00:14:57 +01:00
}
    /**
     * get one metadata entry from the ranked results. This will be the 'best' entry so far
     * according to the applied ranking. If there are no more entries left or the timeout
     * limit is reached then null is returned. The caller may distinguish the timeout case
     * from the case where there will be no more also in the future by calling this.feedingIsFinished()
     * @param skipDoubleDom should be true if it is wanted that double domain entries are skipped
     * @param timeout the time this method may take for a result computation
     * @return a metadata entry for a url
     */
    public URIMetadataRow takeURL(final boolean skipDoubleDom, final int timeout) {
        // returns from the current RWI list the best URL entry and removes this entry from the list
        long timeLimit = System.currentTimeMillis() + timeout;
        int p = -1;
        byte[] urlhash;
        long timeleft;
        while ((timeleft = timeLimit - System.currentTimeMillis()) > 0) {
            final ReverseElement<WordReferenceVars> obrwi = takeRWI(skipDoubleDom, timeleft);
            if (obrwi == null) {
                // nothing available yet; if all feeders are done there will never be more
                if (this.feedingIsFinished()) return null;
                try { Thread.sleep(50); } catch (final InterruptedException e1) { }
                continue;
            }
            urlhash = obrwi.getElement().metadataHash();
            final URIMetadataRow page = this.query.getSegment().urlMetadata().load(urlhash, obrwi.getElement(), obrwi.getWeight());
            if (page == null) {
                // remember hashes that have no metadata so callers can report misses
                try {
                    misses.put(obrwi.getElement().metadataHash());
                } catch (RowSpaceExceededException e) {
                    Log.logException(e);
                }
                continue;
            }
            // prepare values for constraint check
            final URIMetadataRow.Components metadata = page.metadata();
            
            // check errors
            if (metadata == null) {
                continue; // rare case where the url is corrupted
            }
            
            if (!query.urlMask_isCatchall) {
                // check url mask
                if (!metadata.matches(query.urlMask)) {
                    continue;
                }
                // in case that we do not have e catchall filter for urls
                // we must also construct the domain navigator here
                // NOTE(review): new String(byte[]) uses the platform charset; hash bytes are presumably ASCII-safe — confirm
                this.hostNavigator.inc(new String(urlhash, 6, 6), new String(urlhash));
            }
            
            // check for more errors
            if (metadata.url() == null) {
                continue; // rare case where the url is corrupted
            }
            final String pageurl = metadata.url().toNormalform(true, true);
            final String pageauthor = metadata.dc_creator();
            final String pagetitle = metadata.dc_title().toLowerCase();
            
            // check exclusion: drop pages whose title, url or author matches an excluded term
            if ((QueryParams.anymatch(pagetitle, query.excludeHashes)) ||
                (QueryParams.anymatch(pageurl.toLowerCase(), query.excludeHashes)) ||
                (QueryParams.anymatch(pageauthor.toLowerCase(), query.excludeHashes))) {
                continue;
            }
            // check index-of constraint; entries failing it are also removed from the term index
            if ((query.constraint != null) &&
                (query.constraint.get(Condenser.flag_cat_indexof)) &&
                (!(pagetitle.startsWith("index of")))) {
                final Iterator<byte[]> wi = query.queryHashes.iterator();
                while (wi.hasNext()) try { this.query.getSegment().termIndex().remove(wi.next(), page.hash()); } catch (IOException e) { }
                continue;
            }
            // check content domain: media queries require at least one matching media link
            if ((query.contentdom == ContentDomain.AUDIO && page.laudio() == 0) ||
                (query.contentdom == ContentDomain.VIDEO && page.lvideo() == 0) ||
                (query.contentdom == ContentDomain.IMAGE && page.limage() == 0) ||
                (query.contentdom == ContentDomain.APP && page.lapp() == 0)) {
                continue;
            }
            // evaluate information of metadata for navigation
            // author navigation:
            if (pageauthor != null && pageauthor.length() > 0) {
                // add author to the author navigator
                String authorhash = new String(Word.word2hash(pageauthor));
                //System.out.println("*** DEBUG authorhash = " + authorhash + ", query.authorhash = " + this.query.authorhash + ", author = " + author);
                
                // check if we already are filtering for authors
                if (this.query.authorhash != null && !this.query.authorhash.equals(authorhash)) {
                    continue;
                }
                // add author to the author navigator
                this.authorNavigator.inc(authorhash, pageauthor);
            } else if (this.query.authorhash != null) {
                // an author filter is active but this page has no author
                continue;
            }
            
            // namespace navigation: a path like "/Wiki:Page" yields the namespace "Wiki"
            String pagepath = metadata.url().getPath();
            if ((p = pagepath.indexOf(':')) >= 0) {
                pagepath = pagepath.substring(0, p);
                p = pagepath.lastIndexOf('/');
                if (p >= 0) {
                    pagepath = pagepath.substring(p + 1);
                    this.namespaceNavigator.inc(pagepath, pagepath);
                }
            }
            
            // accept url
            /*
            try {
                this.handover.put(page.hash()); // remember that we handed over this url
            } catch (RowSpaceExceededException e) {
                Log.logException(e);
            }
            */
            return page;
        }
        return null;
    }
2010-04-19 18:42:37 +02:00
protected int size ( ) {
2010-09-09 17:30:25 +02:00
int c = stack . sizeAvailable ( ) ;
for ( WeakPriorityBlockingQueue < ReverseElement < WordReferenceVars > > s : this . doubleDomCache . values ( ) ) {
c + = s . sizeAvailable ( ) ;
2009-08-27 17:19:48 +02:00
}
2007-11-22 00:14:57 +01:00
return c ;
2007-11-07 23:38:09 +01:00
}
2009-12-02 01:37:59 +01:00
public boolean isEmpty ( ) {
if ( ! stack . isEmpty ( ) ) return false ;
2010-09-09 17:30:25 +02:00
for ( WeakPriorityBlockingQueue < ReverseElement < WordReferenceVars > > s : this . doubleDomCache . values ( ) ) {
2009-12-02 01:37:59 +01:00
if ( ! s . isEmpty ( ) ) return false ;
}
return true ;
}
    /**
     * @return the per-bit counters of reference flags seen during filtering
     * NOTE(review): exposes the internal array without a copy — callers could mutate it
     */
    public int[] flagCount() {
        return flagcount;
    }
    
    // "results from a total number of <remote_resourceSize + local_resourceSize> known (<local_resourceSize> local, <remote_resourceSize> remote), <remote_indexCount> links from <remote_peerCount> other YaCy peers."
    
    /** @return the number of index entries that are considered as result set */
    public int filteredCount() {
        return this.stack.sizeAvailable();
    }
    
    /** @return the number of results in the local peer after filtering */
    public int getLocalIndexCount() {
        return this.local_indexCount;
    }
    /** @return the number of hits in the local peer (index size, size of the collection in the own index) */
    public int getLocalResourceSize() {
        return this.local_resourceSize;
    }
    /** @return the number of result contributions from all the remote peers */
    public int getRemoteIndexCount() {
        return this.remote_indexCount;
    }
    /** @return the number of all hits in all the remote peers */
    public int getRemoteResourceSize() {
        return this.remote_resourceSize;
    }
    /** @return the number of remote peers that have contributed */
    public int getRemotePeerCount() {
        return this.remote_peerCount;
    }
    
    /** @return iterator over url hashes that could not be found in the LURL-DB */
    public Iterator<byte[]> miss() {
        return this.misses.iterator();
    }
2010-03-05 22:25:49 +01:00
public ArrayList < Navigator . Item > getNamespaceNavigator ( int count ) {
if ( ! this . query . navigators . equals ( " all " ) & & this . query . navigators . indexOf ( " namespace " ) < 0 ) return new ArrayList < Navigator . Item > ( 0 ) ;
Navigator . Item [ ] hsa = this . namespaceNavigator . entries ( ) ;
int rc = Math . min ( count , hsa . length ) ;
ArrayList < Navigator . Item > result = new ArrayList < Navigator . Item > ( ) ;
for ( int i = 0 ; i < rc ; i + + ) result . add ( hsa [ i ] ) ;
2010-03-23 14:41:41 +01:00
if ( result . size ( ) < 2 ) result . clear ( ) ; // navigators with one entry are not useful
2010-03-05 22:25:49 +01:00
return result ;
2009-06-09 00:01:26 +02:00
}
    /**
     * Builds the host facet list: resolves each counted host hash to its metadata,
     * extracts the hostname, applies the tenant filter and deduplicates hostnames.
     * A result with fewer than two entries is cleared (not useful as a navigator).
     * @param count requested number of entries
     *        NOTE(review): count is not used; entries(10) is hard-coded — confirm intent
     * @return list of host navigator items (possibly empty)
     */
    public List<Navigator.Item> getHostNavigator(int count) {
        List<Navigator.Item> result = new ArrayList<Navigator.Item>();
        if (!this.query.navigators.equals("all") && this.query.navigators.indexOf("hosts") < 0) return result;
        
        List<Navigator.Item> hsa = this.hostNavigator.entries(10);
        URIMetadataRow mr;
        DigestURI url;
        String hostname;
        Components metadata;
        loop: for (Navigator.Item item : hsa) {
            // item.name holds a url hash of this host; load its metadata to get the hostname
            mr = this.query.getSegment().urlMetadata().load(item.name.getBytes(), null, 0);
            if (mr == null) continue;
            metadata = mr.metadata();
            if (metadata == null) continue;
            url = metadata.url();
            if (url == null) continue;
            hostname = url.getHost();
            if (hostname == null) continue;
            // tenant filter: hostname or normalized url must contain the tenant string
            if (query.tenant != null && !hostname.contains(query.tenant) && !url.toNormalform(true, true).contains(query.tenant)) continue;
            for (Navigator.Item entry : result) if (entry.name.equals(hostname)) continue loop; // check if one entry already exists
            result.add(new Navigator.Item(hostname, item.count));
        }
        if (result.size() < 2) result.clear(); // navigators with one entry are not useful
        return result;
    }
public static final Comparator < Map . Entry < String , Integer > > mecomp = new Comparator < Map . Entry < String , Integer > > ( ) {
public int compare ( Map . Entry < String , Integer > o1 , Map . Entry < String , Integer > o2 ) {
if ( o1 . getValue ( ) . intValue ( ) < o2 . getValue ( ) . intValue ( ) ) return 1 ;
if ( o2 . getValue ( ) . intValue ( ) < o1 . getValue ( ) . intValue ( ) ) return - 1 ;
return 0 ;
}
} ;
    /** @return the raw topic reference map collected by addTopic()/addTopics() */
    public Map<String, Navigator.Item> getTopics() {
        return this.ref.map();
    }
    
    /**
     * Creates a list of words that had been computed by statistics over all
     * words that appeared in the url or the description of all urls.
     * A result with fewer than two entries is cleared (not useful as a navigator).
     * @param count requested number of entries
     *        NOTE(review): count is not used; entries(10) is hard-coded — confirm intent
     * @return list of topic navigator items (possibly empty)
     */
    public List<Navigator.Item> getTopicNavigator(final int count) {
        if (!this.query.navigators.equals("all") && this.query.navigators.indexOf("topics") < 0) return new ArrayList<Navigator.Item>(0);
        List<Navigator.Item> result = this.ref.entries(10);
        if (result.size() < 2) result.clear(); // navigators with one entry are not useful
        return result;
    }
2009-06-02 17:20:10 +02:00
public void addTopic ( final String [ ] words ) {
2007-11-07 23:38:09 +01:00
String word ;
for ( int i = 0 ; i < words . length ; i + + ) {
word = words [ i ] . toLowerCase ( ) ;
2009-06-04 01:49:06 +02:00
if ( word . length ( ) > 2 & &
" http_html_php_ftp_www_com_org_net_gov_edu_index_home_page_for_usage_the_and_ " . indexOf ( word ) < 0 & &
2010-04-15 15:22:59 +02:00
! query . queryHashes . has ( Word . word2hash ( word ) ) & &
2009-06-04 01:49:06 +02:00
word . matches ( " [a-z]+ " ) & &
2009-07-19 22:37:44 +02:00
! Switchboard . badwords . contains ( word ) & &
! Switchboard . stopwords . contains ( word ) ) {
2010-03-05 22:25:49 +01:00
ref . inc ( word , word ) ;
2009-06-02 17:20:10 +02:00
}
2007-11-07 23:38:09 +01:00
}
}
2009-08-24 17:24:02 +02:00
protected void addTopics ( final ResultEntry resultEntry ) {
2007-11-07 23:38:09 +01:00
// take out relevant information for reference computation
if ( ( resultEntry . url ( ) = = null ) | | ( resultEntry . title ( ) = = null ) ) return ;
2009-06-02 17:20:10 +02:00
//final String[] urlcomps = htmlFilterContentScraper.urlComps(resultEntry.url().toNormalform(true, true)); // word components of the url
2010-05-25 14:54:57 +02:00
final String [ ] descrcomps = MultiProtocolURI . splitpattern . split ( resultEntry . title ( ) . toLowerCase ( ) ) ; // words in the description
2007-11-07 23:38:09 +01:00
// add references
2009-06-02 17:20:10 +02:00
//addTopic(urlcomps);
addTopic ( descrcomps ) ;
2007-11-07 23:38:09 +01:00
}
2010-03-05 22:25:49 +01:00
public List < Navigator . Item > getAuthorNavigator ( final int count ) {
2009-06-09 00:01:26 +02:00
// create a list of words that had been computed by statistics over all
// words that appeared in the url or the description of all urls
2010-03-05 22:25:49 +01:00
if ( ! this . query . navigators . equals ( " all " ) & & this . query . navigators . indexOf ( " authors " ) < 0 ) return new ArrayList < Navigator . Item > ( 0 ) ;
2010-03-23 14:41:41 +01:00
List < Navigator . Item > result = this . authorNavigator . entries ( count ) ;
if ( result . size ( ) < 2 ) result . clear ( ) ; // navigators with one entry are not useful
return result ;
2009-06-09 00:01:26 +02:00
}
    /**
     * Loads the block-rank (YBR) tables from disk into the static ybrTables array.
     * Table i is read from the file "YBR-4-&lt;hex(i)&gt;.idx" inside rankingPath;
     * missing files leave a null slot. If the path does not exist, or any read
     * fails, YBR ranking is disabled entirely by setting ybrTables to null.
     * @param rankingPath directory containing the YBR index files
     * @param count number of tables (ranking classes) to load
     */
    public static void loadYBR(final File rankingPath, final int count) {
        // load ranking tables
        if (rankingPath.exists()) {
            ybrTables = new BinSearch[count];
            String ybrName;
            File f;
            try {
                for (int i = 0; i < count; i++) {
                    ybrName = "YBR-4-" + Digest.encodeHex(i, 2) + ".idx";
                    f = new File(rankingPath, ybrName);
                    if (f.exists()) {
                        ybrTables[i] = new BinSearch(FileUtils.read(f), 6);
                    } else {
                        ybrTables[i] = null;
                    }
                }
            } catch (final IOException e) {
                // a failed read disables YBR ranking completely
                ybrTables = null;
            }
        } else {
            ybrTables = null;
        }
    }
    /** @return true if the YBR tables have been loaded successfully */
    public static boolean canUseYBR() {
        return ybrTables != null;
    }
    /** @return true if YBR ranking is currently switched on */
    public static boolean isUsingYBR() {
        return useYBR;
    }
    /** Switches the use of the YBR block-rank tables on or off. */
    public static void switchYBR(final boolean usage) {
        useYBR = usage;
    }
2010-04-25 23:37:36 +02:00
public static int ybr ( final byte [ ] urlHash ) {
2007-11-07 23:38:09 +01:00
// returns the YBR value in a range of 0..15, where 0 means best ranking and 15 means worst ranking
if ( ybrTables = = null ) return 15 ;
if ( ! ( useYBR ) ) return 15 ;
2010-04-25 23:37:36 +02:00
byte [ ] domhash = new byte [ 6 ] ;
System . arraycopy ( urlHash , 6 , domhash , 0 , 6 ) ;
2008-08-02 14:12:04 +02:00
final int m = Math . min ( maxYBR , ybrTables . length ) ;
2008-04-24 15:31:55 +02:00
for ( int i = 0 ; i < m ; i + + ) {
2010-04-25 23:37:36 +02:00
if ( ( ybrTables [ i ] ! = null ) & & ( ybrTables [ i ] . contains ( domhash ) ) ) {
2007-11-07 23:38:09 +01:00
//System.out.println("YBR FOUND: " + urlHash + " (" + i + ")");
return i ;
}
}
//System.out.println("NOT FOUND: " + urlHash);
return 15 ;
}
2007-11-22 00:14:57 +01:00
2007-11-07 23:38:09 +01:00
}