// plasmaSearchRankingProcess.java
// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 07.11.2007 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.plasma;

import java.io.File;
import java.io.IOException;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.ConcurrentHashMap;

import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.kelondro.index.BinSearch;
import de.anomic.kelondro.order.Digest;
import de.anomic.kelondro.text.Reference;
import de.anomic.kelondro.text.ReferenceContainer;
import de.anomic.kelondro.text.ReferenceOrder;
import de.anomic.kelondro.text.Segment;
import de.anomic.kelondro.text.TermSearch;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
import de.anomic.kelondro.text.referencePrototype.WordReference;
import de.anomic.kelondro.text.referencePrototype.WordReferenceVars;
import de.anomic.kelondro.util.SortStack;
import de.anomic.kelondro.util.FileUtils;
import de.anomic.plasma.parser.Word;
import de.anomic.plasma.parser.Condenser;
import de.anomic.server.serverProfiling;
import de.anomic.yacy.yacyURL;
public final class plasmaSearchRankingProcess {

    public static BinSearch[] ybrTables = null; // block-rank tables
    public static final int maxYBR = 3; // the lower this value, the faster the search
    private static boolean useYBR = true;
    private static final int maxDoubleDomAll = 20, maxDoubleDomSpecial = 10000;

    private final SortStack<WordReferenceVars> stack;
    private final HashMap<String, SortStack<WordReferenceVars>> doubleDomCache; // key = domhash (6 bytes); value = like stack
    private final HashMap<String, String> handover; // key = urlhash, value = urlstring; used for double-check of urls that had been handed over to the search process
    private final plasmaSearchQuery query;
    private final int maxentries;
    private int remote_peerCount, remote_indexCount, remote_resourceSize, local_resourceSize;
    private final ReferenceOrder order;
    private final ConcurrentHashMap<String, Integer> urlhashes; // map for double-check; String/Long relation, addresses ranking number (backreference for deletion)
    private final int[] flagcount; // flag counter
    private final TreeSet<String> misses; // contains url-hashes that could not be found in the LURL-DB
    private final Segment indexSegment;
    private HashMap<byte[], ReferenceContainer<WordReference>> localSearchInclusion;
    private final int[] domZones;
    private final ConcurrentHashMap<String, hoststat> hostNavigator;
    private final ConcurrentHashMap<String, Integer> ref; // reference score computation for the commonSense heuristic

    public plasmaSearchRankingProcess(
            final Segment indexSegment,
            final plasmaSearchQuery query,
            final int maxentries,
            final int concurrency) {
        // we collect the urlhashes and construct a list with urlEntry objects
        // attention: if minEntries is too high, this method will not terminate within the maxTime
        // sortorder: 0 = hash, 1 = url, 2 = ranking
        this.localSearchInclusion = null;
        this.stack = new SortStack<WordReferenceVars>(maxentries);
        this.doubleDomCache = new HashMap<String, SortStack<WordReferenceVars>>();
        this.handover = new HashMap<String, String>();
        this.order = (query == null) ? null : new ReferenceOrder(query.ranking, query.targetlang);
        this.query = query;
        this.maxentries = maxentries;
        this.remote_peerCount = 0;
        this.remote_indexCount = 0;
        this.remote_resourceSize = 0;
        this.local_resourceSize = 0;
        this.urlhashes = new ConcurrentHashMap<String, Integer>(0, 0.75f, concurrency);
        this.misses = new TreeSet<String>();
        this.indexSegment = indexSegment;
        this.flagcount = new int[32];
        for (int i = 0; i < 32; i++) { this.flagcount[i] = 0; }
        this.hostNavigator = new ConcurrentHashMap<String, hoststat>();
        this.ref = new ConcurrentHashMap<String, Integer>();
        this.domZones = new int[8];
        for (int i = 0; i < 8; i++) { this.domZones[i] = 0; }
    }

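    /**
     * Compute the ranking weight of a single normalized word reference,
     * using the ReferenceOrder derived from the query's ranking profile.
     */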
    public long ranking(final WordReferenceVars word) {
        return order.cardinal(word);
    }

    public int[] zones() {
        return this.domZones;
    }

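    /**
     * Run the local query against the term index of the index segment:
     * join the posting lists of all query hashes, remember the per-word
     * inclusion containers for abstract generation and feed the joined
     * result into the ranking stack via insertRanked().
     */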
    public void execQuery() {

        long timer = System.currentTimeMillis();
        final TermSearch<WordReference> search = this.indexSegment.termIndex().query(
                query.queryHashes,
                query.excludeHashes,
                null,
                Segment.wordReferenceFactory,
                query.maxDistance);
        this.localSearchInclusion = search.inclusion();
        final ReferenceContainer<WordReference> index = search.joined();
        serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), plasmaSearchEvent.JOIN, index.size(), System.currentTimeMillis() - timer), false);

        if (index.size() == 0) {
            return;
        }

        insertRanked(index, true, index.size());
    }

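    /**
     * Normalize the references of the given container, compute their ranking
     * weights and push the qualifying entries onto the sort stack. Entries that
     * fail the flag constraints, the content-domain, tld-zone or site filters
     * are skipped; local and remote resource counters are updated according
     * to the 'local' flag.
     */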
    public void insertRanked(final ReferenceContainer<WordReference> index, final boolean local, final int fullResource) {
        // we collect the urlhashes and construct a list with urlEntry objects
        // attention: if minEntries is too high, this method will not terminate within the maxTime
        assert (index != null);
        if (index.size() == 0) return;
        if (local) {
            this.local_resourceSize += fullResource;
        } else {
            this.remote_resourceSize += fullResource;
            this.remote_peerCount++;
        }

        long timer = System.currentTimeMillis();

        // normalize entries
        final ArrayList<WordReferenceVars> decodedEntries = this.order.normalizeWith(index);
        serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), plasmaSearchEvent.NORMALIZING, index.size(), System.currentTimeMillis() - timer), false);

        // iterate over normalized entries and select some that are better than currently stored
        timer = System.currentTimeMillis();
        final Iterator<WordReferenceVars> i = decodedEntries.iterator();
        WordReferenceVars iEntry;
        Long r;
        hoststat hs;
        String domhash;
        while (i.hasNext()) {
            iEntry = i.next();
            assert (iEntry.metadataHash().length() == index.row().primaryKeyLength);
            //if (iEntry.urlHash().length() != index.row().primaryKeyLength) continue;

            // increase flag counts
            for (int j = 0; j < 32; j++) {
                if (iEntry.flags().get(j)) { flagcount[j]++; }
            }

            // kick out entries that are too bad according to current findings
            r = Long.valueOf(order.cardinal(iEntry));
            if ((maxentries >= 0) && (stack.size() >= maxentries) && (stack.bottom(r.longValue()))) continue;

            // check constraints
            if (!testFlags(iEntry)) continue;

            // check document domain
            if (query.contentdom != plasmaSearchQuery.CONTENTDOM_TEXT) {
                if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_AUDIO) && (!(iEntry.flags().get(Condenser.flag_cat_hasaudio)))) continue;
                if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) && (!(iEntry.flags().get(Condenser.flag_cat_hasvideo)))) continue;
                if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) && (!(iEntry.flags().get(Condenser.flag_cat_hasimage)))) continue;
                if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_APP) && (!(iEntry.flags().get(Condenser.flag_cat_hasapp)))) continue;
            }

            // check tld domain
            if (!yacyURL.matchesAnyDomDomain(iEntry.metadataHash(), this.query.zonecode)) {
                // filter out all tld that do not match with wanted tld domain
                continue;
            }

            // check site constraints
            if (query.sitehash != null && !iEntry.metadataHash().substring(6).equals(query.sitehash)) {
                // filter out all domains that do not match with the site constraint
                continue;
            }

            // count domZones
            this.domZones[yacyURL.domDomain(iEntry.metadataHash())]++;

            // get statistics for host navigator
            domhash = iEntry.urlHash.substring(6);
            hs = this.hostNavigator.get(domhash);
            if (hs == null) {
                this.hostNavigator.put(domhash, new hoststat(iEntry.urlHash));
            } else {
                hs.inc();
            }

            // insert
            if ((maxentries < 0) || (stack.size() < maxentries)) {
                // in case that we don't have enough yet, accept any new entry
                if (urlhashes.containsKey(iEntry.metadataHash())) continue;
                stack.push(iEntry, r);
            } else {
                // if we already have enough entries, insert only those that improve the result
                if (stack.bottom(r.longValue())) {
                    continue;
                }
                // double-check
                if (urlhashes.containsKey(iEntry.metadataHash())) continue;
                stack.push(iEntry, r);
            }

            // increase counter for statistics
            if (!local) this.remote_indexCount++;
        }

        //if ((query.neededResults() > 0) && (container.size() > query.neededResults())) remove(true, true);
        serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), plasmaSearchEvent.PRESORT, index.size(), System.currentTimeMillis() - timer), false);
    }

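    /**
     * Check whether a reference entry matches the query constraint flags:
     * if all bits are required, every constraint bit must be set on the entry;
     * otherwise a single matching bit is sufficient.
     */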
    private boolean testFlags(final WordReference ientry) {
        if (query.constraint == null) return true;
        // test if ientry matches with filter
        // if all = true: let only entries pass that have all matching bits
        // if all = false: let all entries pass that have at least one matching bit
        if (query.allofconstraint) {
            for (int i = 0; i < 32; i++) {
                if ((query.constraint.get(i)) && (!ientry.flags().get(i))) return false;
            }
            return true;
        }
        for (int i = 0; i < 32; i++) {
            if ((query.constraint.get(i)) && (ientry.flags().get(i))) return true;
        }
        return false;
    }

    public Map<byte[], ReferenceContainer<WordReference>> searchContainerMap() {
        // direct access to the result maps is needed for abstract generation
        // this is only available if execQuery() was called before
        return localSearchInclusion;
    }

    // todo:
    // - remove redundant urls (sub-path occurred before)
    // - move up shorter urls
    // - root-domain guessing to prefer the root domain over other urls if search word appears in domain name

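    /**
     * Pop the best reference entry from the sort stack. If skipDoubleDom is set,
     * only the first entry of each domain is returned directly; further entries of
     * an already seen domain are parked in the doubleDomCache and only considered
     * once the main stack is exhausted.
     */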
    private SortStack<WordReferenceVars>.stackElement bestRWI(final boolean skipDoubleDom) {
        // returns from the current RWI list the best entry and removes this entry from the list
        SortStack<WordReferenceVars> m;
        SortStack<WordReferenceVars>.stackElement rwi;
        while (stack.size() > 0) {
            rwi = stack.pop();
            if (rwi == null) continue; // in case that a synchronization problem occurred just go lazy over it
            if (!skipDoubleDom) return rwi;
            // check doubledom
            final String domhash = rwi.element.metadataHash().substring(6);
            m = this.doubleDomCache.get(domhash);
            if (m == null) {
                // first appearance of dom
                m = new SortStack<WordReferenceVars>((query.specialRights) ? maxDoubleDomSpecial : maxDoubleDomAll);
                this.doubleDomCache.put(domhash, m);
                return rwi;
            }
            // second or later appearance of dom
            m.push(rwi);
        }
        // no more entries in sorted RWI entries. Now take Elements from the doubleDomCache
        // find best entry from all caches
        final Iterator<SortStack<WordReferenceVars>> i = this.doubleDomCache.values().iterator();
        SortStack<WordReferenceVars>.stackElement bestEntry = null;
        SortStack<WordReferenceVars>.stackElement o;
        while (i.hasNext()) {
            m = i.next();
            if (m == null) continue;
            if (m.size() == 0) continue;
            if (bestEntry == null) {
                bestEntry = m.top();
                continue;
            }
            o = m.top();
            if (o.weight.longValue() < bestEntry.weight.longValue()) {
                bestEntry = o;
            }
        }
        if (bestEntry == null) return null;
        // finally remove the best entry from the doubledom cache
        m = this.doubleDomCache.get(bestEntry.element.metadataHash().substring(6));
        o = m.pop();
        assert o == null || o.element.metadataHash().equals(bestEntry.element.metadataHash()) : "bestEntry.element.metadataHash() = " + bestEntry.element.metadataHash() + ", o.element.metadataHash() = " + o.element.metadataHash();
        return bestEntry;
    }

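    /**
     * Return the metadata row of the best remaining entry and remove it from the
     * ranking queue. Entries whose URL cannot be loaded from the metadata store
     * are recorded in the misses set; URLs that do not match the query's urlMask
     * are skipped.
     */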
    public URLMetadataRow bestURL(final boolean skipDoubleDom) {
        // returns from the current RWI list the best URL entry and removes this entry from the list
        while ((stack.size() > 0) || (size() > 0)) {
            if (((stack.size() == 0) && (size() == 0))) break;
            final SortStack<WordReferenceVars>.stackElement obrwi = bestRWI(skipDoubleDom);
            if (obrwi == null) continue; // *** ? this happened and the thread was suspended silently. cause?
            final URLMetadataRow u = indexSegment.urlMetadata().load(obrwi.element.metadataHash(), obrwi.element, obrwi.weight.longValue());
            if (u != null) {
                final URLMetadataRow.Components metadata = u.metadata();
                if (metadata.url() != null) {
                    String urlstring = metadata.url().toNormalform(true, true);
                    if (urlstring == null || !urlstring.matches(query.urlMask)) continue;
                    this.handover.put(u.hash(), metadata.url().toNormalform(true, false)); // remember that we handed over this url
                    return u;
                }
            }
            misses.add(obrwi.element.metadataHash());
        }
        return null;
    }

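    /**
     * Total number of ranked entries that are still queued, counting the main
     * stack and all per-domain caches.
     */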
    public int size() {
        //assert sortedRWIEntries.size() == urlhashes.size() : "sortedRWIEntries.size() = " + sortedRWIEntries.size() + ", urlhashes.size() = " + urlhashes.size();
        int c = stack.size();
        final Iterator<SortStack<WordReferenceVars>> i = this.doubleDomCache.values().iterator();
        while (i.hasNext()) c += i.next().size();
        return c;
    }

    public int[] flagCount() {
        return flagcount;
    }

    // "results from a total number of <remote_resourceSize + local_resourceSize> known (<local_resourceSize> local, <remote_resourceSize> remote), <remote_indexCount> links from <remote_peerCount> other YaCy peers."

    public int filteredCount() {
        // the number of index entries that are considered as result set
        return this.stack.size();
    }

    public int getRemoteIndexCount() {
        // the number of result contributions from all the remote peers
        return this.remote_indexCount;
    }

    public int getRemotePeerCount() {
        // the number of remote peers that have contributed
        return this.remote_peerCount;
    }

    public int getRemoteResourceSize() {
        // the number of all hits in all the remote peers
        return this.remote_resourceSize;
    }

    public int getLocalResourceSize() {
        // the number of hits in the local peer (index size, size of the collection in the own index)
        return this.local_resourceSize;
    }

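    /**
     * Remove the entry with the given url hash from the ranking stack and the
     * double-check map; returns the removed reference or null if it was not queued.
     */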
    public Reference remove(final String urlHash) {
        final SortStack<WordReferenceVars>.stackElement se = stack.remove(urlHash.hashCode());
        if (se == null) return null;
        urlhashes.remove(urlHash);
        return se.element;
    }

    public Iterator<String> miss() {
        return this.misses.iterator();
    }

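    /**
     * Per-host counter for the host navigator: counts how many hits a host
     * contributed and keeps one url hash of that host as a sample, so the
     * host name can be resolved later from the url metadata.
     */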
    public class hoststat {
        public int count;
        public String hashsample;
        public hoststat(String urlhash) {
            this.count = 1;
            this.hashsample = urlhash;
        }
        public void inc() {
            this.count++;
        }
    }

    public static final Comparator<hoststat> hscomp = new Comparator<hoststat>() {
        public int compare(hoststat o1, hoststat o2) {
            if (o1.count < o2.count) return 1;
            if (o2.count < o1.count) return -1;
            return 0;
        }
    };

    public class NavigatorEntry {
        public int count;
        public String name;
        public NavigatorEntry(String name, int count) {
            this.name = name;
            this.count = count;
        }
    }

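    /**
     * Return up to 'count' navigator entries for the hosts with the most hits,
     * sorted by descending hit count. The host name of each entry is resolved
     * by loading the sample url hash from the url metadata of the index segment.
     */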
    public ArrayList<NavigatorEntry> getHostNavigator(int count) {
        hoststat[] hsa = this.hostNavigator.values().toArray(new hoststat[this.hostNavigator.size()]);
        Arrays.sort(hsa, hscomp);
        int rc = Math.min(count, hsa.length);
        ArrayList<NavigatorEntry> result = new ArrayList<NavigatorEntry>();
        URLMetadataRow mr;
        yacyURL url;
        for (int i = 0; i < rc; i++) {
            mr = indexSegment.urlMetadata().load(hsa[i].hashsample, null, 0);
            if (mr == null) continue;
            url = mr.metadata().url();
            if (url == null) continue;
            result.add(new NavigatorEntry(url.getHost(), hsa[i].count));
        }
        return result;
    }

    public static final Comparator<Map.Entry<String, Integer>> mecomp = new Comparator<Map.Entry<String, Integer>>() {
        public int compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2) {
            if (o1.getValue().intValue() < o2.getValue().intValue()) return 1;
            if (o2.getValue().intValue() < o1.getValue().intValue()) return -1;
            return 0;
        }
    };

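    /**
     * Return up to 'count' topic navigator entries: the words that appeared most
     * often in the titles of the results seen so far, sorted by descending count.
     */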
    @SuppressWarnings("unchecked")
    public ArrayList<NavigatorEntry> getTopicNavigator(final int count) {
        // create a list of words that had been computed by statistics over all
        // words that appeared in the url or the description of all urls
        Map.Entry<String, Integer>[] a = this.ref.entrySet().toArray(new Map.Entry[this.ref.size()]);
        Arrays.sort(a, mecomp);
        int rc = Math.min(count, a.length);
        ArrayList<NavigatorEntry> result = new ArrayList<NavigatorEntry>();
        Map.Entry<String, Integer> e;
        int c;
        for (int i = 0; i < rc; i++) {
            e = a[i];
            c = e.getValue().intValue();
            if (c == 0) break;
            result.add(new NavigatorEntry(e.getKey(), c));
        }
        return result;
    }

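    /**
     * Count the given words for the topic navigator. Very short words, a fixed
     * list of stop words and words that are part of the query itself are ignored.
     */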
    public void addTopic(final String[] words) {
        String word;
        for (int i = 0; i < words.length; i++) {
            word = words[i].toLowerCase();
            Integer c;
            if ((word.length() > 2) &&
                ("http_html_php_ftp_www_com_org_net_gov_edu_index_home_page_for_usage_the_and_".indexOf(word) < 0) &&
                (!(query.queryHashes.contains(Word.word2hash(word))))) {
                c = ref.get(word);
                if (c == null) ref.put(word, 1); else ref.put(word, c.intValue() + 1);
            }
        }
    }

    protected void addTopics(final plasmaSearchEvent.ResultEntry resultEntry) {
        // take out relevant information for reference computation
        if ((resultEntry.url() == null) || (resultEntry.title() == null)) return;
        //final String[] urlcomps = htmlFilterContentScraper.urlComps(resultEntry.url().toNormalform(true, true)); // word components of the url
        final String[] descrcomps = resultEntry.title().toLowerCase().split(htmlFilterContentScraper.splitrex); // words in the description

        // add references
        //addTopic(urlcomps);
        addTopic(descrcomps);
    }

    public ReferenceOrder getOrder() {
        return this.order;
    }

    public static void loadYBR(final File rankingPath, final int count) {
        // load ranking tables
        if (rankingPath.exists()) {
            ybrTables = new BinSearch[count];
            String ybrName;
            File f;
            try {
                for (int i = 0; i < count; i++) {
                    ybrName = "YBR-4-" + Digest.encodeHex(i, 2) + ".idx";
                    f = new File(rankingPath, ybrName);
                    if (f.exists()) {
                        ybrTables[i] = new BinSearch(FileUtils.read(f), 6);
                    } else {
                        ybrTables[i] = null;
                    }
                }
            } catch (final IOException e) {
                ybrTables = null;
            }
        } else {
            ybrTables = null;
        }
    }

    public static boolean canUseYBR() {
        return ybrTables != null;
    }

    public static boolean isUsingYBR() {
        return useYBR;
    }

    public static void switchYBR(final boolean usage) {
        useYBR = usage;
    }

    public static int ybr(final String urlHash) {
        // returns the YBR value in a range of 0..15, where 0 means best ranking and 15 means worst ranking
        if (ybrTables == null) return 15;
        if (!(useYBR)) return 15;
        final String domHash = urlHash.substring(6);
        final int m = Math.min(maxYBR, ybrTables.length);
        for (int i = 0; i < m; i++) {
            if ((ybrTables[i] != null) && (ybrTables[i].contains(domHash.getBytes()))) {
                //System.out.println("YBR FOUND: " + urlHash + " (" + i + ")");
                return i;
            }
        }
        //System.out.println("NOT FOUND: " + urlHash);
        return 15;
    }

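    /**
     * Compute an additional post-ranking bonus for a result entry: the position in
     * the result list, content-domain link counts, matches of the 'prefer' pattern,
     * occurrences of top words in url and title, and query words that appear in
     * url or title all contribute shifted bonus values.
     */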
    public long postRanking(
            final Set<String> topwords,
            final plasmaSearchEvent.ResultEntry rentry,
            final int position) {

        long r = (255 - position) << 8;

        // for media search: prefer pages with many links
        if (query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) r += rentry.limage() << query.ranking.coeff_cathasimage;
        if (query.contentdom == plasmaSearchQuery.CONTENTDOM_AUDIO) r += rentry.laudio() << query.ranking.coeff_cathasaudio;
        if (query.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) r += rentry.lvideo() << query.ranking.coeff_cathasvideo;
        if (query.contentdom == plasmaSearchQuery.CONTENTDOM_APP) r += rentry.lapp() << query.ranking.coeff_cathasapp;

        // prefer hit with 'prefer' pattern
        if (rentry.url().toNormalform(true, true).matches(query.prefer)) r += 256 << query.ranking.coeff_prefer;
        if (rentry.title().matches(query.prefer)) r += 256 << query.ranking.coeff_prefer;

        // apply 'common-sense' heuristic using references
        final String urlstring = rentry.url().toNormalform(true, true);
        final String[] urlcomps = htmlFilterContentScraper.urlComps(urlstring);
        final String[] descrcomps = rentry.title().toLowerCase().split(htmlFilterContentScraper.splitrex);
        for (int j = 0; j < urlcomps.length; j++) {
            if (topwords.contains(urlcomps[j])) r += Math.max(1, 256 - urlstring.length()) << query.ranking.coeff_urlcompintoplist;
        }
        for (int j = 0; j < descrcomps.length; j++) {
            if (topwords.contains(descrcomps[j])) r += Math.max(1, 256 - rentry.title().length()) << query.ranking.coeff_descrcompintoplist;
        }

        // apply query-in-result matching
        final Set<byte[]> urlcomph = Word.words2hashSet(urlcomps);
        final Set<byte[]> descrcomph = Word.words2hashSet(descrcomps);
        final Iterator<byte[]> shi = query.queryHashes.iterator();
        byte[] queryhash;
        while (shi.hasNext()) {
            queryhash = shi.next();
            if (urlcomph.contains(queryhash)) r += 256 << query.ranking.coeff_appurl;
            if (descrcomph.contains(queryhash)) r += 256 << query.ranking.coeff_app_dc_title;
        }

        return r;
    }

}