2010-03-05 22:25:49 +01:00
// RankingProcess.java
2007-11-07 23:38:09 +01:00
// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 07.11.2007 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
2009-09-05 22:41:21 +02:00
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
2007-11-07 23:38:09 +01:00
//
// LICENSE
2011-06-22 01:10:50 +02:00
//
2007-11-07 23:38:09 +01:00
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
2011-09-26 23:42:28 +02:00
package net.yacy.search.query ;
2007-11-07 23:38:09 +01:00
2009-05-27 00:30:20 +02:00
import java.util.Comparator ;
2009-11-12 00:31:12 +01:00
import java.util.ConcurrentModificationException ;
2010-10-16 01:45:12 +02:00
import java.util.HashMap ;
2007-11-07 23:38:09 +01:00
import java.util.Iterator ;
import java.util.Map ;
2010-11-28 03:57:31 +01:00
import java.util.SortedMap ;
2009-08-28 00:54:32 +02:00
import java.util.concurrent.BlockingQueue ;
2008-02-27 16:16:47 +01:00
import java.util.concurrent.ConcurrentHashMap ;
2009-09-24 21:14:35 +02:00
import java.util.concurrent.TimeUnit ;
2011-11-18 14:09:07 +01:00
import java.util.concurrent.atomic.AtomicInteger ;
2011-06-22 01:10:50 +02:00
import java.util.regex.Pattern ;
2007-11-07 23:38:09 +01:00
2011-05-27 10:24:54 +02:00
import net.yacy.cora.document.ASCII ;
2010-05-25 14:54:57 +02:00
import net.yacy.cora.document.MultiProtocolURI ;
2010-12-17 18:30:09 +01:00
import net.yacy.cora.protocol.Scanner ;
2011-12-16 23:59:29 +01:00
import net.yacy.cora.sorting.ClusteredScoreMap ;
import net.yacy.cora.sorting.ConcurrentScoreMap ;
import net.yacy.cora.sorting.ScoreMap ;
import net.yacy.cora.sorting.WeakPriorityBlockingQueue ;
import net.yacy.cora.sorting.WeakPriorityBlockingQueue.ReverseElement ;
2009-10-18 02:53:43 +02:00
import net.yacy.document.Condenser ;
2011-06-23 13:57:17 +02:00
import net.yacy.kelondro.data.meta.DigestURI ;
2009-10-11 02:12:19 +02:00
import net.yacy.kelondro.data.meta.URIMetadataRow ;
import net.yacy.kelondro.data.word.Word ;
import net.yacy.kelondro.data.word.WordReference ;
import net.yacy.kelondro.data.word.WordReferenceVars ;
2011-09-16 12:00:51 +02:00
import net.yacy.kelondro.index.HandleSet ;
import net.yacy.kelondro.index.RowSpaceExceededException ;
2009-11-05 21:28:37 +01:00
import net.yacy.kelondro.logging.Log ;
2009-10-10 02:39:15 +02:00
import net.yacy.kelondro.rwi.ReferenceContainer ;
import net.yacy.kelondro.rwi.TermSearch ;
2009-12-08 15:25:51 +01:00
import net.yacy.kelondro.util.EventTracker ;
2011-09-25 18:59:06 +02:00
import net.yacy.peers.graphics.ProfilingGraph ;
import net.yacy.search.Switchboard ;
import net.yacy.search.index.Segment ;
2011-09-26 23:42:28 +02:00
import net.yacy.search.ranking.ReferenceOrder ;
2011-09-25 18:59:06 +02:00
import net.yacy.search.snippet.ContentDomain ;
import net.yacy.search.snippet.ResultEntry ;
2007-11-07 23:38:09 +01:00
2011-12-06 02:24:51 +01:00
public final class RWIProcess extends Thread
{
2011-06-22 01:10:50 +02:00
2011-12-06 15:28:48 +01:00
private static final long maxWaitPerResult = 300 ;
2010-10-04 13:54:48 +02:00
private static final int maxDoubleDomAll = 1000 , maxDoubleDomSpecial = 10000 ;
2011-06-22 01:10:50 +02:00
2009-07-09 00:14:57 +02:00
private final QueryParams query ;
2011-09-16 12:00:51 +02:00
private final HandleSet urlhashes ; // map for double-check; String/Long relation, addresses ranking number (backreference for deletion)
2008-08-02 14:12:04 +02:00
private final int [ ] flagcount ; // flag counter
2011-09-16 12:00:51 +02:00
private final HandleSet misses ; // contains url-hashes that could not been found in the LURL-DB
2011-12-06 02:24:51 +01:00
private int sortout ; // counter for referenced that had been sorted out for other reasons
2010-01-11 00:09:48 +01:00
//private final int[] domZones;
2010-11-28 03:57:31 +01:00
private SortedMap < byte [ ] , ReferenceContainer < WordReference > > localSearchInclusion ;
2011-06-22 01:10:50 +02:00
2010-01-13 01:04:37 +01:00
private int remote_resourceSize , remote_indexCount , remote_peerCount ;
2011-09-26 23:42:28 +02:00
private int local_indexCount ;
2011-12-06 15:28:48 +01:00
private final AtomicInteger maxExpectedRemoteReferences , expectedRemoteReferences ,
receivedRemoteReferences ;
2010-10-04 13:54:48 +02:00
private final WeakPriorityBlockingQueue < WordReferenceVars > stack ;
2012-01-09 03:02:35 +01:00
private final AtomicInteger feedersAlive , feedersTerminated ;
2010-10-04 13:54:48 +02:00
private final ConcurrentHashMap < String , WeakPriorityBlockingQueue < WordReferenceVars > > doubleDomCache ; // key = domhash (6 bytes); value = like stack
2010-04-19 18:42:37 +02:00
//private final HandleSet handover; // key = urlhash; used for double-check of urls that had been handed over to search process
2011-06-22 01:10:50 +02:00
2011-12-06 02:24:51 +01:00
private final ScoreMap < String > ref ; // reference score computation for the commonSense heuristic
2011-05-29 22:33:12 +02:00
private final Map < String , byte [ ] > hostResolver ; // a mapping from a host hash (6 bytes) to the full url hash of one of these urls that have the host hash
2009-12-03 13:25:03 +01:00
private final ReferenceOrder order ;
2010-10-04 13:54:48 +02:00
private final long startTime ;
2011-12-06 02:24:51 +01:00
private boolean addRunning ;
2012-01-09 03:02:35 +01:00
private final boolean remote ;
2011-06-22 01:10:50 +02:00
2011-06-23 17:39:52 +02:00
// navigation scores
private final ScoreMap < String > hostNavigator ; // a counter for the appearance of the host hash
private final ScoreMap < String > authorNavigator ; // a counter for the appearances of authors
private final ScoreMap < String > namespaceNavigator ; // a counter for name spaces
private final ScoreMap < String > protocolNavigator ; // a counter for protocol types
private final ScoreMap < String > filetypeNavigator ; // a counter for file types
2012-01-09 03:02:35 +01:00
/**
 * Create a ranking process for one query.
 * Collects url hashes and constructs a ranked list of WordReferenceVars entries
 * in a bounded weak priority queue of at most {@code maxentries} elements.
 *
 * @param query the query parameters to rank against
 * @param order the reference order used to normalize and weigh entries
 * @param maxentries maximum number of entries kept on the ranking stack
 * @param remote true if remote peers are expected to feed this process as well
 */
public RWIProcess(final QueryParams query, final ReferenceOrder order, final int maxentries, final boolean remote) {
    // we collect the urlhashes and construct a list with urlEntry objects
    // attention: if minEntries is too high, this method will not terminate within the maxTime
    // sortorder: 0 = hash, 1 = url, 2 = ranking
    this.addRunning = true;
    this.localSearchInclusion = null;
    // bounded ranking stack: inserting removes the worst element when full
    this.stack = new WeakPriorityBlockingQueue<WordReferenceVars>(maxentries);
    // per-host overflow queues for entries whose host was already returned once
    this.doubleDomCache = new ConcurrentHashMap<String, WeakPriorityBlockingQueue<WordReferenceVars>>();
    this.query = query;
    this.order = order;
    this.remote = remote;
    // statistics counters, updated by add()
    this.remote_peerCount = 0;
    this.remote_resourceSize = 0;
    this.remote_indexCount = 0;
    this.local_indexCount = 0;
    // double-check set of already-ranked url hashes
    this.urlhashes =
        new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 100);
    // url hashes that could not be resolved in the LURL-DB
    this.misses =
        new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 100);
    this.sortout = 0;
    // one counter per flag bit (32 bits)
    this.flagcount = new int[32];
    for (int i = 0; i < 32; i++) {
        this.flagcount[i] = 0;
    }
    // navigator score maps; all concurrent because feeders run in parallel
    this.hostNavigator = new ConcurrentScoreMap<String>();
    this.hostResolver = new ConcurrentHashMap<String, byte[]>();
    this.authorNavigator = new ConcurrentScoreMap<String>();
    this.namespaceNavigator = new ConcurrentScoreMap<String>();
    this.protocolNavigator = new ConcurrentScoreMap<String>();
    this.filetypeNavigator = new ConcurrentScoreMap<String>();
    this.ref = new ConcurrentScoreMap<String>();
    // feeder bookkeeping used by feedingIsFinished()
    this.feedersAlive = new AtomicInteger(0);
    this.feedersTerminated = new AtomicInteger(0);
    this.startTime = System.currentTimeMillis();
    // remote-reference expectation counters driving waitTimeRecommendation()
    this.maxExpectedRemoteReferences = new AtomicInteger(0);
    this.expectedRemoteReferences = new AtomicInteger(0);
    this.receivedRemoteReferences = new AtomicInteger(0);
}
2011-12-06 15:28:48 +01:00
public void addExpectedRemoteReferences ( int x ) {
if ( x > 0 ) {
this . maxExpectedRemoteReferences . addAndGet ( x ) ;
}
this . expectedRemoteReferences . addAndGet ( x ) ;
2011-12-06 02:24:51 +01:00
}
2011-12-06 15:28:48 +01:00
public boolean expectMoreRemoteReferences ( ) {
return this . expectedRemoteReferences . get ( ) > 0 ;
2007-11-22 00:14:57 +01:00
}
2012-01-09 03:02:35 +01:00
2011-12-06 15:28:48 +01:00
public long waitTimeRecommendation ( ) {
return
this . maxExpectedRemoteReferences . get ( ) = = 0 ? 0 :
Math . min ( maxWaitPerResult ,
Math . min (
maxWaitPerResult * this . expectedRemoteReferences . get ( ) / this . maxExpectedRemoteReferences . get ( ) ,
maxWaitPerResult * ( 100 - Math . min ( 100 , this . receivedRemoteReferences . get ( ) ) ) / 100 ) ) ;
}
2012-01-09 03:02:35 +01:00
2009-11-24 12:13:11 +01:00
/**
 * @return the query parameters this ranking process was created for
 */
public QueryParams getQuery() {
    return this.query;
}
2011-06-22 01:10:50 +02:00
2009-12-03 13:25:03 +01:00
/**
 * @return the reference order used to normalize and weigh entries
 */
public ReferenceOrder getOrder() {
    return this.order;
}
2011-06-22 01:10:50 +02:00
2010-11-28 03:57:31 +01:00
@Override
2009-08-24 17:24:02 +02:00
public void run ( ) {
2009-08-27 17:19:48 +02:00
// do a search
2011-12-21 00:32:03 +01:00
oneFeederStarted ( ) ;
2012-01-09 03:02:35 +01:00
2009-08-24 17:24:02 +02:00
// sort the local containers and truncate it to a limited count,
// so following sortings together with the global results will be fast
try {
2011-06-22 01:10:50 +02:00
final long timer = System . currentTimeMillis ( ) ;
2011-12-06 02:24:51 +01:00
final TermSearch < WordReference > search =
this . query
. getSegment ( )
. termIndex ( )
. query (
this . query . queryHashes ,
this . query . excludeHashes ,
null ,
Segment . wordReferenceFactory ,
this . query . maxDistance ) ;
2009-08-27 17:19:48 +02:00
this . localSearchInclusion = search . inclusion ( ) ;
final ReferenceContainer < WordReference > index = search . joined ( ) ;
2011-12-06 02:24:51 +01:00
EventTracker . update (
EventTracker . EClass . SEARCH ,
new ProfilingGraph . EventSearch (
this . query . id ( true ) ,
SearchEvent . Type . JOIN ,
this . query . queryString ,
index . size ( ) ,
System . currentTimeMillis ( ) - timer ) ,
false ) ;
if ( ! index . isEmpty ( ) ) {
2011-11-24 02:30:12 +01:00
add ( index , true , " local index: " + this . query . getSegment ( ) . getLocation ( ) , - 1 , true ) ;
2009-08-27 17:19:48 +02:00
}
2011-12-06 02:24:51 +01:00
} catch ( final Exception e ) {
2009-11-05 21:28:37 +01:00
Log . logException ( e ) ;
2010-10-04 13:54:48 +02:00
} finally {
oneFeederTerminated ( ) ;
2009-08-24 17:24:02 +02:00
}
}
2011-06-22 01:10:50 +02:00
2011-03-21 14:05:51 +01:00
public void add (
2011-12-06 02:24:51 +01:00
final ReferenceContainer < WordReference > index ,
final boolean local ,
final String resourceName ,
final int fullResource ,
final boolean finalizeAddAtEnd ) {
2007-11-07 23:38:09 +01:00
// we collect the urlhashes and construct a list with urlEntry objects
// attention: if minEntries is too high, this method will not terminate within the maxTime
2011-11-24 00:39:34 +01:00
//Log.logInfo("RWIProcess", "added a container, size = " + index.size());
2007-11-07 23:38:09 +01:00
2011-03-21 14:05:51 +01:00
this . addRunning = true ;
2011-06-22 01:10:50 +02:00
2008-01-29 00:41:39 +01:00
assert ( index ! = null ) ;
2011-12-06 02:24:51 +01:00
if ( index . isEmpty ( ) ) {
return ;
}
2011-06-22 01:10:50 +02:00
2011-12-06 02:24:51 +01:00
if ( ! local ) {
2011-05-18 16:26:28 +02:00
assert fullResource > = 0 : " fullResource = " + fullResource ;
2008-01-30 22:58:30 +01:00
this . remote_resourceSize + = fullResource ;
this . remote_peerCount + + ;
}
2011-06-22 01:10:50 +02:00
2007-12-04 21:19:13 +01:00
long timer = System . currentTimeMillis ( ) ;
2011-06-22 01:10:50 +02:00
2008-02-21 11:06:57 +01:00
// normalize entries
2009-12-03 13:25:03 +01:00
final BlockingQueue < WordReferenceVars > decodedEntries = this . order . normalizeWith ( index ) ;
2011-12-06 15:28:48 +01:00
int is = index . size ( ) ;
2011-12-06 02:24:51 +01:00
EventTracker . update ( EventTracker . EClass . SEARCH , new ProfilingGraph . EventSearch (
this . query . id ( true ) ,
SearchEvent . Type . NORMALIZING ,
resourceName ,
2011-12-06 15:28:48 +01:00
is ,
2011-12-06 02:24:51 +01:00
System . currentTimeMillis ( ) - timer ) , false ) ;
2012-01-09 03:02:35 +01:00
if ( ! local ) {
this . receivedRemoteReferences . addAndGet ( is ) ;
}
2011-06-22 01:10:50 +02:00
2008-02-21 11:06:57 +01:00
// iterate over normalized entries and select some that are better than currently stored
2010-03-05 22:25:49 +01:00
timer = System . currentTimeMillis ( ) ;
2011-12-06 02:24:51 +01:00
final boolean nav_hosts =
this . query . navigators . equals ( " all " ) | | this . query . navigators . indexOf ( " hosts " , 0 ) > = 0 ;
2009-08-28 01:20:59 +02:00
// apply all constraints
2009-08-28 00:54:32 +02:00
try {
2010-01-12 21:56:37 +01:00
WordReferenceVars iEntry ;
2011-06-23 13:57:17 +02:00
final String pattern = this . query . urlMask . pattern ( ) ;
final boolean httpPattern = pattern . equals ( " http://.* " ) ;
2011-12-06 02:24:51 +01:00
final boolean noHttpButProtocolPattern =
pattern . equals ( " https://.* " )
| | pattern . equals ( " ftp://.* " )
| | pattern . equals ( " smb://.* " )
| | pattern . equals ( " file://.* " ) ;
pollloop : while ( true ) {
2010-11-28 03:57:31 +01:00
iEntry = decodedEntries . poll ( 1 , TimeUnit . SECONDS ) ;
2011-12-06 02:24:51 +01:00
if ( iEntry = = null | | iEntry = = WordReferenceVars . poison ) {
break pollloop ;
}
2011-05-26 12:57:02 +02:00
assert ( iEntry . urlhash ( ) . length = = index . row ( ) . primaryKeyLength ) ;
2010-11-28 03:57:31 +01:00
//if (iEntry.urlHash().length() != index.row().primaryKeyLength) continue;
2007-11-16 15:48:09 +01:00
2010-11-28 03:57:31 +01:00
// increase flag counts
2011-12-06 02:24:51 +01:00
for ( int j = 0 ; j < 32 ; j + + ) {
if ( iEntry . flags ( ) . get ( j ) ) {
this . flagcount [ j ] + + ;
}
2010-11-28 03:57:31 +01:00
}
2008-02-21 11:06:57 +01:00
2010-11-28 03:57:31 +01:00
// check constraints
2011-12-06 02:24:51 +01:00
if ( ! testFlags ( iEntry ) ) {
2011-06-23 13:57:17 +02:00
continue pollloop ;
2010-11-28 03:57:31 +01:00
}
// check document domain
2011-12-06 02:24:51 +01:00
if ( this . query . contentdom ! = ContentDomain . TEXT ) {
if ( ( this . query . contentdom = = ContentDomain . AUDIO )
& & ( ! ( iEntry . flags ( ) . get ( Condenser . flag_cat_hasaudio ) ) ) ) {
continue pollloop ;
}
if ( ( this . query . contentdom = = ContentDomain . VIDEO )
& & ( ! ( iEntry . flags ( ) . get ( Condenser . flag_cat_hasvideo ) ) ) ) {
continue pollloop ;
}
if ( ( this . query . contentdom = = ContentDomain . IMAGE )
& & ( ! ( iEntry . flags ( ) . get ( Condenser . flag_cat_hasimage ) ) ) ) {
continue pollloop ;
}
if ( ( this . query . contentdom = = ContentDomain . APP )
& & ( ! ( iEntry . flags ( ) . get ( Condenser . flag_cat_hasapp ) ) ) ) {
continue pollloop ;
}
2010-11-28 03:57:31 +01:00
}
// check tld domain
/ *
if ( ( DigestURI . domDomain ( iEntry . metadataHash ( ) ) & this . query . zonecode ) = = 0 ) {
// filter out all tld that do not match with wanted tld domain
2011-05-07 01:04:27 +02:00
this . sortout + + ;
2010-11-28 03:57:31 +01:00
continue ;
}
* /
2011-06-22 01:10:50 +02:00
2010-09-15 23:09:14 +02:00
// count domZones
//this.domZones[DigestURI.domDomain(iEntry.metadataHash())]++;
2011-06-22 01:10:50 +02:00
2010-11-28 03:57:31 +01:00
// check site constraints
2011-06-22 01:10:50 +02:00
final String hosthash = iEntry . hosthash ( ) ;
2011-12-06 02:24:51 +01:00
if ( this . query . sitehash = = null ) {
2011-12-13 00:16:05 +01:00
if ( this . query . siteexcludes ! = null & & this . query . siteexcludes . contains ( hosthash ) ) {
continue pollloop ;
}
2010-11-28 03:57:31 +01:00
// no site constraint there; maybe collect host navigation information
2011-12-06 02:24:51 +01:00
if ( nav_hosts & & this . query . urlMask_isCatchall ) {
2011-05-26 12:57:02 +02:00
this . hostNavigator . inc ( hosthash ) ;
2011-05-29 22:33:12 +02:00
this . hostResolver . put ( hosthash , iEntry . urlhash ( ) ) ;
2010-11-28 03:57:31 +01:00
}
} else {
2011-12-06 02:24:51 +01:00
if ( ! hosthash . equals ( this . query . sitehash ) ) {
2010-11-28 03:57:31 +01:00
// filter out all domains that do not match with the site constraint
2011-06-23 13:57:17 +02:00
continue pollloop ;
2010-11-28 03:57:31 +01:00
}
}
2011-06-22 01:10:50 +02:00
2011-06-23 13:57:17 +02:00
// check protocol
2011-12-06 02:24:51 +01:00
if ( ! this . query . urlMask_isCatchall ) {
2011-06-23 13:57:17 +02:00
final boolean httpFlagSet = DigestURI . flag4HTTPset ( iEntry . urlHash ) ;
2011-12-06 02:24:51 +01:00
if ( httpPattern & & ! httpFlagSet ) {
continue pollloop ;
}
if ( noHttpButProtocolPattern & & httpFlagSet ) {
continue pollloop ;
}
2011-06-23 13:57:17 +02:00
}
2010-11-28 03:57:31 +01:00
// finally make a double-check and insert result to stack
2011-09-16 12:00:51 +02:00
// the url hashes should be unique, no reason to check that
//if (!this.urlhashes.has(iEntry.urlhash())) {
2011-12-06 02:24:51 +01:00
this . urlhashes . putUnique ( iEntry . urlhash ( ) ) ;
rankingtryloop : while ( true ) {
try {
this . stack . put ( new ReverseElement < WordReferenceVars > ( iEntry , this . order
. cardinal ( iEntry ) ) ) ; // inserts the element and removes the worst (which is smallest)
break rankingtryloop ;
} catch ( final ArithmeticException e ) {
// this may happen if the concurrent normalizer changes values during cardinal computation
continue rankingtryloop ;
2011-05-18 16:26:28 +02:00
}
2011-12-06 02:24:51 +01:00
}
// increase counter for statistics
if ( local ) {
this . local_indexCount + + ;
} else {
this . remote_indexCount + + ;
//}
}
2010-11-28 03:57:31 +01:00
}
2009-08-28 01:20:59 +02:00
2011-12-06 02:24:51 +01:00
} catch ( final InterruptedException e ) {
} catch ( final RowSpaceExceededException e ) {
} finally {
if ( finalizeAddAtEnd ) {
this . addRunning = false ;
}
2011-03-21 14:05:51 +01:00
}
2011-06-22 01:10:50 +02:00
2007-11-22 00:14:57 +01:00
//if ((query.neededResults() > 0) && (container.size() > query.neededResults())) remove(true, true);
2011-12-06 02:24:51 +01:00
EventTracker . update ( EventTracker . EClass . SEARCH , new ProfilingGraph . EventSearch (
this . query . id ( true ) ,
SearchEvent . Type . PRESORT ,
resourceName ,
index . size ( ) ,
System . currentTimeMillis ( ) - timer ) , false ) ;
2007-11-16 15:48:09 +01:00
}
2011-06-22 01:10:50 +02:00
2009-09-04 01:09:53 +02:00
/ * *
* method to signal the incoming stack that one feeder has terminated
* /
public void oneFeederTerminated ( ) {
2012-01-09 03:02:35 +01:00
this . feedersTerminated . incrementAndGet ( ) ;
final int c = this . feedersAlive . decrementAndGet ( ) ;
2011-12-06 02:24:51 +01:00
assert c > = 0 : " feeders = " + c ;
2009-09-04 01:09:53 +02:00
}
2011-06-22 01:10:50 +02:00
2011-12-21 00:32:03 +01:00
public void oneFeederStarted ( ) {
2012-01-09 03:02:35 +01:00
this . feedersAlive . addAndGet ( 1 ) ;
2009-09-04 01:09:53 +02:00
}
2011-06-22 01:10:50 +02:00
2010-10-04 13:54:48 +02:00
public boolean feedingIsFinished ( ) {
2012-01-09 03:02:35 +01:00
return
this . feedersTerminated . intValue ( ) > ( this . remote ? 1 : 0 ) & &
2012-01-10 03:00:55 +01:00
this . feedersAlive . get ( ) = = 0 & &
2012-01-09 03:02:35 +01:00
( ! this . remote | | this . remote_indexCount > 0 ) ;
2009-09-04 01:09:53 +02:00
}
2011-06-22 01:10:50 +02:00
2009-04-07 11:34:41 +02:00
private boolean testFlags ( final WordReference ientry ) {
2011-12-06 02:24:51 +01:00
if ( this . query . constraint = = null ) {
return true ;
}
2007-11-22 00:14:57 +01:00
// test if ientry matches with filter
// if all = true: let only entries pass that has all matching bits
// if all = false: let all entries pass that has at least one matching bit
2011-12-06 02:24:51 +01:00
if ( this . query . allofconstraint ) {
for ( int i = 0 ; i < 32 ; i + + ) {
if ( ( this . query . constraint . get ( i ) ) & & ( ! ientry . flags ( ) . get ( i ) ) ) {
return false ;
}
2007-11-22 00:14:57 +01:00
}
return true ;
}
2011-12-06 02:24:51 +01:00
for ( int i = 0 ; i < 32 ; i + + ) {
if ( ( this . query . constraint . get ( i ) ) & & ( ientry . flags ( ) . get ( i ) ) ) {
return true ;
}
2007-11-22 00:14:57 +01:00
}
return false ;
2007-11-07 23:38:09 +01:00
}
2011-06-22 01:10:50 +02:00
2011-09-25 18:59:06 +02:00
/**
 * Direct access to the local search inclusion maps, needed for abstract
 * generation. This is only available after the term search was executed
 * (i.e. after run() assigned localSearchInclusion); before that it is null.
 *
 * @return the word-hash to reference-container map of the local search, or null
 */
public Map<byte[], ReferenceContainer<WordReference>> searchContainerMap() {
    // direct access to the result maps is needed for abstract generation
    // this is only available if execQuery() was called before
    return this.localSearchInclusion;
}
2011-06-22 01:10:50 +02:00
2011-12-06 02:24:51 +01:00
private WeakPriorityBlockingQueue . Element < WordReferenceVars > takeRWI (
final boolean skipDoubleDom ,
final long waitingtime ) {
2011-06-22 01:10:50 +02:00
2008-04-24 17:09:06 +02:00
// returns from the current RWI list the best entry and removes this entry from the list
2010-10-04 13:54:48 +02:00
WeakPriorityBlockingQueue < WordReferenceVars > m ;
WeakPriorityBlockingQueue . Element < WordReferenceVars > rwi = null ;
2011-06-22 01:10:50 +02:00
2010-10-05 19:49:53 +02:00
// take one entry from the stack if there are entries on that stack or the feeding is not yet finished
2011-03-21 14:05:51 +01:00
try {
2010-10-04 13:54:48 +02:00
//System.out.println("stack.poll: feeders = " + this.feeders + ", stack.sizeQueue = " + stack.sizeQueue());
int loops = 0 ; // a loop counter to terminate the reading if all the results are from the same domain
2011-12-06 02:24:51 +01:00
// wait some time if we did not get so much remote results so far to get a better ranking over remote results
// we wait at most 30 milliseconds to get a maximum total waiting time of 300 milliseconds for 10 results
2011-12-06 15:28:48 +01:00
long wait = waitTimeRecommendation ( ) ;
2011-12-06 02:24:51 +01:00
if ( wait > 0 ) {
2011-12-06 21:55:50 +01:00
//System.out.println("*** RWIProcess extra wait: " + wait + "ms; expectedRemoteReferences = " + this.expectedRemoteReferences.get() + ", receivedRemoteReferences = " + this.receivedRemoteReferences.get() + ", initialExpectedRemoteReferences = " + this.maxExpectedRemoteReferences.get());
2011-12-06 02:24:51 +01:00
Thread . sleep ( wait ) ;
}
// loop as long as we can expect that we should get more results
2011-12-06 15:28:48 +01:00
final long timeout = System . currentTimeMillis ( ) + waitingtime ;
2011-12-06 02:24:51 +01:00
while ( ( ( ! feedingIsFinished ( ) & & this . addRunning ) | | this . stack . sizeQueue ( ) > 0 )
& & ( this . query . itemsPerPage < 1 | | loops + + < this . query . itemsPerPage ) ) {
if ( waitingtime < = 0 ) {
2011-06-22 01:10:50 +02:00
rwi = this . stack . poll ( ) ;
2011-12-06 02:24:51 +01:00
} else {
timeoutloop : while ( System . currentTimeMillis ( ) < timeout ) {
if ( feedingIsFinished ( ) & & this . stack . sizeQueue ( ) = = 0 ) {
break timeoutloop ;
}
rwi = this . stack . poll ( 50 ) ;
if ( rwi ! = null ) {
break timeoutloop ;
}
}
}
if ( rwi = = null ) {
break ;
2010-10-04 13:54:48 +02:00
}
2011-12-06 02:24:51 +01:00
if ( ! skipDoubleDom ) {
2010-10-04 13:54:48 +02:00
//System.out.println("!skipDoubleDom");
return rwi ;
2011-12-06 02:24:51 +01:00
}
2011-06-22 01:10:50 +02:00
2010-09-10 00:42:54 +02:00
// check doubledom
2011-05-30 06:19:20 +02:00
final String hosthash = rwi . getElement ( ) . hosthash ( ) ;
2011-12-06 02:24:51 +01:00
synchronized ( this . doubleDomCache ) {
2011-05-30 06:19:20 +02:00
m = this . doubleDomCache . get ( hosthash ) ;
2011-12-06 02:24:51 +01:00
if ( m = = null ) {
2010-10-05 19:49:53 +02:00
// first appearance of dom. we create an entry to signal that one of that domain was already returned
2011-12-06 02:24:51 +01:00
m =
new WeakPriorityBlockingQueue < WordReferenceVars > ( ( this . query . specialRights )
? maxDoubleDomSpecial
: maxDoubleDomAll ) ;
2011-05-30 06:19:20 +02:00
this . doubleDomCache . put ( hosthash , m ) ;
2010-10-05 19:49:53 +02:00
return rwi ;
}
// second appearances of dom
m . put ( rwi ) ;
2010-09-10 00:42:54 +02:00
}
2007-11-22 00:14:57 +01:00
}
2011-12-06 02:24:51 +01:00
} catch ( final InterruptedException e1 ) {
}
if ( this . doubleDomCache . isEmpty ( ) ) {
return null ;
}
2011-06-22 01:10:50 +02:00
2007-11-22 00:14:57 +01:00
// no more entries in sorted RWI entries. Now take Elements from the doubleDomCache
// find best entry from all caches
2010-10-04 13:54:48 +02:00
WeakPriorityBlockingQueue . Element < WordReferenceVars > bestEntry = null ;
WeakPriorityBlockingQueue . Element < WordReferenceVars > o ;
2011-12-06 02:24:51 +01:00
synchronized ( this . doubleDomCache ) {
final Iterator < WeakPriorityBlockingQueue < WordReferenceVars > > i =
this . doubleDomCache . values ( ) . iterator ( ) ;
while ( i . hasNext ( ) ) {
2009-11-12 00:31:12 +01:00
try {
m = i . next ( ) ;
2011-12-06 02:24:51 +01:00
} catch ( final ConcurrentModificationException e ) {
2009-11-12 00:31:12 +01:00
Log . logException ( e ) ;
2010-10-05 19:49:53 +02:00
continue ; // not the best solution...
2010-10-04 13:54:48 +02:00
}
2011-12-06 02:24:51 +01:00
if ( m = = null ) {
continue ;
}
if ( m . isEmpty ( ) ) {
continue ;
}
if ( bestEntry = = null ) {
2010-09-09 17:30:25 +02:00
bestEntry = m . peek ( ) ;
2009-11-12 00:31:12 +01:00
continue ;
}
2010-09-09 17:30:25 +02:00
o = m . peek ( ) ;
2011-12-06 02:24:51 +01:00
if ( o = = null ) {
continue ;
}
if ( o . getWeight ( ) < bestEntry . getWeight ( ) ) {
2009-11-12 00:31:12 +01:00
bestEntry = o ;
}
2007-11-22 00:14:57 +01:00
}
2011-12-06 02:24:51 +01:00
if ( bestEntry = = null ) {
return null ;
}
2011-06-22 01:10:50 +02:00
2010-10-05 19:49:53 +02:00
// finally remove the best entry from the doubledom cache
2011-05-30 06:19:20 +02:00
m = this . doubleDomCache . get ( bestEntry . getElement ( ) . hosthash ( ) ) ;
2010-10-05 19:49:53 +02:00
bestEntry = m . poll ( ) ;
2007-11-22 00:14:57 +01:00
}
2008-02-21 11:06:57 +01:00
return bestEntry ;
2007-11-22 00:14:57 +01:00
}
2011-06-22 01:10:50 +02:00
2009-11-24 12:13:11 +01:00
/ * *
2011-12-06 02:24:51 +01:00
* get one metadata entry from the ranked results . This will be the ' best ' entry so far according to the
* applied ranking . If there are no more entries left or the timeout limit is reached then null is
* returned . The caller may distinguish the timeout case from the case where there will be no more also in
* the future by calling this . feedingIsFinished ( )
2011-12-06 15:28:48 +01:00
*
2009-11-24 12:13:11 +01:00
* @param skipDoubleDom should be true if it is wanted that double domain entries are skipped
2010-10-04 13:54:48 +02:00
* @param waitingtime the time this method may take for a result computation
2009-11-24 12:13:11 +01:00
* @return a metadata entry for a url
* /
2010-10-04 13:54:48 +02:00
public URIMetadataRow takeURL ( final boolean skipDoubleDom , final long waitingtime ) {
2009-04-30 00:14:12 +02:00
// returns from the current RWI list the best URL entry and removes this entry from the list
2011-12-06 02:24:51 +01:00
final long timeout = System . currentTimeMillis ( ) + Math . max ( 10 , waitingtime ) ;
int p = - 1 ;
long timeleft ;
while ( ( timeleft = timeout - System . currentTimeMillis ( ) ) > 0 ) {
//System.out.println("timeleft = " + timeleft);
final WeakPriorityBlockingQueue . Element < WordReferenceVars > obrwi =
takeRWI ( skipDoubleDom , timeleft ) ;
if ( obrwi = = null ) {
return null ; // all time was already wasted in takeRWI to get another element
}
2011-05-13 08:21:40 +02:00
final URIMetadataRow page = this . query . getSegment ( ) . urlMetadata ( ) . load ( obrwi ) ;
2011-12-06 02:24:51 +01:00
if ( page = = null ) {
try {
2011-09-16 12:00:51 +02:00
this . misses . putUnique ( obrwi . getElement ( ) . urlhash ( ) ) ;
2011-12-06 02:24:51 +01:00
} catch ( final RowSpaceExceededException e ) {
2011-09-16 12:00:51 +02:00
}
2011-12-06 02:24:51 +01:00
continue ;
2009-08-27 22:20:07 +02:00
}
2011-06-22 01:10:50 +02:00
2011-12-06 02:24:51 +01:00
if ( ! this . query . urlMask_isCatchall ) {
2010-03-23 14:41:41 +01:00
// check url mask
2011-12-17 01:27:08 +01:00
if ( ! page . matches ( this . query . urlMask ) ) {
2011-05-07 01:04:27 +02:00
this . sortout + + ;
2010-03-23 14:41:41 +01:00
continue ;
}
2011-06-22 01:10:50 +02:00
2010-03-23 14:41:41 +01:00
// in case that we do not have e catchall filter for urls
// we must also construct the domain navigator here
2010-12-06 15:34:58 +01:00
//if (query.sitehash == null) {
2011-03-07 21:36:40 +01:00
// this.hostNavigator.inc(UTF8.String(urlhash, 6, 6));
// this.hostResolver.put(UTF8.String(urlhash, 6, 6), UTF8.String(urlhash));
2010-12-06 15:34:58 +01:00
//}
2010-03-23 11:17:28 +01:00
}
2011-06-22 01:10:50 +02:00
2010-03-23 14:41:41 +01:00
// check for more errors
2011-12-17 01:27:08 +01:00
if ( page . url ( ) = = null ) {
2011-05-07 01:04:27 +02:00
this . sortout + + ;
2010-03-23 11:17:28 +01:00
continue ; // rare case where the url is corrupted
}
2011-12-17 01:27:08 +01:00
final String pageurl = page . url ( ) . toNormalform ( true , true ) ;
final String pageauthor = page . dc_creator ( ) ;
final String pagetitle = page . dc_title ( ) . toLowerCase ( ) ;
2010-03-23 11:17:28 +01:00
2009-08-27 22:20:07 +02:00
// check exclusion
2011-12-06 02:24:51 +01:00
if ( ( QueryParams . anymatch ( pagetitle , this . query . excludeHashes ) )
| | ( QueryParams . anymatch ( pageurl . toLowerCase ( ) , this . query . excludeHashes ) )
| | ( QueryParams . anymatch ( pageauthor . toLowerCase ( ) , this . query . excludeHashes ) ) ) {
2011-05-07 01:04:27 +02:00
this . sortout + + ;
2009-08-27 22:20:07 +02:00
continue ;
}
2011-03-31 11:41:30 +02:00
2009-08-27 22:20:07 +02:00
// check index-of constraint
2011-12-06 02:24:51 +01:00
if ( ( this . query . constraint ! = null )
& & ( this . query . constraint . get ( Condenser . flag_cat_indexof ) )
& & ( ! ( pagetitle . startsWith ( " index of " ) ) ) ) {
2011-06-22 01:10:50 +02:00
final Iterator < byte [ ] > wi = this . query . queryHashes . iterator ( ) ;
2011-12-06 02:24:51 +01:00
while ( wi . hasNext ( ) ) {
2010-12-06 15:34:58 +01:00
this . query . getSegment ( ) . termIndex ( ) . removeDelayed ( wi . next ( ) , page . hash ( ) ) ;
}
2011-05-07 01:04:27 +02:00
this . sortout + + ;
2009-08-27 22:20:07 +02:00
continue ;
}
2011-03-31 11:41:30 +02:00
// check location constraint
2011-12-06 02:24:51 +01:00
if ( ( this . query . constraint ! = null )
& & ( this . query . constraint . get ( Condenser . flag_cat_haslocation ) )
2011-12-17 01:27:08 +01:00
& & ( page . lat ( ) = = 0 . 0f | | page . lon ( ) = = 0 . 0f ) ) {
2011-05-07 01:04:27 +02:00
this . sortout + + ;
2011-03-31 11:41:30 +02:00
continue ;
}
2011-06-22 01:10:50 +02:00
2009-08-27 22:20:07 +02:00
// check content domain
2011-12-06 02:24:51 +01:00
if ( ( this . query . contentdom = = ContentDomain . AUDIO & & page . laudio ( ) = = 0 )
| | ( this . query . contentdom = = ContentDomain . VIDEO & & page . lvideo ( ) = = 0 )
| | ( this . query . contentdom = = ContentDomain . IMAGE & & page . limage ( ) = = 0 )
| | ( this . query . contentdom = = ContentDomain . APP & & page . lapp ( ) = = 0 ) ) {
2011-05-07 01:04:27 +02:00
this . sortout + + ;
2011-12-06 02:24:51 +01:00
continue ;
2009-08-27 22:20:07 +02:00
}
2011-06-22 01:10:50 +02:00
2009-08-27 22:20:07 +02:00
// evaluate information of metadata for navigation
// author navigation:
2011-12-06 02:24:51 +01:00
if ( pageauthor ! = null & & pageauthor . length ( ) > 0 ) {
// add author to the author navigator
2011-06-22 01:10:50 +02:00
final String authorhash = ASCII . String ( Word . word2hash ( pageauthor ) ) ;
2010-11-28 03:57:31 +01:00
2009-08-27 22:20:07 +02:00
// check if we already are filtering for authors
2011-12-06 02:24:51 +01:00
if ( this . query . authorhash ! = null & & ! this . query . authorhash . equals ( authorhash ) ) {
2011-05-07 01:04:27 +02:00
this . sortout + + ;
2011-12-06 02:24:51 +01:00
continue ;
}
2011-06-22 01:10:50 +02:00
2011-12-06 02:24:51 +01:00
// add author to the author navigator
2010-10-16 01:45:12 +02:00
this . authorNavigator . inc ( pageauthor ) ;
2011-12-06 02:24:51 +01:00
} else if ( this . query . authorhash ! = null ) {
2011-05-07 01:04:27 +02:00
this . sortout + + ;
2011-12-06 02:24:51 +01:00
continue ;
2009-06-09 17:22:23 +02:00
}
2011-06-22 01:10:50 +02:00
2010-03-05 22:25:49 +01:00
// namespace navigation
2011-12-17 01:27:08 +01:00
String pagepath = page . url ( ) . getPath ( ) ;
2011-12-06 02:24:51 +01:00
if ( ( p = pagepath . indexOf ( ':' ) ) > = 0 ) {
pagepath = pagepath . substring ( 0 , p ) ;
2010-03-05 22:25:49 +01:00
p = pagepath . lastIndexOf ( '/' ) ;
2011-12-06 02:24:51 +01:00
if ( p > = 0 ) {
2010-03-05 22:25:49 +01:00
pagepath = pagepath . substring ( p + 1 ) ;
2010-10-16 01:45:12 +02:00
this . namespaceNavigator . inc ( pagepath ) ;
2010-03-05 22:25:49 +01:00
}
}
2011-06-22 01:10:50 +02:00
2011-06-23 17:39:52 +02:00
// protocol navigation
2011-12-17 01:27:08 +01:00
final String protocol = page . url ( ) . getProtocol ( ) ;
2011-06-23 17:39:52 +02:00
this . protocolNavigator . inc ( protocol ) ;
// file type navigation
2011-12-17 01:27:08 +01:00
final String fileext = page . url ( ) . getFileExtension ( ) ;
2011-12-06 02:24:51 +01:00
if ( fileext . length ( ) > 0 ) {
this . filetypeNavigator . inc ( fileext ) ;
}
2011-06-23 17:39:52 +02:00
2010-12-17 18:30:09 +01:00
// check Scanner
2011-12-17 01:27:08 +01:00
if ( ! Scanner . acceptURL ( page . url ( ) ) ) {
2011-05-07 01:04:27 +02:00
this . sortout + + ;
2010-12-17 18:30:09 +01:00
continue ;
}
2011-06-22 01:10:50 +02:00
2009-08-27 22:20:07 +02:00
// accept url
return page ;
2007-11-22 00:14:57 +01:00
}
return null ;
2009-08-25 23:27:01 +02:00
}
2011-06-22 01:10:50 +02:00
2010-10-04 13:54:48 +02:00
public int sizeQueue ( ) {
2011-06-22 01:10:50 +02:00
int c = this . stack . sizeQueue ( ) ;
2011-12-06 02:24:51 +01:00
for ( final WeakPriorityBlockingQueue < WordReferenceVars > s : this . doubleDomCache . values ( ) ) {
2010-10-04 13:54:48 +02:00
c + = s . sizeQueue ( ) ;
}
return c ;
}
2011-06-22 01:10:50 +02:00
2010-10-04 13:54:48 +02:00
public int sizeAvailable ( ) {
2011-06-22 01:10:50 +02:00
int c = this . stack . sizeAvailable ( ) ;
2011-12-06 02:24:51 +01:00
for ( final WeakPriorityBlockingQueue < WordReferenceVars > s : this . doubleDomCache . values ( ) ) {
2010-09-09 17:30:25 +02:00
c + = s . sizeAvailable ( ) ;
2009-08-27 17:19:48 +02:00
}
2007-11-22 00:14:57 +01:00
return c ;
2007-11-07 23:38:09 +01:00
}
2011-06-22 01:10:50 +02:00
2009-12-02 01:37:59 +01:00
public boolean isEmpty ( ) {
2011-12-06 02:24:51 +01:00
if ( ! this . stack . isEmpty ( ) ) {
return false ;
}
for ( final WeakPriorityBlockingQueue < WordReferenceVars > s : this . doubleDomCache . values ( ) ) {
if ( ! s . isEmpty ( ) ) {
return false ;
}
2009-12-02 01:37:59 +01:00
}
return true ;
}
2011-06-22 01:10:50 +02:00
2007-11-16 15:48:09 +01:00
public int [ ] flagCount ( ) {
2011-12-06 02:24:51 +01:00
return this . flagcount ;
2007-11-16 15:48:09 +01:00
}
2011-06-22 01:10:50 +02:00
2008-01-30 22:58:30 +01:00
// "results from a total number of <remote_resourceSize + local_resourceSize> known (<local_resourceSize> local, <remote_resourceSize> remote), <remote_indexCount> links from <remote_peerCount> other YaCy peers."
2011-06-22 01:10:50 +02:00
2007-11-07 23:38:09 +01:00
public int filteredCount ( ) {
2008-01-30 22:58:30 +01:00
// the number of index entries that are considered as result set
2010-09-09 17:30:25 +02:00
return this . stack . sizeAvailable ( ) ;
2007-11-07 23:38:09 +01:00
}
2010-01-13 01:04:37 +01:00
public int getLocalIndexCount ( ) {
// the number of results in the local peer after filtering
return this . local_indexCount ;
}
2011-06-22 01:10:50 +02:00
2008-01-30 22:58:30 +01:00
public int getRemoteIndexCount ( ) {
// the number of result contributions from all the remote peers
return this . remote_indexCount ;
}
2011-06-22 01:10:50 +02:00
2008-01-30 22:58:30 +01:00
public int getRemoteResourceSize ( ) {
// the number of all hits in all the remote peers
2010-12-06 15:34:58 +01:00
return Math . max ( this . remote_resourceSize , this . remote_indexCount ) ;
2008-01-30 22:58:30 +01:00
}
2011-06-22 01:10:50 +02:00
2010-01-13 01:04:37 +01:00
public int getRemotePeerCount ( ) {
// the number of remote peers that have contributed
return this . remote_peerCount ;
2008-01-30 22:58:30 +01:00
}
2011-06-22 01:10:50 +02:00
2010-04-15 15:22:59 +02:00
public Iterator < byte [ ] > miss ( ) {
2007-11-22 00:14:57 +01:00
return this . misses . iterator ( ) ;
2007-11-07 23:38:09 +01:00
}
2011-06-22 01:10:50 +02:00
2010-12-02 13:19:59 +01:00
public int getMissCount ( ) {
return this . misses . size ( ) ;
}
2011-06-22 01:10:50 +02:00
2011-05-07 01:04:27 +02:00
public int getSortOutCount ( ) {
return this . sortout ;
}
2011-06-22 01:10:50 +02:00
2011-03-13 02:41:44 +01:00
public ScoreMap < String > getNamespaceNavigator ( ) {
2011-12-06 02:24:51 +01:00
if ( ! this . query . navigators . equals ( " all " ) & & this . query . navigators . indexOf ( " namespace " , 0 ) < 0 ) {
return new ClusteredScoreMap < String > ( ) ;
}
if ( this . namespaceNavigator . sizeSmaller ( 2 ) ) {
this . namespaceNavigator . clear ( ) ; // navigators with one entry are not useful
}
2010-10-16 01:45:12 +02:00
return this . namespaceNavigator ;
2009-06-09 00:01:26 +02:00
}
2011-06-22 01:10:50 +02:00
2011-03-13 02:41:44 +01:00
public ScoreMap < String > getHostNavigator ( ) {
2011-06-22 01:10:50 +02:00
final ScoreMap < String > result = new ConcurrentScoreMap < String > ( ) ;
2011-12-06 02:24:51 +01:00
if ( ! this . query . navigators . equals ( " all " ) & & this . query . navigators . indexOf ( " hosts " , 0 ) < 0 ) {
return result ;
}
2011-06-22 01:10:50 +02:00
2010-11-28 03:57:31 +01:00
final Iterator < String > domhashs = this . hostNavigator . keys ( false ) ;
2010-10-16 01:45:12 +02:00
URIMetadataRow row ;
2011-05-29 22:33:12 +02:00
byte [ ] urlhash ;
String hosthash , hostname ;
2011-12-06 02:24:51 +01:00
if ( this . hostResolver ! = null ) {
while ( domhashs . hasNext ( ) & & result . sizeSmaller ( 30 ) ) {
hosthash = domhashs . next ( ) ;
if ( hosthash = = null ) {
continue ;
}
urlhash = this . hostResolver . get ( hosthash ) ;
row = urlhash = = null ? null : this . query . getSegment ( ) . urlMetadata ( ) . load ( urlhash ) ;
2011-12-17 01:27:08 +01:00
hostname = row = = null ? null : row . url ( ) . getHost ( ) ;
2011-12-06 02:24:51 +01:00
if ( hostname ! = null ) {
result . set ( hostname , this . hostNavigator . get ( hosthash ) ) ;
}
2010-12-06 15:34:58 +01:00
}
2009-06-02 17:20:10 +02:00
}
2011-12-06 02:24:51 +01:00
if ( result . sizeSmaller ( 2 ) ) {
result . clear ( ) ; // navigators with one entry are not useful
}
2009-06-02 17:20:10 +02:00
return result ;
}
2011-06-23 17:39:52 +02:00
public ScoreMap < String > getProtocolNavigator ( ) {
2011-12-06 02:24:51 +01:00
if ( ! this . query . navigators . equals ( " all " ) & & this . query . navigators . indexOf ( " protocol " , 0 ) < 0 ) {
return new ClusteredScoreMap < String > ( ) ;
}
if ( this . protocolNavigator . sizeSmaller ( 2 ) ) {
this . protocolNavigator . clear ( ) ; // navigators with one entry are not useful
}
2011-06-23 17:39:52 +02:00
return this . protocolNavigator ;
}
public ScoreMap < String > getFiletypeNavigator ( ) {
2011-12-06 02:24:51 +01:00
if ( ! this . query . navigators . equals ( " all " ) & & this . query . navigators . indexOf ( " filetype " , 0 ) < 0 ) {
return new ClusteredScoreMap < String > ( ) ;
}
if ( this . filetypeNavigator . sizeSmaller ( 2 ) ) {
this . filetypeNavigator . clear ( ) ; // navigators with one entry are not useful
}
2011-06-23 17:39:52 +02:00
return this . filetypeNavigator ;
}
2011-12-06 02:24:51 +01:00
public static final Comparator < Map . Entry < String , Integer > > mecomp =
new Comparator < Map . Entry < String , Integer > > ( ) {
@Override
public int compare ( final Map . Entry < String , Integer > o1 , final Map . Entry < String , Integer > o2 ) {
if ( o1 . getValue ( ) . intValue ( ) < o2 . getValue ( ) . intValue ( ) ) {
return 1 ;
}
if ( o2 . getValue ( ) . intValue ( ) < o1 . getValue ( ) . intValue ( ) ) {
return - 1 ;
}
return 0 ;
}
} ;
2010-10-16 01:45:12 +02:00
2011-06-22 01:10:50 +02:00
public ScoreMap < String > getTopicNavigator ( final int count ) {
2007-11-07 23:38:09 +01:00
// create a list of words that had been computed by statistics over all
// words that appeared in the url or the description of all urls
2011-03-13 02:41:44 +01:00
final ScoreMap < String > result = new ConcurrentScoreMap < String > ( ) ;
2011-12-06 02:24:51 +01:00
if ( ! this . query . navigators . equals ( " all " ) & & this . query . navigators . indexOf ( " topics " , 0 ) < 0 ) {
return result ;
}
if ( this . ref . sizeSmaller ( 2 ) ) {
this . ref . clear ( ) ; // navigators with one entry are not useful
}
2011-02-02 01:22:00 +01:00
final Map < String , Float > counts = new HashMap < String , Float > ( ) ;
2010-11-28 03:57:31 +01:00
final Iterator < String > i = this . ref . keys ( false ) ;
2010-10-16 01:45:12 +02:00
String word ;
byte [ ] termHash ;
int c ;
2011-02-02 01:22:00 +01:00
float q , min = Float . MAX_VALUE , max = Float . MIN_VALUE ;
2010-10-16 01:45:12 +02:00
int ic = count ;
2011-12-06 02:24:51 +01:00
while ( ic - - > 0 & & i . hasNext ( ) ) {
2010-10-16 01:45:12 +02:00
word = i . next ( ) ;
2011-12-06 02:24:51 +01:00
if ( word = = null ) {
continue ;
}
2010-10-16 01:45:12 +02:00
termHash = Word . word2hash ( word ) ;
c = this . query . getSegment ( ) . termIndex ( ) . count ( termHash ) ;
2011-12-06 02:24:51 +01:00
if ( c > 0 ) {
2011-02-02 01:22:00 +01:00
q = ( ( float ) this . ref . get ( word ) ) / ( ( float ) c ) ;
2010-10-16 01:45:12 +02:00
min = Math . min ( min , q ) ;
max = Math . max ( max , q ) ;
counts . put ( word , q ) ;
}
}
2011-12-06 02:24:51 +01:00
if ( max > min ) {
for ( final Map . Entry < String , Float > ce : counts . entrySet ( ) ) {
result . set ( ce . getKey ( ) , ( int ) ( ( ( double ) count ) * ( ce . getValue ( ) - min ) / ( max - min ) ) ) ;
}
2010-10-16 01:45:12 +02:00
}
return this . ref ;
2007-11-07 23:38:09 +01:00
}
2011-06-22 01:10:50 +02:00
private final static Pattern lettermatch = Pattern . compile ( " [a-z]+ " ) ;
2009-06-02 17:20:10 +02:00
public void addTopic ( final String [ ] words ) {
2007-11-07 23:38:09 +01:00
String word ;
2011-12-06 02:24:51 +01:00
for ( final String w : words ) {
2010-11-28 03:57:31 +01:00
word = w . toLowerCase ( ) ;
2011-12-06 02:24:51 +01:00
if ( word . length ( ) > 2
& & " http_html_php_ftp_www_com_org_net_gov_edu_index_home_page_for_usage_the_and_zum_der_die_das_und_the_zur_bzw_mit_blog_wiki_aus_bei_off "
. indexOf ( word ) < 0
& & ! this . query . queryHashes . has ( Word . word2hash ( word ) )
& & lettermatch . matcher ( word ) . matches ( )
& & ! Switchboard . badwords . contains ( word )
& & ! Switchboard . stopwords . contains ( word ) ) {
2011-06-22 01:10:50 +02:00
this . ref . inc ( word ) ;
2009-06-02 17:20:10 +02:00
}
2007-11-07 23:38:09 +01:00
}
}
2011-06-22 01:10:50 +02:00
2011-09-25 18:59:06 +02:00
public void addTopics ( final ResultEntry resultEntry ) {
2007-11-07 23:38:09 +01:00
// take out relevant information for reference computation
2011-12-06 02:24:51 +01:00
if ( ( resultEntry . url ( ) = = null ) | | ( resultEntry . title ( ) = = null ) ) {
return ;
}
2009-06-02 17:20:10 +02:00
//final String[] urlcomps = htmlFilterContentScraper.urlComps(resultEntry.url().toNormalform(true, true)); // word components of the url
2010-05-25 14:54:57 +02:00
final String [ ] descrcomps = MultiProtocolURI . splitpattern . split ( resultEntry . title ( ) . toLowerCase ( ) ) ; // words in the description
2011-06-22 01:10:50 +02:00
2007-11-07 23:38:09 +01:00
// add references
2009-06-02 17:20:10 +02:00
//addTopic(urlcomps);
addTopic ( descrcomps ) ;
2007-11-07 23:38:09 +01:00
}
2011-06-22 01:10:50 +02:00
2011-03-13 02:41:44 +01:00
public ScoreMap < String > getAuthorNavigator ( ) {
2009-06-09 00:01:26 +02:00
// create a list of words that had been computed by statistics over all
// words that appeared in the url or the description of all urls
2011-12-06 02:24:51 +01:00
if ( ! this . query . navigators . equals ( " all " ) & & this . query . navigators . indexOf ( " authors " , 0 ) < 0 ) {
return new ConcurrentScoreMap < String > ( ) ;
}
if ( this . authorNavigator . sizeSmaller ( 2 ) ) {
this . authorNavigator . clear ( ) ; // navigators with one entry are not useful
}
2010-10-16 01:45:12 +02:00
return this . authorNavigator ;
2009-06-09 00:01:26 +02:00
}
2007-11-22 00:14:57 +01:00
2007-11-07 23:38:09 +01:00
}