// plasmaSearchProcessing.java
// (C) 2005 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 17.10.2005 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.plasma ;
import java.util.Collection ;
import java.util.HashMap ;
import java.util.Map ;
import java.util.Set ;
import de.anomic.index.indexContainer ;
/ * *
*
* This class provides search processes and keeps a timing record of the processes
* It shall be used to initiate a search and also to evaluate
* the real obtained timings after a search is performed
* /
public class plasmaSearchProcessing implements Cloneable {
// collection:
// time = time to get a RWI out of RAM cache, assortments and WORDS files
// count = maximum number of RWI-entries that shall be collected
// join
// time = time to perform the join between all collected RWIs
// count = maximum number of entries that shall be joined
// presort:
// time = time to do a sort of the joined URL-records
// count = maximum number of entries that shall be pre-sorted
// urlfetch:
// time = time to fetch the real URLs from the LURL database
// count = maximum number of urls that shall be fetched
// postsort:
// time = time for final sort of URLs
// count = maximum number oof URLs that shall be retrieved during sort
// snippetfetch:
// time = time to fetch snippets for selected URLs
// count = maximum number of snipptes to be fetched
public static final char PROCESS_COLLECTION = 'c' ;
public static final char PROCESS_JOIN = 'j' ;
public static final char PROCESS_PRESORT = 'r' ;
public static final char PROCESS_URLFETCH = 'u' ;
public static final char PROCESS_POSTSORT = 'o' ;
public static final char PROCESS_FILTER = 'f' ;
public static final char PROCESS_SNIPPETFETCH = 's' ;
private static final long minimumTargetTime = 100 ;
public static char [ ] sequence = new char [ ] {
PROCESS_COLLECTION ,
PROCESS_JOIN ,
PROCESS_PRESORT ,
PROCESS_URLFETCH ,
PROCESS_POSTSORT ,
PROCESS_FILTER ,
PROCESS_SNIPPETFETCH
} ;
private HashMap targetTime ;
private HashMap targetCount ;
private HashMap yieldTime ;
private HashMap yieldCount ;
private long timer ;
private plasmaSearchProcessing ( ) {
targetTime = new HashMap ( ) ;
targetCount = new HashMap ( ) ;
yieldTime = new HashMap ( ) ;
yieldCount = new HashMap ( ) ;
timer = 0 ;
}
public plasmaSearchProcessing ( long time , int count ) {
this (
3 * time / 12 , 10 * count ,
1 * time / 12 , 10 * count ,
1 * time / 12 , 10 * count ,
2 * time / 12 , 5 * count ,
3 * time / 12 , count ,
1 * time / 12 , count ,
1 * time / 12 , 1
) ;
}
public plasmaSearchProcessing (
long time_collection , int count_collection ,
long time_join , int count_join ,
long time_presort , int count_presort ,
long time_urlfetch , int count_urlfetch ,
long time_postsort , int count_postsort ,
long time_filter , int count_filter ,
long time_snippetfetch , int count_snippetfetch ) {
this ( ) ;
targetTime . put ( new Character ( PROCESS_COLLECTION ) , new Long ( time_collection ) ) ;
targetTime . put ( new Character ( PROCESS_JOIN ) , new Long ( time_join ) ) ;
targetTime . put ( new Character ( PROCESS_PRESORT ) , new Long ( time_presort ) ) ;
targetTime . put ( new Character ( PROCESS_URLFETCH ) , new Long ( time_urlfetch ) ) ;
targetTime . put ( new Character ( PROCESS_POSTSORT ) , new Long ( time_postsort ) ) ;
targetTime . put ( new Character ( PROCESS_FILTER ) , new Long ( time_filter ) ) ;
targetTime . put ( new Character ( PROCESS_SNIPPETFETCH ) , new Long ( time_snippetfetch ) ) ;
targetCount . put ( new Character ( PROCESS_COLLECTION ) , new Integer ( count_collection ) ) ;
targetCount . put ( new Character ( PROCESS_JOIN ) , new Integer ( count_join ) ) ;
targetCount . put ( new Character ( PROCESS_PRESORT ) , new Integer ( count_presort ) ) ;
targetCount . put ( new Character ( PROCESS_URLFETCH ) , new Integer ( count_urlfetch ) ) ;
targetCount . put ( new Character ( PROCESS_POSTSORT ) , new Integer ( count_postsort ) ) ;
targetCount . put ( new Character ( PROCESS_FILTER ) , new Integer ( count_filter ) ) ;
targetCount . put ( new Character ( PROCESS_SNIPPETFETCH ) , new Integer ( count_snippetfetch ) ) ;
}
public Object clone ( ) {
plasmaSearchProcessing p = new plasmaSearchProcessing ( ) ;
p . targetTime = ( HashMap ) this . targetTime . clone ( ) ;
p . targetCount = ( HashMap ) this . targetCount . clone ( ) ;
p . yieldTime = ( HashMap ) this . yieldTime . clone ( ) ;
p . yieldCount = ( HashMap ) this . yieldCount . clone ( ) ;
return p ;
}
public plasmaSearchProcessing ( String s ) {
targetTime = new HashMap ( ) ;
targetCount = new HashMap ( ) ;
yieldTime = new HashMap ( ) ;
yieldCount = new HashMap ( ) ;
intoMap ( s , targetTime , targetCount ) ;
}
public long duetime ( ) {
// returns the old duetime value as sum of all waiting times
long d = 0 ;
for ( int i = 0 ; i < sequence . length ; i + + ) {
d + = ( ( Long ) targetTime . get ( new Character ( sequence [ i ] ) ) ) . longValue ( ) ;
}
return d ;
}
public void putYield ( String s ) {
intoMap ( s , yieldTime , yieldCount ) ;
}
public String yieldToString ( ) {
return toString ( yieldTime , yieldCount ) ;
}
public String targetToString ( ) {
return toString ( targetTime , targetCount ) ;
}
public long getTargetTime ( char type ) {
// sum up all time that was demanded and subtract all that had been wasted
long sum = 0 ;
Long t ;
Character element ;
for ( int i = 0 ; i < sequence . length ; i + + ) {
element = new Character ( sequence [ i ] ) ;
t = ( Long ) targetTime . get ( element ) ;
if ( t ! = null ) sum + = t . longValue ( ) ;
if ( type = = sequence [ i ] ) return ( sum < 0 ) ? minimumTargetTime : sum ;
t = ( Long ) yieldTime . get ( element ) ;
if ( t ! = null ) sum - = t . longValue ( ) ;
}
return minimumTargetTime ;
}
public int getTargetCount ( char type ) {
Integer i = ( Integer ) targetCount . get ( new Character ( type ) ) ;
if ( i = = null ) return - 1 ; else return i . intValue ( ) ;
}
public long getYieldTime ( char type ) {
Long l = ( Long ) yieldTime . get ( new Character ( type ) ) ;
if ( l = = null ) return - 1 ; else return l . longValue ( ) ;
}
public int getYieldCount ( char type ) {
Integer i = ( Integer ) yieldCount . get ( new Character ( type ) ) ;
if ( i = = null ) return - 1 ; else return i . intValue ( ) ;
}
public void startTimer ( ) {
this . timer = System . currentTimeMillis ( ) ;
}
public void setYieldTime ( char type ) {
// sets a time that is computed using the timer
long t = System . currentTimeMillis ( ) - this . timer ;
yieldTime . put ( new Character ( type ) , new Long ( t ) ) ;
}
public void setYieldCount ( char type , int count ) {
yieldCount . put ( new Character ( type ) , new Integer ( count ) ) ;
}
public String reportToString ( ) {
return " target= " + toString ( targetTime , targetCount ) + " ; yield= " + toString ( yieldTime , yieldCount ) ;
}
public static String toString ( HashMap time , HashMap count ) {
// put this into a format in such a way that it can be send in a http header or post argument
// that means that no '=' or spaces are allowed
StringBuffer sb = new StringBuffer ( sequence . length * 10 ) ;
Character element ;
Integer xi ;
Long xl ;
for ( int i = 0 ; i < sequence . length ; i + + ) {
element = new Character ( sequence [ i ] ) ;
sb . append ( " t " ) ;
sb . append ( element ) ;
xl = ( Long ) time . get ( element ) ;
sb . append ( ( xl = = null ) ? " 0 " : xl . toString ( ) ) ;
sb . append ( " | " ) ;
sb . append ( " c " ) ;
sb . append ( element ) ;
xi = ( Integer ) count . get ( element ) ;
sb . append ( ( xi = = null ) ? " 0 " : xi . toString ( ) ) ;
sb . append ( " | " ) ;
}
return sb . toString ( ) ;
}
public static void intoMap ( String s , HashMap time , HashMap count ) {
// this is the reverse method to toString
int p = 0 ;
char ct ;
String elt ;
String v ;
int p1 ;
while ( ( p < s . length ( ) ) & & ( ( p1 = s . indexOf ( '|' , p ) ) > 0 ) ) {
ct = s . charAt ( p ) ;
elt = s . substring ( p + 1 , p + 2 ) ;
v = s . substring ( p + 2 , p1 ) ;
if ( ct = = 't' ) {
time . put ( elt , new Long ( Long . parseLong ( v ) ) ) ;
} else {
count . put ( elt , new Integer ( Integer . parseInt ( v ) ) ) ;
}
}
}
// the processes
2007-08-06 02:56:56 +02:00
// collection
2007-08-06 01:57:25 +02:00
public Map [ ] localSearchContainers (
plasmaSearchQuery query ,
plasmaWordIndex wordIndex ,
Set urlselection ) {
// search for the set of hashes and return a map of of wordhash:indexContainer containing the seach result
// retrieve entities that belong to the hashes
startTimer ( ) ;
long start = System . currentTimeMillis ( ) ;
Map inclusionContainers = ( query . queryHashes . size ( ) = = 0 ) ? new HashMap ( ) : wordIndex . getContainers (
query . queryHashes ,
urlselection ,
true ,
true ,
getTargetTime ( plasmaSearchProcessing . PROCESS_COLLECTION ) * query . queryHashes . size ( ) / ( query . queryHashes . size ( ) + query . excludeHashes . size ( ) ) ) ;
if ( ( inclusionContainers . size ( ) ! = 0 ) & & ( inclusionContainers . size ( ) < query . queryHashes . size ( ) ) ) inclusionContainers = new HashMap ( ) ; // prevent that only a subset is returned
long remaintime = getTargetTime ( plasmaSearchProcessing . PROCESS_COLLECTION ) - System . currentTimeMillis ( ) + start ;
Map exclusionContainers = ( ( inclusionContainers = = null ) | | ( inclusionContainers . size ( ) = = 0 ) | | ( remaintime < = 0 ) ) ? new HashMap ( ) : wordIndex . getContainers (
query . excludeHashes ,
urlselection ,
true ,
true ,
remaintime ) ;
setYieldTime ( plasmaSearchProcessing . PROCESS_COLLECTION ) ;
setYieldCount ( plasmaSearchProcessing . PROCESS_COLLECTION , inclusionContainers . size ( ) ) ;
return new Map [ ] { inclusionContainers , exclusionContainers } ;
}
2007-08-06 02:56:56 +02:00
// join
2007-08-06 01:57:25 +02:00
public indexContainer localSearchJoinExclude (
Collection includeContainers ,
Collection excludeContainers ,
long time , int maxDistance ) {
// join a search result and return the joincount (number of pages after join)
// since this is a conjunction we return an empty entity if any word is not known
2007-08-25 01:12:59 +02:00
if ( includeContainers = = null ) return plasmaWordIndex . emptyContainer ( null , 0 ) ;
2007-08-06 01:57:25 +02:00
// join the result
startTimer ( ) ;
long start = System . currentTimeMillis ( ) ;
indexContainer rcLocal = indexContainer . joinContainers ( includeContainers , time , maxDistance ) ;
long remaining = getTargetTime ( plasmaSearchProcessing . PROCESS_JOIN ) - System . currentTimeMillis ( ) + start ;
if ( ( rcLocal ! = null ) & & ( remaining > 0 ) ) {
indexContainer . excludeContainers ( rcLocal , excludeContainers , remaining ) ;
}
2007-08-25 01:12:59 +02:00
if ( rcLocal = = null ) rcLocal = plasmaWordIndex . emptyContainer ( null , 0 ) ;
2007-08-06 01:57:25 +02:00
setYieldTime ( plasmaSearchProcessing . PROCESS_JOIN ) ;
setYieldCount ( plasmaSearchProcessing . PROCESS_JOIN , rcLocal . size ( ) ) ;
return rcLocal ;
}
}