2006-02-05 00:51:00 +01:00
// plasmaSearchPreOrder.java
2005-10-23 19:50:27 +02:00
// -----------------------
// part of YACY
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2005
// Created: 23.10.2005
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this softare or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follows this copyright notice here, but changes must not be
// done inside the copyright notive above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
package de.anomic.plasma ;
2005-11-22 16:17:05 +01:00
import java.io.File ;
import java.io.IOException ;
2006-09-10 01:44:54 +02:00
import java.util.HashSet ;
2005-10-23 19:50:27 +02:00
import java.util.Iterator ;
2006-09-30 00:27:20 +02:00
import java.util.Map ;
import java.util.TreeMap ;
2007-04-05 12:14:48 +02:00
import java.util.TreeSet ;
2005-10-23 19:50:27 +02:00
2006-08-03 01:20:03 +02:00
import de.anomic.index.indexContainer ;
2006-11-08 17:17:47 +01:00
import de.anomic.index.indexRWIEntry ;
2006-11-10 02:13:33 +01:00
import de.anomic.plasma.plasmaURL ;
2005-11-23 02:40:02 +01:00
import de.anomic.kelondro.kelondroBinSearch ;
2006-09-30 00:27:20 +02:00
import de.anomic.server.serverCodings ;
import de.anomic.server.serverFileUtils ;
2005-10-23 19:50:27 +02:00
public final class plasmaSearchPreOrder {
2005-11-27 12:55:24 +01:00
public static kelondroBinSearch [ ] ybrTables = null ; // block-rank tables
2005-11-22 16:17:05 +01:00
private static boolean useYBR = true ;
2006-11-08 17:17:47 +01:00
private indexRWIEntry entryMin , entryMax ;
2005-10-23 19:50:27 +02:00
private TreeMap pageAcc ; // key = order hash; value = plasmaLURL.entry
private plasmaSearchQuery query ;
2006-02-05 00:51:00 +01:00
private plasmaSearchRankingProfile ranking ;
2006-12-13 02:39:34 +01:00
private int filteredCount ;
2005-10-23 19:50:27 +02:00
2006-09-07 03:13:03 +02:00
public plasmaSearchPreOrder ( ) {
this . entryMin = null ;
this . entryMax = null ;
this . pageAcc = new TreeMap ( ) ;
this . query = null ;
this . ranking = null ;
}
public plasmaSearchPreOrder ( plasmaSearchQuery query , plasmaSearchRankingProfile ranking , indexContainer container , long maxTime ) {
this . query = query ;
this . ranking = ranking ;
long limitTime = ( maxTime < 0 ) ? Long . MAX_VALUE : System . currentTimeMillis ( ) + maxTime ;
2006-11-08 17:17:47 +01:00
indexRWIEntry iEntry ;
2006-09-07 03:13:03 +02:00
// first pass: find min/max to obtain limits for normalization
Iterator i = container . entries ( ) ;
int count = 0 ;
this . entryMin = null ;
this . entryMax = null ;
while ( i . hasNext ( ) ) {
if ( System . currentTimeMillis ( ) > limitTime ) break ;
2006-11-08 17:17:47 +01:00
iEntry = ( indexRWIEntry ) i . next ( ) ;
if ( this . entryMin = = null ) this . entryMin = ( indexRWIEntry ) iEntry . clone ( ) ; else this . entryMin . min ( iEntry ) ;
if ( this . entryMax = = null ) this . entryMax = ( indexRWIEntry ) iEntry . clone ( ) ; else this . entryMax . max ( iEntry ) ;
2006-09-07 03:13:03 +02:00
count + + ;
}
// second pass: normalize entries and get ranking
i = container . entries ( ) ;
this . pageAcc = new TreeMap ( ) ;
2007-04-05 12:14:48 +02:00
TreeSet searchWords = plasmaSearchQuery . cleanQuery ( query . queryString ) [ 0 ] ;
2006-09-07 03:13:03 +02:00
for ( int j = 0 ; j < count ; j + + ) {
2006-11-08 17:17:47 +01:00
iEntry = ( indexRWIEntry ) i . next ( ) ;
2006-12-06 04:02:57 +01:00
if ( iEntry . urlHash ( ) . length ( ) ! = container . row ( ) . width ( container . row ( ) . primaryKey ( ) ) ) continue ;
2006-11-23 03:16:30 +01:00
if ( ( ! ( query . constraint . equals ( plasmaSearchQuery . catchall_constraint ) ) ) & & ( ! ( iEntry . flags ( ) . allOf ( query . constraint ) ) ) ) continue ; // filter out entries that do not match the search constraint
2006-12-01 17:21:17 +01:00
if ( query . contentdom ! = plasmaSearchQuery . CONTENTDOM_TEXT ) {
if ( ( query . contentdom = = plasmaSearchQuery . CONTENTDOM_AUDIO ) & & ( ! ( iEntry . flags ( ) . get ( plasmaCondenser . flag_cat_hasaudio ) ) ) ) continue ;
if ( ( query . contentdom = = plasmaSearchQuery . CONTENTDOM_VIDEO ) & & ( ! ( iEntry . flags ( ) . get ( plasmaCondenser . flag_cat_hasvideo ) ) ) ) continue ;
if ( ( query . contentdom = = plasmaSearchQuery . CONTENTDOM_IMAGE ) & & ( ! ( iEntry . flags ( ) . get ( plasmaCondenser . flag_cat_hasimage ) ) ) ) continue ;
if ( ( query . contentdom = = plasmaSearchQuery . CONTENTDOM_APP ) & & ( ! ( iEntry . flags ( ) . get ( plasmaCondenser . flag_cat_hasapp ) ) ) ) continue ;
}
2007-04-05 12:14:48 +02:00
pageAcc . put ( serverCodings . encodeHex ( Long . MAX_VALUE - this . ranking . preRanking ( iEntry . generateNormalized ( this . entryMin , this . entryMax ) , searchWords ) , 16 ) + iEntry . urlHash ( ) , iEntry ) ;
2006-09-07 03:13:03 +02:00
}
2006-12-13 02:39:34 +01:00
this . filteredCount = pageAcc . size ( ) ;
}
public int filteredCount ( ) {
return this . filteredCount ;
2006-09-07 03:13:03 +02:00
}
2006-09-10 01:44:54 +02:00
public void remove ( boolean rootDomExt , boolean doubleDom ) {
// this removes all refererences to urls that are extended paths of existing 'RootDom'-urls
2006-09-10 21:44:11 +02:00
if ( pageAcc . size ( ) < = query . wantedResults ) return ;
2006-09-10 01:44:54 +02:00
HashSet rootDoms = new HashSet ( ) ;
HashSet doubleDoms = new HashSet ( ) ;
Iterator i = pageAcc . entrySet ( ) . iterator ( ) ;
Map . Entry entry ;
2006-11-08 17:17:47 +01:00
indexRWIEntry iEntry ;
2006-09-10 01:44:54 +02:00
String hashpart ;
2006-09-10 21:44:11 +02:00
boolean isWordRootURL ;
2007-04-05 12:14:48 +02:00
TreeSet querywords = plasmaSearchQuery . cleanQuery ( query . queryString ( ) ) [ 0 ] ;
2006-09-10 01:44:54 +02:00
while ( i . hasNext ( ) ) {
2006-09-16 02:07:09 +02:00
if ( pageAcc . size ( ) < = query . wantedResults ) break ;
2006-09-10 01:44:54 +02:00
entry = ( Map . Entry ) i . next ( ) ;
2006-11-08 17:17:47 +01:00
iEntry = ( indexRWIEntry ) entry . getValue ( ) ;
2006-09-10 01:44:54 +02:00
hashpart = iEntry . urlHash ( ) . substring ( 6 ) ;
2007-04-03 17:35:29 +02:00
isWordRootURL = plasmaURL . isWordRootURL ( iEntry . urlHash ( ) , querywords ) ;
if ( isWordRootURL ) {
rootDoms . add ( hashpart ) ;
2006-09-10 01:44:54 +02:00
} else {
2007-04-03 17:35:29 +02:00
if ( ( ( rootDomExt ) & & ( rootDoms . contains ( hashpart ) ) ) | |
( ( doubleDom ) & & ( doubleDoms . contains ( hashpart ) ) ) ) {
i . remove ( ) ;
2006-09-10 01:44:54 +02:00
}
}
doubleDoms . add ( hashpart ) ;
}
}
2005-11-22 16:17:05 +01:00
public static void loadYBR ( File rankingPath , int count ) {
// load ranking tables
if ( rankingPath . exists ( ) ) {
2005-11-23 02:40:02 +01:00
ybrTables = new kelondroBinSearch [ count ] ;
2005-11-22 16:17:05 +01:00
String ybrName ;
2005-11-23 12:57:30 +01:00
File f ;
2005-11-22 16:17:05 +01:00
try {
for ( int i = 0 ; i < count ; i + + ) {
ybrName = " YBR-4- " + serverCodings . encodeHex ( i , 2 ) + " .idx " ;
2005-11-23 12:57:30 +01:00
f = new File ( rankingPath , ybrName ) ;
if ( f . exists ( ) ) {
ybrTables [ i ] = new kelondroBinSearch ( serverFileUtils . read ( f ) , 6 ) ;
} else {
ybrTables [ i ] = null ;
}
2005-11-22 16:17:05 +01:00
}
} catch ( IOException e ) {
ybrTables = null ;
}
} else {
ybrTables = null ;
}
}
public static boolean canUseYBR ( ) {
return ybrTables ! = null ;
}
public static boolean isUsingYBR ( ) {
return useYBR ;
}
public static void switchYBR ( boolean usage ) {
useYBR = usage ;
}
2005-10-23 19:50:27 +02:00
public plasmaSearchPreOrder cloneSmart ( ) {
// clones only the top structure
2006-09-07 03:13:03 +02:00
plasmaSearchPreOrder theClone = new plasmaSearchPreOrder ( ) ;
theClone . query = this . query ;
theClone . ranking = this . ranking ;
2005-10-23 19:50:27 +02:00
theClone . pageAcc = ( TreeMap ) this . pageAcc . clone ( ) ;
return theClone ;
}
public boolean hasNext ( ) {
return pageAcc . size ( ) > 0 ;
}
2006-09-08 03:26:06 +02:00
public Object [ ] /*{indexEntry, Long}*/ next ( ) {
2006-09-08 22:26:44 +02:00
String top = ( String ) pageAcc . firstKey ( ) ;
2006-09-08 03:26:06 +02:00
//System.out.println("preorder-key: " + top);
2006-10-13 01:14:41 +02:00
Long preranking ;
try {
preranking = new Long ( Long . MAX_VALUE - Long . parseLong ( top . substring ( 0 , 16 ) , 16 ) ) ; // java.lang.NumberFormatException: For input string: "8000000000020b17" ???
} catch ( NumberFormatException e ) {
e . printStackTrace ( ) ;
preranking = new Long ( 0 ) ;
}
2006-11-08 17:17:47 +01:00
return new Object [ ] { ( indexRWIEntry ) pageAcc . remove ( top ) , preranking } ;
2005-10-23 19:50:27 +02:00
}
2006-11-08 17:17:47 +01:00
public indexRWIEntry [ ] getNormalizer ( ) {
return new indexRWIEntry [ ] { entryMin , entryMax } ;
2006-02-04 15:13:54 +01:00
}
2005-10-23 19:50:27 +02:00
2005-11-22 16:17:05 +01:00
public static int ybr_p ( String urlHash ) {
2006-02-04 15:13:54 +01:00
return 16 * ( 16 - ybr ( urlHash ) ) ;
2005-11-22 16:17:05 +01:00
}
public static int ybr ( String urlHash ) {
if ( ybrTables = = null ) return 16 ;
if ( ! ( useYBR ) ) return 16 ;
final String domHash = urlHash . substring ( 6 ) ;
for ( int i = 0 ; i < ybrTables . length ; i + + ) {
2005-11-23 12:57:30 +01:00
if ( ( ybrTables [ i ] ! = null ) & & ( ybrTables [ i ] . contains ( domHash . getBytes ( ) ) ) ) {
2005-11-22 16:17:05 +01:00
//System.out.println("YBR FOUND: " + urlHash + " (" + i + ")");
return i ;
}
}
//System.out.println("NOT FOUND: " + urlHash);
return 16 ;
}
2005-10-23 19:50:27 +02:00
2005-11-02 17:30:45 +01:00
}