2012-11-18 01:22:41 +01:00
/ * *
* QueryGoal
* Copyright 2012 by Michael Peter Christen ; mc @yacy.net , Frankfurt a . M . , Germany
* First published 16 . 11 . 2005 on http : //yacy.net
*
* This library is free software ; you can redistribute it and / or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation ; either
* version 2 . 1 of the License , or ( at your option ) any later version .
*
* This library is distributed in the hope that it will be useful ,
* but WITHOUT ANY WARRANTY ; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the GNU
* Lesser General Public License for more details .
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21 . txt
* If not , see < http : //www.gnu.org/licenses/>.
* /
package net.yacy.search.query ;
import java.io.UnsupportedEncodingException ;
import java.net.URLEncoder ;
import java.util.ArrayList ;
2013-11-25 15:20:54 +01:00
import java.util.Collection ;
2013-11-26 02:24:47 +01:00
import java.util.Iterator ;
2013-11-25 15:20:54 +01:00
import java.util.Locale ;
2012-12-27 03:19:21 +01:00
import java.util.Map ;
2013-11-26 02:24:47 +01:00
import java.util.Set ;
2012-11-18 01:22:41 +01:00
import java.util.SortedSet ;
2013-11-25 15:20:54 +01:00
import java.util.TreeSet ;
2012-11-18 01:22:41 +01:00
2013-03-17 10:52:31 +01:00
import net.yacy.cora.document.WordCache ;
2013-03-13 14:47:00 +01:00
import net.yacy.cora.federate.solr.Ranking ;
import net.yacy.cora.federate.solr.SchemaDeclaration ;
2013-02-25 14:31:50 +01:00
import net.yacy.cora.federate.solr.SolrType ;
2014-02-26 14:30:48 +01:00
import net.yacy.cora.federate.solr.connector.AbstractSolrConnector ;
2013-11-25 15:20:54 +01:00
import net.yacy.cora.order.NaturalOrder ;
2014-09-16 13:41:01 +02:00
import net.yacy.cora.protocol.Domains ;
2012-11-18 01:22:41 +01:00
import net.yacy.cora.storage.HandleSet ;
import net.yacy.document.parser.html.AbstractScraper ;
import net.yacy.document.parser.html.CharacterCoding ;
import net.yacy.kelondro.data.word.Word ;
2013-11-26 02:24:47 +01:00
import net.yacy.kelondro.util.SetTools ;
2012-11-18 01:22:41 +01:00
import net.yacy.search.index.Segment ;
2013-02-21 13:23:55 +01:00
import net.yacy.search.schema.CollectionConfiguration ;
import net.yacy.search.schema.CollectionSchema ;
2012-11-18 01:22:41 +01:00
public class QueryGoal {
2012-11-18 16:03:34 +01:00
private static char space = ' ' ;
private static char sq = '\'' ;
private static char dq = '"' ;
2014-08-16 14:29:52 +02:00
private static String seps = " .:;#*`,!$%()=?^<>/&_ " ;
2012-11-18 01:22:41 +01:00
2014-03-27 13:29:14 +01:00
public String query_original ;
2013-09-02 04:19:53 +02:00
private HandleSet include_hashes , exclude_hashes ;
2013-11-25 15:20:54 +01:00
private final NormalizedWords include_words , exclude_words ;
2013-09-02 23:09:43 +02:00
private final ArrayList < String > include_strings , exclude_strings ;
2012-11-18 16:03:34 +01:00
2013-11-25 15:20:54 +01:00
/**
 * A sorted set of words ordered by {@code NaturalOrder.naturalComparator} that
 * converts every word to lower case (using {@link Locale#ENGLISH}) before it is
 * stored, looked up or removed.
 */
public static class NormalizedWords extends TreeSet<String> {

    private static final long serialVersionUID = -3050851079671868007L;

    /** Creates an empty set. */
    public NormalizedWords() {
        super(NaturalOrder.naturalComparator);
    }

    /**
     * Creates a set holding the lower-cased form of the given words.
     * @param rawWords the words to add; neither the array nor its elements may be null
     */
    public NormalizedWords(String[] rawWords) {
        super(NaturalOrder.naturalComparator);
        // super.add is used so that no overridable method is called during construction
        for (String word : rawWords) super.add(normalize(word));
    }

    /**
     * Creates a set holding the lower-cased form of the given words.
     * @param rawWords the words to add; neither the collection nor its elements may be null
     */
    public NormalizedWords(Collection<String> rawWords) {
        super(NaturalOrder.naturalComparator);
        for (String word : rawWords) super.add(normalize(word));
    }

    /**
     * Adds the lower-cased form of the word.
     * @return true if the set did not already contain the normalized word
     */
    @Override
    public boolean add(String word) {
        return super.add(normalize(word));
    }

    /**
     * @return true if the argument is a String whose lower-cased form is in the set
     */
    @Override
    public boolean contains(Object word) {
        if (!(word instanceof String)) return false;
        return super.contains(normalize((String) word));
    }

    /**
     * Removes the lower-cased form of the word, so that removal agrees with
     * {@link #add(String)} and {@link #contains(Object)}.
     * @return true if the argument is a String whose lower-cased form was in the set
     */
    @Override
    public boolean remove(Object word) {
        if (!(word instanceof String)) return false;
        return super.remove(normalize((String) word));
    }

    /** Single place that defines the normalization applied to every word. */
    private static String normalize(String word) {
        return word.toLowerCase(Locale.ENGLISH);
    }
}
2012-11-18 01:22:41 +01:00
2013-09-02 04:19:53 +02:00
/**
 * Creates a QueryGoal from already available word hash sets.
 * No query string is stored (query_original is null) and the word and
 * string collections start empty.
 *
 * @param include_hashes hashes to be included; stored as given
 * @param exclude_hashes hashes to be excluded; stored as given
 */
public QueryGoal(HandleSet include_hashes, HandleSet exclude_hashes) {
    // the hash sets are the only information this goal is built from
    this.include_hashes = include_hashes;
    this.exclude_hashes = exclude_hashes;

    // there is no textual query behind this goal
    this.query_original = null;
    this.include_strings = new ArrayList<String>();
    this.exclude_strings = new ArrayList<String>();
    this.include_words = new NormalizedWords();
    this.exclude_words = new NormalizedWords();
}
2014-01-20 00:58:17 +01:00
/ * *
2014-07-24 14:59:37 +02:00
* Creates a QueryGoal from a search query string
2014-01-20 00:58:17 +01:00
* @param query_words search string ( the actual search terms , excluding application specific modifier )
* /
public QueryGoal ( String query_words ) {
2012-12-15 00:05:46 +01:00
assert query_words ! = null ;
2014-01-20 00:58:17 +01:00
this . query_original = query_words ;
2013-11-25 15:20:54 +01:00
this . include_words = new NormalizedWords ( ) ;
this . exclude_words = new NormalizedWords ( ) ;
2012-11-18 16:03:34 +01:00
this . include_strings = new ArrayList < String > ( ) ;
this . exclude_strings = new ArrayList < String > ( ) ;
2012-11-18 01:22:41 +01:00
2012-11-18 16:03:34 +01:00
// remove funny symbols
2013-03-15 10:04:27 +01:00
query_words = CharacterCoding . html2unicode ( AbstractScraper . stripAllTags ( query_words . toCharArray ( ) ) ) . toLowerCase ( ) . trim ( ) ;
2012-11-18 16:03:34 +01:00
int c ;
for ( int i = 0 ; i < seps . length ( ) ; i + + ) {
2013-03-15 10:04:27 +01:00
while ( ( c = query_words . indexOf ( seps . charAt ( i ) ) ) > = 0 ) {
query_words = query_words . substring ( 0 , c ) + ( ( ( c + 1 ) < query_words . length ( ) ) ? ( ' ' + query_words . substring ( c + 1 ) ) : " " ) ;
2012-11-18 01:22:41 +01:00
}
}
2012-11-18 16:03:34 +01:00
// parse first quoted strings
2013-09-02 23:09:43 +02:00
parseQuery ( query_words , this . include_strings , this . exclude_strings ) ;
2012-11-18 16:03:34 +01:00
// .. end then take these strings apart to generate word lists
2013-09-02 23:09:43 +02:00
for ( String s : this . include_strings ) parseQuery ( s , this . include_words , this . include_words ) ;
for ( String s : this . exclude_strings ) parseQuery ( s , this . exclude_words , this . exclude_words ) ;
2012-11-18 16:03:34 +01:00
2013-11-25 15:20:54 +01:00
WordCache . learn ( this . include_words ) ;
WordCache . learn ( this . exclude_words ) ;
2013-03-17 10:52:31 +01:00
2012-11-18 01:22:41 +01:00
this . include_hashes = null ;
this . exclude_hashes = null ;
}
2012-11-18 16:03:34 +01:00
/ *
* EBNF of a query
*
* query = { whitespace , phrase } , [ whitespace ]
* whitespace = space , { space }
* space = ' '
* phrase = [ '-' ] , string
* string = { any character without sq , dq and whitespace } | sq , { any character without sq } , sq | dq , { any character without dq } , dq
* sq = '\''
* dq = '"'
* /
2013-11-25 15:20:54 +01:00
private static void parseQuery ( String s , Collection < String > include_string , Collection < String > exclude_string ) {
2012-11-18 16:03:34 +01:00
while ( s . length ( ) > 0 ) {
// parse query
int p = 0 ;
while ( p < s . length ( ) & & s . charAt ( p ) = = space ) p + + ;
s = s . substring ( p ) ;
if ( s . length ( ) = = 0 ) return ;
// parse phrase
boolean inc = true ;
2014-09-01 00:16:26 +02:00
if ( s . charAt ( 0 ) = = '-' ) {
inc = false ;
s = s . substring ( 1 ) ;
} else if ( s . charAt ( 0 ) = = '+' ) {
inc = true ;
s = s . substring ( 1 ) ;
}
2012-11-18 16:03:34 +01:00
if ( s . length ( ) = = 0 ) return ;
// parse string
char stop = space ;
2014-09-01 00:16:26 +02:00
if ( s . charAt ( 0 ) = = dq ) {
stop = s . charAt ( 0 ) ;
s = s . substring ( 1 ) ;
} else if ( s . charAt ( 0 ) = = sq ) {
stop = s . charAt ( 0 ) ;
s = s . substring ( 1 ) ;
}
2012-11-18 16:03:34 +01:00
p = 0 ;
while ( p < s . length ( ) & & s . charAt ( p ) ! = stop ) p + + ;
String string = s . substring ( 0 , p ) ;
p + + ; // go behind the stop character (eats up space, sq and dq)
s = p < s . length ( ) ? s . substring ( p ) : " " ;
if ( string . length ( ) > 0 ) {
if ( inc ) {
if ( ! include_string . contains ( string ) ) include_string . add ( string ) ;
} else {
if ( ! exclude_string . contains ( string ) ) exclude_string . add ( string ) ;
}
}
}
}
2014-01-20 00:58:17 +01:00
/ * *
* Search query string ( without YaCy specific modifier like site : xxx or / smb )
* the modifier are held separately in a search paramter modifier
*
* @param encodeHTML
* @return
* /
public String getQueryString ( final boolean encodeHTML ) {
2013-03-15 10:04:27 +01:00
if ( this . query_original = = null ) return null ;
2012-12-15 00:05:46 +01:00
String ret ;
if ( encodeHTML ) {
try {
ret = URLEncoder . encode ( this . query_original , " UTF-8 " ) ;
2013-07-17 18:31:30 +02:00
} catch ( final UnsupportedEncodingException e ) {
2012-12-15 00:05:46 +01:00
ret = this . query_original ;
}
} else {
ret = this . query_original ;
}
return ret ;
2012-11-18 01:22:41 +01:00
}
2014-01-20 00:58:17 +01:00
2013-11-25 15:20:54 +01:00
/ * *
* @return a set of hashes of words to be included in the search result .
* if possible , use getIncludeWords instead
* /
2012-11-18 01:22:41 +01:00
public HandleSet getIncludeHashes ( ) {
2013-11-26 02:24:47 +01:00
if ( this . include_hashes = = null ) this . include_hashes = Word . words2hashesHandles ( include_words ) ;
return this . include_hashes ;
2012-11-18 01:22:41 +01:00
}
2013-11-25 15:20:54 +01:00
/ * *
* @return a set of hashes of words to be excluded in the search result
* if possible , use getExcludeWords instead
* /
2012-11-18 01:22:41 +01:00
public HandleSet getExcludeHashes ( ) {
2013-11-26 02:24:47 +01:00
if ( this . exclude_hashes = = null ) this . exclude_hashes = Word . words2hashesHandles ( exclude_words ) ;
return this . exclude_hashes ;
}
public int getIncludeSize ( ) {
2013-11-26 11:47:04 +01:00
assert this . include_hashes = = null | | this . include_words . size ( ) = = 0 | | this . include_hashes . size ( ) = = this . include_words . size ( ) ;
return this . include_hashes = = null ? this . include_words . size ( ) : this . include_hashes . size ( ) ;
2012-11-18 01:22:41 +01:00
}
2013-11-25 15:20:54 +01:00
2013-11-26 02:24:47 +01:00
public int getExcludeSize ( ) {
2013-11-26 11:47:04 +01:00
assert this . exclude_hashes = = null | | this . exclude_words . size ( ) = = 0 | | this . exclude_hashes . size ( ) = = this . exclude_words . size ( ) ;
return this . exclude_hashes = = null ? this . exclude_words . size ( ) : this . exclude_hashes . size ( ) ;
2013-11-26 02:24:47 +01:00
}
2013-11-25 15:20:54 +01:00
/ * *
* @return a set of words to be included in the search result
* /
2013-11-26 02:24:47 +01:00
public Iterator < String > getIncludeWords ( ) {
return this . include_words . iterator ( ) ;
2013-11-25 15:20:54 +01:00
}
/ * *
* @return a set of words to be excluded in the search result
* /
2013-11-26 02:24:47 +01:00
public Iterator < String > getExcludeWords ( ) {
return this . exclude_words . iterator ( ) ;
2013-11-25 15:20:54 +01:00
}
/ * *
* @return a list of include strings which reproduces the original order of the search words and quotation
* /
2013-11-26 02:24:47 +01:00
public Iterator < String > getIncludeStrings ( ) {
return this . include_strings . iterator ( ) ;
2013-11-25 15:20:54 +01:00
}
/ * *
* @return a list of exclude strings which reproduces the original order of the search words and quotation
* /
2013-11-26 02:24:47 +01:00
public Iterator < String > getExcludeStrings ( ) {
return this . exclude_strings . iterator ( ) ;
2013-11-25 15:20:54 +01:00
}
2013-09-02 04:19:53 +02:00
2013-11-26 02:24:47 +01:00
public void removeIncludeWords ( Set < String > words ) {
if ( ! words . isEmpty ( ) ) {
SetTools . excludeDestructiveByTestSmallInLarge ( this . exclude_words , words ) ; //remove stopwords
SetTools . excludeDestructiveByTestSmallInLarge ( this . exclude_strings , words ) ; //remove stopwords
if ( include_hashes ! = null ) for ( String word : words ) this . include_hashes . remove ( Word . word2hash ( word ) ) ;
}
}
2013-11-12 15:54:54 +01:00
/ * *
* the include string may be useful ( and better ) for highlight / snippet computation
* @return the query string containing only the positive literals ( includes ) and without whitespace characters
* /
public String getIncludeString ( ) {
if ( this . include_strings . size ( ) = = 0 ) return " " ;
StringBuilder sb = new StringBuilder ( 10 * include_strings . size ( ) ) ;
for ( String s : this . include_strings ) sb . append ( s ) . append ( ' ' ) ;
return sb . toString ( ) . substring ( 0 , sb . length ( ) - 1 ) ;
}
2012-11-18 16:03:34 +01:00
2013-09-03 07:55:21 +02:00
public boolean isCatchall ( ) {
2014-09-15 20:33:22 +02:00
if ( this . include_hashes ! = null & & this . include_hashes . has ( Segment . catchallHash ) ) return true ;
if ( this . include_strings = = null | | this . include_strings . size ( ) ! = 1 ) return false ;
return ( this . include_strings . contains ( Segment . catchallString ) ) ;
2013-09-03 07:55:21 +02:00
}
2013-11-26 02:24:47 +01:00
public boolean containsInclude ( String word ) {
if ( word = = null | | word . length ( ) = = 0 ) return false ;
String t = word . toLowerCase ( Locale . ENGLISH ) ;
return this . include_strings . contains ( t ) | | this . include_words . contains ( t ) ;
}
2013-09-02 18:55:38 +02:00
public boolean matches ( String text ) {
if ( text = = null | | text . length ( ) = = 0 ) return false ;
2013-09-03 07:55:21 +02:00
// parse special requests
if ( isCatchall ( ) ) return true ;
2013-11-26 02:24:47 +01:00
String t = text . toLowerCase ( Locale . ENGLISH ) ;
2013-09-02 18:55:38 +02:00
for ( String i : this . include_strings ) if ( t . indexOf ( i . toLowerCase ( ) ) < 0 ) return false ;
for ( String e : this . exclude_strings ) if ( t . indexOf ( e . toLowerCase ( ) ) > = 0 ) return false ;
return true ;
}
2013-09-02 23:09:43 +02:00
2012-11-18 01:22:41 +01:00
public void filterOut ( final SortedSet < String > blueList ) {
// filter out words that appear in this set
// this is applied to the queryHashes
2013-11-25 15:20:54 +01:00
for ( String word : blueList ) {
this . include_words . remove ( word ) ;
this . include_strings . remove ( word ) ;
}
2012-11-18 01:22:41 +01:00
final HandleSet blues = Word . words2hashesHandles ( blueList ) ;
for ( final byte [ ] b : blues ) this . include_hashes . remove ( b ) ;
}
2013-09-02 18:55:38 +02:00
2013-09-03 11:14:23 +02:00
public StringBuilder collectionTextQueryString ( CollectionConfiguration configuration , int rankingProfile , boolean noimages ) {
2012-11-18 01:22:41 +01:00
final StringBuilder q = new StringBuilder ( 80 ) ;
2013-09-02 18:55:38 +02:00
// add filter to prevent that results come from failed urls
2013-09-03 07:55:21 +02:00
q . append ( CollectionSchema . httpstatus_i . getSolrFieldName ( ) ) . append ( " :200 " ) ;
2013-09-03 11:14:23 +02:00
if ( noimages ) q . append ( " AND - " ) . append ( CollectionSchema . url_file_ext_s . getSolrFieldName ( ) ) . append ( " :(jpg OR png OR gif) " ) ;
2013-09-02 18:55:38 +02:00
2012-11-18 01:22:41 +01:00
// parse special requests
2013-09-03 07:55:21 +02:00
if ( isCatchall ( ) ) return q ;
2013-09-02 18:55:38 +02:00
// add goal query
StringBuilder w = getGoalQuery ( ) ;
2012-11-18 01:22:41 +01:00
// combine these queries for all relevant fields
2014-09-15 20:33:22 +02:00
if ( w . length ( ) > 0 ) {
q . append ( " AND ( " ) ;
int wc = 0 ;
Float boost ;
Ranking r = configuration . getRanking ( rankingProfile ) ;
for ( Map . Entry < SchemaDeclaration , Float > entry : r . getBoostMap ( ) ) {
SchemaDeclaration field = entry . getKey ( ) ;
boost = entry . getValue ( ) ;
if ( boost = = null | | boost . floatValue ( ) < = 0 . 0f ) continue ;
if ( configuration ! = null & & ! configuration . contains ( field . getSolrFieldName ( ) ) ) continue ;
if ( field . getType ( ) = = SolrType . num_integer ) continue ;
if ( wc > 0 ) q . append ( " OR " ) ;
q . append ( '(' ) ;
q . append ( field . getSolrFieldName ( ) ) . append ( ':' ) . append ( w ) ;
if ( boost ! = null ) q . append ( '^' ) . append ( boost . toString ( ) ) ;
q . append ( ')' ) ;
wc + + ;
}
2012-11-18 01:22:41 +01:00
q . append ( ')' ) ;
}
2013-09-02 18:55:38 +02:00
return q ;
}
2014-09-16 13:41:01 +02:00
public StringBuilder collectionImageQueryString ( final QueryModifier modifier ) {
2013-09-02 18:55:38 +02:00
final StringBuilder q = new StringBuilder ( 80 ) ;
2012-11-18 01:22:41 +01:00
// add filter to prevent that results come from failed urls
2013-09-02 20:02:26 +02:00
q . append ( CollectionSchema . httpstatus_i . getSolrFieldName ( ) ) . append ( " :200 " ) . append ( " AND ( " ) ;
2014-02-26 14:30:48 +01:00
q . append ( CollectionSchema . images_urlstub_sxt . getSolrFieldName ( ) ) . append ( AbstractSolrConnector . CATCHALL_DTERM + " OR " ) ;
2013-12-29 20:14:10 +01:00
q . append ( CollectionSchema . url_file_ext_s . getSolrFieldName ( ) ) . append ( " :(jpg OR png OR gif) OR " ) ;
2013-11-07 03:11:03 +01:00
q . append ( CollectionSchema . content_type . getSolrFieldName ( ) ) . append ( " :(image/*)) " ) ;
2013-09-02 18:55:38 +02:00
// parse special requests
2013-09-03 07:55:21 +02:00
if ( isCatchall ( ) ) return q ;
2013-09-02 18:55:38 +02:00
// add goal query
StringBuilder w = getGoalQuery ( ) ;
// combine these queries for all relevant fields
2014-09-15 20:33:22 +02:00
if ( w . length ( ) > 0 ) {
2014-09-16 13:41:01 +02:00
String hostname = modifier = = null | | modifier . sitehost = = null | | modifier . sitehost . length ( ) = = 0 ? null : Domains . getSmartSLD ( modifier . sitehost ) ;
2014-09-15 20:33:22 +02:00
q . append ( " AND ( " ) ;
2014-09-16 13:41:01 +02:00
q . append ( '(' ) . append ( CollectionSchema . images_text_t . getSolrFieldName ( ) ) . append ( ':' ) . append ( hostname = = null ? w : " ( " + w + " " /*NOT an OR!, the hostname shall only boost*/ + hostname + " ) " ) . append ( " ^100.0) OR " ) ;
2014-09-15 20:33:22 +02:00
q . append ( '(' ) . append ( CollectionSchema . title . getSolrFieldName ( ) ) . append ( ':' ) . append ( w ) . append ( " ^50.0) OR " ) ;
q . append ( '(' ) . append ( CollectionSchema . keywords . getSolrFieldName ( ) ) . append ( ':' ) . append ( w ) . append ( " ^10.0) OR " ) ;
q . append ( '(' ) . append ( CollectionSchema . text_t . getSolrFieldName ( ) ) . append ( ':' ) . append ( w ) . append ( ')' ) ;
q . append ( ')' ) ;
}
2012-11-18 01:22:41 +01:00
return q ;
}
2013-09-02 18:55:38 +02:00
private StringBuilder getGoalQuery ( ) {
int wc = 0 ;
StringBuilder w = new StringBuilder ( 80 ) ;
for ( String s : include_strings ) {
2014-09-15 20:33:22 +02:00
if ( Segment . catchallString . equals ( s ) ) continue ;
2013-09-02 18:55:38 +02:00
if ( wc > 0 ) w . append ( " AND " ) ;
w . append ( dq ) . append ( s ) . append ( dq ) ;
wc + + ;
}
for ( String s : exclude_strings ) {
if ( wc > 0 ) w . append ( " AND - " ) ;
w . append ( dq ) . append ( s ) . append ( dq ) ;
wc + + ;
}
if ( wc > 1 ) { w . insert ( 0 , '(' ) ; w . append ( ')' ) ; }
return w ;
}
2012-11-18 01:22:41 +01:00
}