2011-06-13 23:44:03 +02:00
// QueryParams.java
2005-10-10 02:33:25 +02:00
// -----------------------
// part of YACY
2008-07-20 19:14:51 +02:00
// (C) by Michael Peter Christen; mc@yacy.net
2005-10-10 02:33:25 +02:00
// first published on http://www.anomic.de
// Frankfurt, Germany, 2005
// Created: 10.10.2005
//
2009-09-05 22:41:21 +02:00
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
2005-10-10 02:33:25 +02:00
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
2011-09-25 18:59:06 +02:00
package net.yacy.search.query ;
2005-10-10 02:33:25 +02:00
2010-08-13 17:59:52 +02:00
import java.io.UnsupportedEncodingException ;
import java.net.URLEncoder ;
2012-01-17 01:53:08 +01:00
import java.util.ArrayList ;
import java.util.Collection ;
2010-06-23 13:19:32 +02:00
import java.util.HashMap ;
2006-09-30 00:27:20 +02:00
import java.util.Iterator ;
2012-09-25 23:59:30 +02:00
import java.util.LinkedHashMap ;
2010-06-23 13:19:32 +02:00
import java.util.Map ;
2011-12-13 00:16:05 +01:00
import java.util.Set ;
2010-11-28 03:57:31 +01:00
import java.util.SortedSet ;
2010-03-23 11:17:28 +01:00
import java.util.regex.Pattern ;
2011-03-23 01:48:19 +01:00
import java.util.regex.PatternSyntaxException ;
2005-10-10 02:33:25 +02:00
2012-10-07 07:46:55 +02:00
import org.apache.solr.client.solrj.SolrQuery ;
import org.apache.solr.client.solrj.SolrQuery.ORDER ;
2012-09-26 16:56:33 +02:00
2011-05-27 10:24:54 +02:00
import net.yacy.cora.document.ASCII ;
2012-04-22 00:04:36 +02:00
import net.yacy.cora.document.Classification ;
import net.yacy.cora.document.Classification.ContentDomain ;
2010-06-23 13:19:32 +02:00
import net.yacy.cora.document.MultiProtocolURI ;
2011-03-07 21:36:40 +01:00
import net.yacy.cora.document.UTF8 ;
2012-09-25 21:20:03 +02:00
import net.yacy.cora.federate.solr.YaCySchema ;
import net.yacy.cora.federate.yacy.CacheStrategy ;
2012-09-20 19:38:22 +02:00
import net.yacy.cora.geo.GeoLocation ;
2012-06-11 23:49:30 +02:00
import net.yacy.cora.lod.vocabulary.Tagging ;
2012-09-21 16:46:57 +02:00
import net.yacy.cora.order.Base64Order ;
2012-07-27 12:13:53 +02:00
import net.yacy.cora.storage.HandleSet ;
import net.yacy.cora.util.SpaceExceededException ;
2009-10-18 02:53:43 +02:00
import net.yacy.document.Condenser ;
import net.yacy.document.parser.html.AbstractScraper ;
import net.yacy.document.parser.html.CharacterCoding ;
2009-10-11 02:12:19 +02:00
import net.yacy.kelondro.data.meta.DigestURI ;
import net.yacy.kelondro.data.word.Word ;
2010-04-15 15:22:59 +02:00
import net.yacy.kelondro.data.word.WordReferenceRow ;
2012-07-27 12:13:53 +02:00
import net.yacy.kelondro.index.RowHandleSet ;
2010-04-15 15:22:59 +02:00
import net.yacy.kelondro.logging.Log ;
2012-09-21 16:46:57 +02:00
import net.yacy.kelondro.util.Bitfield ;
2009-10-10 03:14:19 +02:00
import net.yacy.kelondro.util.SetTools ;
2011-10-04 11:06:24 +02:00
import net.yacy.peers.Seed ;
2011-09-25 18:59:06 +02:00
import net.yacy.search.index.Segment ;
2012-10-02 14:29:45 +02:00
import net.yacy.search.index.SolrConfiguration ;
2011-09-25 18:59:06 +02:00
import net.yacy.search.ranking.RankingProfile ;
2005-10-10 02:33:25 +02:00
2009-07-09 00:14:57 +02:00
public final class QueryParams {
2011-06-13 23:44:03 +02:00
2011-11-17 02:05:45 +01:00
/**
 * The search domain of a query. {@link #toString()} maps each domain to its
 * textual label; CLUSTER intentionally shares the label of GLOBAL.
 */
public enum Searchdom {
    LOCAL, CLUSTER, GLOBAL;

    @Override
    public String toString() {
        switch (this) {
            case CLUSTER: // a cluster search is a global search, hence the same label
            case GLOBAL:
                return " global ";
            case LOCAL:
            default:
                return " local ";
        }
    }
}
2010-10-09 10:55:57 +02:00
// separator constant; no use of it is visible in this part of the file
private static final String ampersand = " & " ;
2011-06-13 23:44:03 +02:00
2011-11-26 14:40:33 +01:00
public static class Modifier {
2012-11-01 10:22:22 +01:00
private String s ;
private Modifier ( final String modifier ) {
2011-11-26 14:40:33 +01:00
this . s = modifier ;
}
public String getModifier ( ) {
return this . s ;
}
}
2009-01-30 16:33:00 +01:00
public static final Bitfield empty_constraint = new Bitfield ( 4 , " AAAAAA " ) ;
2010-03-23 14:41:41 +01:00
public static final Pattern catchall_pattern = Pattern . compile ( " .* " ) ;
2012-11-01 10:22:22 +01:00
private static final Pattern matchnothing_pattern = Pattern . compile ( " " ) ;
2011-06-13 23:44:03 +02:00
2010-03-23 14:41:41 +01:00
public final String queryString ;
2012-08-16 23:05:37 +02:00
public final HandleSet query_include_hashes , query_exclude_hashes , query_all_hashes ;
2012-11-01 10:22:22 +01:00
private final Collection < String > query_include_words , query_exclude_words , query_all_words ;
2010-03-23 14:41:41 +01:00
public final int itemsPerPage ;
public int offset ;
public final Pattern urlMask , prefer ;
2012-11-01 10:22:22 +01:00
final boolean urlMask_isCatchall ;
private final boolean prefer_isMatchnothing ;
2012-04-22 00:04:36 +02:00
public final Classification . ContentDomain contentdom ;
2010-03-23 14:41:41 +01:00
public final String targetlang ;
2012-11-01 10:22:22 +01:00
protected final Collection < Tagging . Metatag > metatags ;
2010-03-23 14:41:41 +01:00
public final String navigators ;
2011-11-17 02:05:45 +01:00
public final Searchdom domType ;
2012-11-01 10:22:22 +01:00
private final int zonecode ;
private final int domMaxTargets ;
2010-03-23 14:41:41 +01:00
public final int maxDistance ;
public final Bitfield constraint ;
2012-11-01 10:22:22 +01:00
final boolean allofconstraint ;
protected CacheStrategy snippetCacheStrategy ;
2010-03-23 14:41:41 +01:00
public final RankingProfile ranking ;
2010-01-11 00:09:48 +01:00
private final Segment indexSegment ;
2010-03-23 14:41:41 +01:00
public final String host ; // this is the client host that starts the query, not a site operator
public final String sitehash ; // this is a domain hash, 6 bytes long or null
2012-11-01 10:22:22 +01:00
protected final Set < String > siteexcludes ; // set of domain hashes that are excluded if not included by sitehash
2010-03-23 14:41:41 +01:00
public final String authorhash ;
2011-11-26 14:40:33 +01:00
public final Modifier modifier ;
2011-10-04 11:06:24 +02:00
public Seed remotepeer ;
2012-11-01 10:22:22 +01:00
public final long starttime ; // the time when the query started, how long it should take and the time when the timeout is reached (milliseconds)
protected final long maxtime ;
protected final long timeout ;
2008-02-18 00:35:48 +01:00
// values that are set after a search:
public int resultcount ; // number of found results
2011-01-11 23:58:14 +01:00
public int transmitcount ; // number of results that had been shown to the user
2008-02-18 00:35:48 +01:00
public long searchtime , urlretrievaltime , snippetcomputationtime ; // time to perform the search, to get all the urls, and to compute the snippets
2012-11-01 10:22:22 +01:00
private boolean specialRights ; // is true if the user has a special authorization and my use more database-extensive options
2010-10-18 10:09:59 +02:00
public final String userAgent ;
2012-11-01 17:16:43 +01:00
public boolean filterfailurls ;
2012-11-01 10:22:22 +01:00
protected double lat , lon , radius ;
2011-06-13 23:44:03 +02:00
2011-04-12 07:02:36 +02:00
/**
 * Creates a local query in which everything except the given values is set to a default.
 * <p>
 * If {@code queryString} is exactly 12 characters long and well-formed according to
 * {@code Base64Order.enhancedCoder}, it is taken as one word hash: that hash is put into
 * the include and all hash sets, and the textual query (string and word collections) is
 * set to null. Otherwise the string is split with {@link #cleanQuery(String)} and the
 * word collections and hash sets are derived from the result.
 *
 * @param queryString  the query text or a single word hash; must not be null
 * @param itemsPerPage number of results per page, stored unchanged
 * @param constraint   the constraint bitfield, stored unchanged
 * @param indexSegment the segment returned by {@link #getSegment()}
 * @param ranking      the ranking profile for this query
 * @param userAgent    the user agent string stored with this query
 */
public QueryParams(
        final String queryString,
        final int itemsPerPage,
        final Bitfield constraint,
        final Segment indexSegment,
        final RankingProfile ranking,
        final String userAgent) {
    byte[] queryHash;
    if ((queryString.length() == 12) && (Base64Order.enhancedCoder.wellformed(queryHash = UTF8.getBytes(queryString)))) {
        // the query is a word hash: there is no textual representation
        this.queryString = null;
        this.query_include_words = null;
        this.query_exclude_words = null;
        this.query_all_words = null;
        this.query_include_hashes = new RowHandleSet(WordReferenceRow.urlEntryRow.primaryKeyLength, WordReferenceRow.urlEntryRow.objectOrder, 0);
        this.query_exclude_hashes = new RowHandleSet(WordReferenceRow.urlEntryRow.primaryKeyLength, WordReferenceRow.urlEntryRow.objectOrder, 0);
        this.query_all_hashes = new RowHandleSet(WordReferenceRow.urlEntryRow.primaryKeyLength, WordReferenceRow.urlEntryRow.objectOrder, 0);
        try {
            this.query_include_hashes.put(queryHash);
            this.query_all_hashes.put(queryHash);
        } catch (final SpaceExceededException e) {
            Log.logException(e);
        }
    } else {
        this.queryString = queryString;
        final Collection<String>[] cq = cleanQuery(queryString);
        this.query_include_words = cq[0];
        this.query_exclude_words = cq[1];
        this.query_all_words = cq[2];
        this.query_include_hashes = Word.words2hashesHandles(cq[0]);
        this.query_exclude_hashes = Word.words2hashesHandles(cq[1]);
        this.query_all_hashes = Word.words2hashesHandles(cq[2]);
    }
    this.ranking = ranking;
    this.modifier = new Modifier(" ");
    this.maxDistance = Integer.MAX_VALUE;
    this.urlMask = catchall_pattern;
    this.urlMask_isCatchall = true;
    this.prefer = matchnothing_pattern;
    this.prefer_isMatchnothing = true;
    this.contentdom = ContentDomain.ALL;
    this.itemsPerPage = itemsPerPage;
    this.offset = 0;
    this.targetlang = " en ";
    this.metatags = new ArrayList<Tagging.Metatag>(0);
    this.domType = Searchdom.LOCAL;
    this.zonecode = DigestURI.TLD_any_zone_filter;
    this.domMaxTargets = 0;
    this.constraint = constraint;
    this.allofconstraint = false;
    this.snippetCacheStrategy = null;
    this.host = null;
    this.sitehash = null;
    this.siteexcludes = null;
    this.authorhash = null;
    this.remotepeer = null;
    this.starttime = System.currentTimeMillis();
    this.maxtime = 10000;
    // the deadline is start + allowed duration; the former code added the still
    // unassigned timeout field (0) here, which made the deadline equal to the start time
    this.timeout = this.starttime + this.maxtime;
    this.specialRights = false;
    this.navigators = " all ";
    this.indexSegment = indexSegment;
    this.userAgent = userAgent;
    this.transmitcount = 0;
    this.filterfailurls = false;
    this.lat = 0.0d;
    this.lon = 0.0d;
    this.radius = 0.0d;
}
2011-06-13 23:44:03 +02:00
2009-07-09 00:14:57 +02:00
public QueryParams (
2012-07-09 11:14:50 +02:00
final String queryString ,
2012-08-16 23:05:37 +02:00
final Collection < String > queryWords ,
final Collection < String > excludeWords ,
final Collection < String > fullqueryWords ,
2012-07-09 11:14:50 +02:00
final HandleSet queryHashes ,
2010-07-18 11:10:46 +02:00
final HandleSet excludeHashes ,
2010-04-15 15:22:59 +02:00
final HandleSet fullqueryHashes ,
2011-11-26 14:40:33 +01:00
final String modifier ,
2009-11-19 00:56:05 +01:00
final int maxDistance , final String prefer , final ContentDomain contentdom ,
2008-09-21 02:04:42 +02:00
final String language ,
2012-06-11 23:49:30 +02:00
final Collection < Tagging . Metatag > metatags ,
2009-06-07 23:48:01 +02:00
final String navigators ,
2011-06-13 23:44:03 +02:00
final CacheStrategy snippetCacheStrategy ,
2009-08-30 12:28:23 +02:00
final int itemsPerPage , final int offset , final String urlMask ,
2011-11-17 02:05:45 +01:00
final Searchdom domType , final int domMaxTargets ,
2009-01-30 16:33:00 +01:00
final Bitfield constraint , final boolean allofconstraint ,
2009-04-02 15:26:47 +02:00
final String site ,
2011-12-13 00:16:05 +01:00
final Set < String > siteexcludes ,
2009-06-09 01:30:12 +02:00
final String authorhash ,
2008-08-02 14:12:04 +02:00
final int domainzone ,
final String host ,
2009-11-24 12:13:11 +01:00
final boolean specialRights ,
final Segment indexSegment ,
2010-10-18 10:09:59 +02:00
final RankingProfile ranking ,
2011-01-22 10:46:00 +01:00
final String userAgent ,
2012-05-31 22:39:53 +02:00
final boolean filterfailurls ,
final double lat , final double lon , final double radius ) {
2010-07-18 11:10:46 +02:00
this . queryString = queryString ;
2012-08-16 23:05:37 +02:00
this . query_include_words = queryWords ;
this . query_exclude_words = excludeWords ;
this . query_all_words = fullqueryWords ;
2012-07-09 11:14:50 +02:00
this . query_include_hashes = queryHashes ;
this . query_exclude_hashes = excludeHashes ;
this . query_all_hashes = fullqueryHashes ;
2011-11-26 14:40:33 +01:00
this . modifier = new Modifier ( modifier = = null ? " " : modifier ) ;
2010-07-18 11:10:46 +02:00
this . ranking = ranking ;
this . maxDistance = maxDistance ;
this . contentdom = contentdom ;
2011-08-11 23:37:35 +02:00
this . itemsPerPage = Math . min ( ( specialRights ) ? 10000 : 1000 , itemsPerPage ) ;
this . offset = Math . max ( 0 , Math . min ( ( specialRights ) ? 10000 - this . itemsPerPage : 1000 - this . itemsPerPage , offset ) ) ;
2011-03-23 01:48:19 +01:00
try {
this . urlMask = Pattern . compile ( urlMask . toLowerCase ( ) ) ;
} catch ( final PatternSyntaxException ex ) {
throw new IllegalArgumentException ( " Not a valid regular expression: " + urlMask , ex ) ;
}
2010-03-23 14:41:41 +01:00
this . urlMask_isCatchall = this . urlMask . toString ( ) . equals ( catchall_pattern . toString ( ) ) ;
2011-03-23 01:48:19 +01:00
try {
this . prefer = Pattern . compile ( prefer ) ;
} catch ( final PatternSyntaxException ex ) {
throw new IllegalArgumentException ( " Not a valid regular expression: " + prefer , ex ) ;
}
2010-07-18 11:10:46 +02:00
this . prefer_isMatchnothing = this . prefer . toString ( ) . equals ( matchnothing_pattern . toString ( ) ) ;
assert language ! = null ;
2008-09-21 02:04:42 +02:00
this . targetlang = language ;
2012-01-17 01:53:08 +01:00
this . metatags = metatags ;
2009-06-07 23:48:01 +02:00
this . navigators = navigators ;
2008-09-21 02:04:42 +02:00
this . domType = domType ;
2008-03-11 12:09:38 +01:00
this . zonecode = domainzone ;
2010-07-18 11:10:46 +02:00
this . domMaxTargets = domMaxTargets ;
this . constraint = constraint ;
this . allofconstraint = allofconstraint ;
this . sitehash = site ; assert site = = null | | site . length ( ) = = 6 ;
2012-07-10 22:59:03 +02:00
this . siteexcludes = siteexcludes ! = null & & siteexcludes . isEmpty ( ) ? null : siteexcludes ;
2011-03-23 01:48:19 +01:00
this . authorhash = authorhash ; assert authorhash = = null | | ! authorhash . isEmpty ( ) ;
2010-07-18 11:10:46 +02:00
this . snippetCacheStrategy = snippetCacheStrategy ;
this . host = host ;
2008-02-18 00:35:48 +01:00
this . remotepeer = null ;
2012-06-04 15:37:39 +02:00
this . starttime = Long . valueOf ( System . currentTimeMillis ( ) ) ;
this . maxtime = 10000 ;
this . timeout = this . starttime + this . timeout ;
2010-07-18 11:10:46 +02:00
this . specialRights = specialRights ;
2009-11-24 12:13:11 +01:00
this . indexSegment = indexSegment ;
2010-10-18 10:09:59 +02:00
this . userAgent = userAgent ;
2011-01-11 23:58:14 +01:00
this . transmitcount = 0 ;
2011-01-22 10:46:00 +01:00
this . filterfailurls = filterfailurls ;
2012-05-31 22:39:53 +02:00
// we normalize here the location and radius because that should cause a better caching
// and as surplus it will increase privacy
this . lat = Math . floor ( lat * this . kmNormal ) / this . kmNormal ;
this . lon = Math . floor ( lon * this . kmNormal ) / this . kmNormal ;
this . radius = Math . floor ( radius * this . kmNormal + 1 ) / this . kmNormal ;
2009-11-24 12:13:11 +01:00
}
2011-06-13 23:44:03 +02:00
2012-11-01 10:22:22 +01:00
// normalization factor for geo coordinates: 100 is approximately 40000.d / 360.d == 111.11;
// if lat/lon is multiplied with this, rounded and divided by this, the location is normalized to a 1km grid
private static final double kmNormal = 100.d;
2012-05-31 22:39:53 +02:00
2009-11-24 12:13:11 +01:00
public Segment getSegment ( ) {
return this . indexSegment ;
2005-10-10 02:33:25 +02:00
}
2011-06-13 23:44:03 +02:00
2007-09-04 01:43:55 +02:00
public int neededResults ( ) {
// the number of result lines that must be computed
2009-08-30 12:28:23 +02:00
return this . offset + this . itemsPerPage ;
2007-09-04 01:43:55 +02:00
}
2011-06-13 23:44:03 +02:00
2012-05-21 01:58:29 +02:00
public int itemsPerPage ( ) {
2007-09-08 13:50:19 +02:00
// the number of result lines that are displayed at once (size of result page)
2009-08-30 12:28:23 +02:00
return this . itemsPerPage ;
2007-09-04 01:43:55 +02:00
}
2011-06-13 23:44:03 +02:00
2008-08-02 14:12:04 +02:00
public void setOffset ( final int newOffset ) {
2007-09-04 01:43:55 +02:00
this . offset = newOffset ;
2005-10-10 02:33:25 +02:00
}
2011-06-13 23:44:03 +02:00
2008-04-24 10:42:08 +02:00
public boolean isLocal ( ) {
2011-11-17 02:05:45 +01:00
return this . domType = = Searchdom . LOCAL ;
2007-09-04 01:43:55 +02:00
}
2011-06-13 23:44:03 +02:00
2010-04-15 15:22:59 +02:00
public static HandleSet hashes2Set ( final String query ) {
2012-07-27 12:13:53 +02:00
final HandleSet keyhashes = new RowHandleSet ( WordReferenceRow . urlEntryRow . primaryKeyLength , WordReferenceRow . urlEntryRow . objectOrder , 0 ) ;
2010-07-18 11:10:46 +02:00
if ( query ! = null ) {
for ( int i = 0 ; i < ( query . length ( ) / Word . commonHashLength ) ; i + + ) try {
2011-05-27 10:24:54 +02:00
keyhashes . put ( ASCII . getBytes ( query . substring ( i * Word . commonHashLength , ( i + 1 ) * Word . commonHashLength ) ) ) ;
2012-07-27 12:13:53 +02:00
} catch ( final SpaceExceededException e ) {
2010-07-18 11:10:46 +02:00
Log . logException ( e ) ;
}
2009-04-16 17:29:00 +02:00
}
return keyhashes ;
}
2011-06-13 23:44:03 +02:00
2010-04-15 15:22:59 +02:00
public static HandleSet hashes2Handles ( final String query ) {
2012-07-27 12:13:53 +02:00
final HandleSet keyhashes = new RowHandleSet ( WordReferenceRow . urlEntryRow . primaryKeyLength , WordReferenceRow . urlEntryRow . objectOrder , 0 ) ;
2010-07-18 11:10:46 +02:00
if ( query ! = null ) {
for ( int i = 0 ; i < ( query . length ( ) / Word . commonHashLength ) ; i + + ) try {
2011-05-27 10:24:54 +02:00
keyhashes . put ( ASCII . getBytes ( query . substring ( i * Word . commonHashLength , ( i + 1 ) * Word . commonHashLength ) ) ) ;
2012-07-27 12:13:53 +02:00
} catch ( final SpaceExceededException e ) {
2010-07-18 11:10:46 +02:00
Log . logException ( e ) ;
}
2010-04-15 15:22:59 +02:00
}
return keyhashes ;
}
2011-06-13 23:44:03 +02:00
2010-04-15 15:22:59 +02:00
public static String hashSet2hashString ( final HandleSet hashes ) {
final byte [ ] bb = new byte [ hashes . size ( ) * Word . commonHashLength ] ;
int p = 0 ;
2010-07-18 11:10:46 +02:00
for ( final byte [ ] b : hashes ) {
2011-05-27 10:24:54 +02:00
assert b . length = = Word . commonHashLength : " hash = " + ASCII . String ( b ) ;
2010-04-15 15:22:59 +02:00
System . arraycopy ( b , 0 , bb , p , Word . commonHashLength ) ;
p + = Word . commonHashLength ;
2009-06-02 00:45:28 +02:00
}
2011-05-27 10:24:54 +02:00
return ASCII . String ( bb ) ;
2006-09-13 19:13:28 +02:00
}
2007-12-12 19:57:43 +01:00
2012-08-19 13:17:03 +02:00
public static String hashSet2hashString ( final Set < String > hashes ) {
final byte [ ] bb = new byte [ hashes . size ( ) * Word . commonHashLength ] ;
int p = 0 ;
for ( final String s : hashes ) {
assert s . length ( ) = = Word . commonHashLength : " hash = " + s ;
System . arraycopy ( ASCII . getBytes ( s ) , 0 , bb , p , Word . commonHashLength ) ;
p + = Word . commonHashLength ;
}
return ASCII . String ( bb ) ;
}
2010-04-15 15:22:59 +02:00
public static String anonymizedQueryHashes ( final HandleSet hashes ) {
2008-08-28 23:15:59 +02:00
// create a more anonymized representation of a query hashes for logging
2009-04-16 17:29:00 +02:00
final Iterator < byte [ ] > i = hashes . iterator ( ) ;
2009-10-11 02:12:19 +02:00
final StringBuilder sb = new StringBuilder ( hashes . size ( ) * ( Word . commonHashLength + 2 ) + 2 ) ;
2007-12-12 19:57:43 +01:00
sb . append ( " [ " ) ;
2009-04-16 17:29:00 +02:00
byte [ ] hash ;
2007-12-12 19:57:43 +01:00
if ( i . hasNext ( ) ) {
2008-01-08 21:12:31 +01:00
hash = i . next ( ) ;
2011-05-27 10:24:54 +02:00
sb . append ( ASCII . String ( hash ) . substring ( 0 , 3 ) ) . append ( " ......... " ) ;
2007-12-12 19:57:43 +01:00
}
while ( i . hasNext ( ) ) {
2008-01-08 21:12:31 +01:00
hash = i . next ( ) ;
2011-05-27 10:24:54 +02:00
sb . append ( " , " ) . append ( ASCII . String ( hash ) . substring ( 0 , 3 ) ) . append ( " ......... " ) ;
2007-12-12 19:57:43 +01:00
}
sb . append ( " ] " ) ;
2011-03-07 21:36:40 +01:00
return sb . toString ( ) ;
2007-12-12 19:57:43 +01:00
}
2011-06-13 23:44:03 +02:00
2010-06-22 14:28:53 +02:00
/ * *
* check if the given text matches with the query
* this checks inclusion and exclusion words
* @param text
* @return true if the query matches with the given text
* /
2012-11-01 10:22:22 +01:00
private final boolean matchesText ( final String text ) {
2010-07-18 11:10:46 +02:00
boolean ret = false ;
2010-10-18 13:35:09 +02:00
final HandleSet wordhashes = Word . words2hashesHandles ( Condenser . getWords ( text , null ) . keySet ( ) ) ;
2012-07-09 11:14:50 +02:00
if ( ! SetTools . anymatch ( wordhashes , this . query_exclude_hashes ) ) {
ret = SetTools . totalInclusion ( this . query_include_hashes , wordhashes ) ;
2010-07-18 11:10:46 +02:00
}
return ret ;
2010-06-22 14:28:53 +02:00
}
2011-06-13 23:44:03 +02:00
2012-11-01 10:22:22 +01:00
protected static final boolean anymatch ( final String text , final HandleSet keyhashes ) {
2007-04-05 12:14:48 +02:00
// returns true if any of the word hashes in keyhashes appear in the String text
// to do this, all words in the string must be recognized and transcoded to word hashes
2012-06-04 15:37:39 +02:00
if ( keyhashes = = null | | keyhashes . isEmpty ( ) ) return false ;
2010-10-18 13:35:09 +02:00
final HandleSet wordhashes = Word . words2hashesHandles ( Condenser . getWords ( text , null ) . keySet ( ) ) ;
2009-01-30 16:33:00 +01:00
return SetTools . anymatch ( wordhashes , keyhashes ) ;
2007-04-05 12:14:48 +02:00
}
2011-06-13 23:44:03 +02:00
2008-09-28 22:01:10 +02:00
private static String seps = " '.,/&_ " ; static { seps + = '"' ; }
2011-06-13 23:44:03 +02:00
2012-07-10 12:01:20 +02:00
@SuppressWarnings ( " unchecked " )
public static Collection < String > [ ] cleanQuery ( String querystring ) {
2012-08-16 23:05:37 +02:00
// returns three sets: a query set, an exclude set and a full query set
2012-07-09 11:14:50 +02:00
final Collection < String > query_include_words = new ArrayList < String > ( ) ;
final Collection < String > query_exclude_words = new ArrayList < String > ( ) ;
final Collection < String > query_all_words = new ArrayList < String > ( ) ;
2011-06-13 23:44:03 +02:00
2011-03-23 01:48:19 +01:00
if ( ( querystring ! = null ) & & ( ! querystring . isEmpty ( ) ) ) {
2011-06-13 23:44:03 +02:00
2010-07-18 11:10:46 +02:00
// convert Umlaute
2010-10-09 01:50:28 +02:00
querystring = AbstractScraper . stripAll ( querystring . toCharArray ( ) ) . toLowerCase ( ) . trim ( ) ;
2010-07-18 11:10:46 +02:00
int c ;
for ( int i = 0 ; i < seps . length ( ) ; i + + ) {
while ( ( c = querystring . indexOf ( seps . charAt ( i ) ) ) > = 0 ) {
querystring = querystring . substring ( 0 , c ) + ( ( ( c + 1 ) < querystring . length ( ) ) ? ( " " + querystring . substring ( c + 1 ) ) : " " ) ;
}
}
String s ;
int l ;
// the string is clean now, but we must generate a set out of it
2010-11-28 03:57:31 +01:00
final String [ ] queries = querystring . split ( " " ) ;
2011-03-23 01:48:19 +01:00
for ( String quer : queries ) {
if ( quer . startsWith ( " - " ) ) {
2012-02-01 18:13:31 +01:00
String x = quer . substring ( 1 ) ;
2012-07-09 11:14:50 +02:00
if ( ! query_exclude_words . contains ( x ) ) query_exclude_words . add ( x ) ;
2010-07-18 11:10:46 +02:00
} else {
2011-03-23 01:48:19 +01:00
while ( ( c = quer . indexOf ( '-' ) ) > = 0 ) {
s = quer . substring ( 0 , c ) ;
2010-07-18 11:10:46 +02:00
l = s . length ( ) ;
2012-07-09 11:14:50 +02:00
if ( l > = Condenser . wordminsize & & ! query_include_words . contains ( s ) ) { query_include_words . add ( s ) ; }
if ( l > 0 & & ! query_all_words . contains ( s ) ) { query_all_words . add ( s ) ; }
2011-03-23 01:48:19 +01:00
quer = quer . substring ( c + 1 ) ;
2010-07-18 11:10:46 +02:00
}
2011-03-23 01:48:19 +01:00
l = quer . length ( ) ;
2012-07-09 11:14:50 +02:00
if ( l > = Condenser . wordminsize & & ! query_include_words . contains ( quer ) ) { query_include_words . add ( quer ) ; }
if ( l > 0 & & ! query_all_words . contains ( quer ) ) { query_all_words . add ( quer ) ; }
2010-07-18 11:10:46 +02:00
}
}
2007-04-03 17:35:29 +02:00
}
2012-07-09 11:14:50 +02:00
return new Collection [ ] { query_include_words , query_exclude_words , query_all_words } ;
2005-10-10 02:33:25 +02:00
}
2011-06-13 23:44:03 +02:00
2008-08-02 14:12:04 +02:00
public String queryString ( final boolean encodeHTML ) {
2010-07-18 11:10:46 +02:00
final String ret ;
2012-08-20 12:50:51 +02:00
if ( encodeHTML ) {
2011-03-23 01:48:19 +01:00
ret = CharacterCoding . unicode2html ( this . queryString , true ) ;
2012-08-20 12:50:51 +02:00
} else {
2010-07-18 11:10:46 +02:00
ret = this . queryString ;
}
2012-08-20 12:50:51 +02:00
return ret ;
}
2012-11-01 10:22:22 +01:00
// the index fields a text query is matched against, in the order in which
// solrQueryString emits them
private final static YaCySchema[] fields = {
    YaCySchema.sku,
    YaCySchema.title,
    YaCySchema.h1_txt,
    YaCySchema.h2_txt,
    YaCySchema.author,
    YaCySchema.description,
    YaCySchema.keywords,
    YaCySchema.text_t,
    YaCySchema.synonyms_sxt
};

// boost factor per field; a field without an entry (synonyms_sxt) gets no boost suffix
private final static Map<YaCySchema, Float> boosts = new LinkedHashMap<YaCySchema, Float>();
static {
    boosts.put(YaCySchema.sku,         20.0f);
    boosts.put(YaCySchema.title,       15.0f);
    boosts.put(YaCySchema.h1_txt,      11.0f);
    boosts.put(YaCySchema.h2_txt,      10.0f);
    boosts.put(YaCySchema.author,       8.0f);
    boosts.put(YaCySchema.description,  5.0f);
    boosts.put(YaCySchema.keywords,     2.0f);
    boosts.put(YaCySchema.text_t,       1.0f);
}
2012-10-07 07:46:55 +02:00
public SolrQuery solrQuery ( ) {
2012-08-27 14:41:33 +02:00
if ( this . query_include_words = = null | | this . query_include_words . size ( ) = = 0 ) return null ;
2012-10-02 14:29:45 +02:00
// get text query
final StringBuilder q = solrQueryString ( this . query_include_words , this . query_exclude_words , this . indexSegment . fulltext ( ) . getSolrScheme ( ) ) ;
2012-08-27 15:25:25 +02:00
2012-08-27 14:41:33 +02:00
// add constraints
if ( this . sitehash = = null ) {
if ( this . siteexcludes ! = null ) {
for ( String ex : this . siteexcludes ) {
2012-09-25 23:59:30 +02:00
q . append ( " - " ) . append ( YaCySchema . host_id_s . name ( ) ) . append ( ':' ) . append ( ex ) ;
2012-08-27 14:41:33 +02:00
}
}
} else {
2012-09-25 23:59:30 +02:00
q . append ( ' ' ) . append ( YaCySchema . host_id_s . name ( ) ) . append ( ':' ) . append ( this . sitehash ) ;
2012-08-27 14:41:33 +02:00
}
2012-09-25 17:52:33 +02:00
String urlMaskPattern = this . urlMask . pattern ( ) ;
int extm = urlMaskPattern . indexOf ( " .* \\ . " ) ;
if ( extm > = 0 ) {
String ext = urlMaskPattern . substring ( extm + 4 ) ;
2012-09-25 23:59:30 +02:00
q . append ( " AND " ) . append ( YaCySchema . url_file_ext_s . name ( ) ) . append ( ':' ) . append ( ext ) ;
2012-09-25 17:52:33 +02:00
}
2012-08-27 14:41:33 +02:00
2012-10-07 07:46:55 +02:00
// construct query
final SolrQuery params = new SolrQuery ( ) ;
params . setQuery ( q . toString ( ) ) ;
params . setStart ( this . offset ) ;
params . setRows ( this . resultcount ) ;
params . setFacet ( false ) ;
2012-08-27 14:41:33 +02:00
if ( this . radius > 0 . 0d & & this . lat ! = 0 . 0d & & this . lon ! = 0 . 0d ) {
2012-08-27 15:25:25 +02:00
// localtion search, no special ranking
2012-10-07 07:46:55 +02:00
// try http://localhost:8090/solr/select?q=*:*&fq={!bbox sfield=coordinate_p pt=50.17,8.65 d=1}
//params.setQuery("!bbox " + q.toString());
//params.set("sfield", YaCySchema.coordinate_p.name());
//params.set("pt", Double.toString(this.lat) + "," + Double.toString(this.lon));
//params.set("d", GeoLocation.degreeToKm(this.radius));
params . setFilterQueries ( " {!bbox sfield= " + YaCySchema . coordinate_p . name ( ) + " pt= " + Double . toString ( this . lat ) + " , " + Double . toString ( this . lon ) + " d= " + GeoLocation . degreeToKm ( this . radius ) + " } " ) ;
//params.setRows(Integer.MAX_VALUE);
2012-08-27 15:25:25 +02:00
} else {
2012-09-26 16:56:33 +02:00
// set ranking
if ( this . ranking . coeff_date = = RankingProfile . COEFF_MAX ) {
// set a most-recent ordering
2012-10-07 07:46:55 +02:00
params . setSortField ( YaCySchema . last_modified . name ( ) , ORDER . desc ) ;
2012-09-25 23:59:30 +02:00
}
2012-08-27 14:41:33 +02:00
}
2012-09-26 16:56:33 +02:00
2012-08-27 14:41:33 +02:00
// prepare result
2012-10-07 07:46:55 +02:00
Log . logInfo ( " Protocol " , " SOLR QUERY: " + params . toString ( ) ) ;
return params ;
2005-12-06 17:15:21 +01:00
}
2012-10-07 07:46:55 +02:00
2012-10-02 14:29:45 +02:00
public static StringBuilder solrQueryString ( Collection < String > include , Collection < String > exclude , SolrConfiguration configuration ) {
final StringBuilder q = new StringBuilder ( 80 ) ;
// add text query
int wc = 0 ;
StringBuilder w = new StringBuilder ( 80 ) ;
for ( String s : include ) {
if ( wc > 0 ) w . append ( " AND " ) ;
w . append ( s ) ;
wc + + ;
}
for ( String s : exclude ) {
if ( wc > 0 ) w . append ( " AND - " ) ;
w . append ( s ) ;
wc + + ;
}
2012-11-01 10:22:22 +01:00
if ( wc > 1 ) { w . insert ( 0 , '(' ) ; w . append ( ')' ) ; }
2012-10-02 14:29:45 +02:00
// combine these queries for all relevant fields
wc = 0 ;
2012-10-07 07:46:55 +02:00
Float boost ;
2012-10-02 14:29:45 +02:00
for ( YaCySchema field : fields ) {
if ( configuration ! = null & & ! configuration . contains ( field . name ( ) ) ) continue ;
if ( wc > 0 ) q . append ( " OR " ) ;
2012-10-07 07:46:55 +02:00
q . append ( '(' ) ;
q . append ( field . name ( ) ) . append ( ':' ) . append ( w ) ;
boost = boosts . get ( field ) ;
if ( boost ! = null ) q . append ( '^' ) . append ( boost . toString ( ) ) ;
q . append ( ')' ) ;
2012-10-02 14:29:45 +02:00
wc + + ;
}
q . insert ( 0 , '(' ) ;
q . append ( ')' ) ;
// add filter to prevent that results come from failed urls
q . append ( " AND - " ) . append ( YaCySchema . failreason_t . name ( ) ) . append ( " :[* TO *] " ) ;
return q ;
}
2010-08-13 17:59:52 +02:00
public String queryStringForUrl ( ) {
try {
2010-11-28 03:57:31 +01:00
return URLEncoder . encode ( this . queryString , " UTF-8 " ) ;
2011-06-13 23:44:03 +02:00
} catch ( final UnsupportedEncodingException e ) {
2011-03-23 01:48:19 +01:00
Log . logException ( e ) ;
2010-11-28 03:57:31 +01:00
return this . queryString ;
}
2010-08-13 17:59:52 +02:00
}
2011-06-13 23:44:03 +02:00
2012-02-01 18:13:31 +01:00
public Collection < String > [ ] queryWords ( ) {
2007-08-28 14:15:46 +02:00
return cleanQuery ( this . queryString ) ;
}
2011-06-13 23:44:03 +02:00
2010-11-28 03:57:31 +01:00
public void filterOut ( final SortedSet < String > blueList ) {
2005-10-12 14:28:49 +02:00
// filter out words that appear in this set
2007-04-05 12:14:48 +02:00
// this is applied to the queryHashes
2010-04-15 15:22:59 +02:00
final HandleSet blues = Word . words2hashesHandles ( blueList ) ;
2012-07-09 11:14:50 +02:00
for ( final byte [ ] b : blues ) this . query_include_hashes . remove ( b ) ;
2005-10-12 14:28:49 +02:00
}
2007-01-15 17:03:00 +01:00
2010-06-23 13:19:32 +02:00
2010-07-18 11:10:46 +02:00
public final Map < MultiProtocolURI , String > separateMatches ( final Map < MultiProtocolURI , String > links ) {
final Map < MultiProtocolURI , String > matcher = new HashMap < MultiProtocolURI , String > ( ) ;
final Iterator < Map . Entry < MultiProtocolURI , String > > i = links . entrySet ( ) . iterator ( ) ;
2010-06-23 13:19:32 +02:00
Map . Entry < MultiProtocolURI , String > entry ;
MultiProtocolURI url ;
String anchorText ;
while ( i . hasNext ( ) ) {
entry = i . next ( ) ;
url = entry . getKey ( ) ;
anchorText = entry . getValue ( ) ;
2011-06-13 23:44:03 +02:00
if ( matchesText ( anchorText ) ) {
2010-06-23 13:19:32 +02:00
matcher . put ( url , anchorText ) ;
i . remove ( ) ;
}
}
return matcher ;
}
2011-06-13 23:44:03 +02:00
2012-06-05 12:06:26 +02:00
private volatile String idCacheAnon = null , idCache = null ;
2010-09-06 12:00:07 +02:00
final static private char asterisk = '*' ;
2008-08-02 14:12:04 +02:00
public String id ( final boolean anonymized ) {
2010-09-06 12:00:07 +02:00
if ( anonymized ) {
2011-06-13 23:44:03 +02:00
if ( this . idCacheAnon ! = null ) return this . idCacheAnon ;
2010-09-06 12:00:07 +02:00
} else {
2011-06-13 23:44:03 +02:00
if ( this . idCache ! = null ) return this . idCache ;
2010-09-06 12:00:07 +02:00
}
2012-06-05 12:06:26 +02:00
synchronized ( this ) {
// do a Double-Checked Locking
if ( anonymized ) {
if ( this . idCacheAnon ! = null ) return this . idCacheAnon ;
} else {
if ( this . idCache ! = null ) return this . idCache ;
}
// generate a string that identifies a search so results can be re-used in a cache
final StringBuilder context = new StringBuilder ( 180 ) ;
if ( anonymized ) {
2012-07-09 11:14:50 +02:00
context . append ( anonymizedQueryHashes ( this . query_include_hashes ) ) ;
2012-06-05 12:06:26 +02:00
context . append ( '-' ) ;
2012-07-09 11:14:50 +02:00
context . append ( anonymizedQueryHashes ( this . query_exclude_hashes ) ) ;
2012-06-05 12:06:26 +02:00
} else {
2012-07-09 11:14:50 +02:00
context . append ( hashSet2hashString ( this . query_include_hashes ) ) ;
2012-06-05 12:06:26 +02:00
context . append ( '-' ) ;
2012-07-09 11:14:50 +02:00
context . append ( hashSet2hashString ( this . query_exclude_hashes ) ) ;
2012-06-05 12:06:26 +02:00
}
//context.append(asterisk);
//context.append(this.domType);
context . append ( asterisk ) ;
context . append ( this . contentdom ) . append ( asterisk ) ;
context . append ( this . zonecode ) . append ( asterisk ) ;
context . append ( ASCII . String ( Word . word2hash ( this . ranking . toExternalString ( ) ) ) ) . append ( asterisk ) ;
context . append ( Base64Order . enhancedCoder . encodeString ( this . prefer . toString ( ) ) ) . append ( asterisk ) ;
context . append ( Base64Order . enhancedCoder . encodeString ( this . urlMask . toString ( ) ) ) . append ( asterisk ) ;
context . append ( this . sitehash ) . append ( asterisk ) ;
context . append ( this . siteexcludes ) . append ( asterisk ) ;
context . append ( this . authorhash ) . append ( asterisk ) ;
context . append ( this . targetlang ) . append ( asterisk ) ;
context . append ( this . constraint ) . append ( asterisk ) ;
context . append ( this . maxDistance ) . append ( asterisk ) ;
context . append ( this . modifier . s ) . append ( asterisk ) ;
context . append ( this . lat ) . append ( asterisk ) . append ( this . lon ) . append ( asterisk ) . append ( this . radius ) . append ( asterisk ) ;
context . append ( this . snippetCacheStrategy = = null ? " null " : this . snippetCacheStrategy . name ( ) ) ;
String result = context . toString ( ) ;
if ( anonymized ) {
this . idCacheAnon = result ;
} else {
this . idCache = result ;
}
return result ;
2010-07-18 11:10:46 +02:00
}
2007-08-25 01:12:59 +02:00
}
2011-06-13 23:44:03 +02:00
2009-06-02 00:45:28 +02:00
/ * *
* make a query anchor tag
* @param page
* @param theQuery
* @param originalUrlMask
* @param addToQuery
* @return
* /
2011-06-13 23:44:03 +02:00
public static StringBuilder navurl (
2010-12-08 11:50:23 +01:00
final String ext , final int page , final QueryParams theQuery ,
2011-06-13 23:44:03 +02:00
final String newQueryString , final String originalUrlMask , final String nav ) {
final StringBuilder sb = navurlBase ( ext , theQuery , newQueryString , originalUrlMask , nav ) ;
sb . append ( ampersand ) ;
sb . append ( " startRecord= " ) ;
2012-05-21 01:58:29 +02:00
sb . append ( page * theQuery . itemsPerPage ( ) ) ;
2011-06-13 23:44:03 +02:00
return sb ;
}
public static StringBuilder navurlBase (
final String ext , final QueryParams theQuery ,
final String newQueryString , final String originalUrlMask , final String nav ) {
2010-07-18 11:10:46 +02:00
2011-03-09 10:29:05 +01:00
final StringBuilder sb = new StringBuilder ( 120 ) ;
2010-07-18 11:10:46 +02:00
sb . append ( " /yacysearch. " ) ;
sb . append ( ext ) ;
2010-12-08 11:50:23 +01:00
sb . append ( " ?query= " ) ;
2010-10-09 10:55:57 +02:00
sb . append ( newQueryString = = null ? theQuery . queryStringForUrl ( ) : newQueryString ) ;
2010-07-18 11:10:46 +02:00
sb . append ( ampersand ) ;
sb . append ( " maximumRecords= " ) ;
2012-05-21 01:58:29 +02:00
sb . append ( theQuery . itemsPerPage ( ) ) ;
2010-07-18 11:10:46 +02:00
sb . append ( ampersand ) ;
sb . append ( " resource= " ) ;
sb . append ( ( theQuery . isLocal ( ) ) ? " local " : " global " ) ;
sb . append ( ampersand ) ;
sb . append ( " verify= " ) ;
2010-10-09 10:55:57 +02:00
sb . append ( theQuery . snippetCacheStrategy = = null ? " false " : theQuery . snippetCacheStrategy . toName ( ) ) ;
2010-07-18 11:10:46 +02:00
sb . append ( ampersand ) ;
sb . append ( " nav= " ) ;
sb . append ( nav ) ;
sb . append ( ampersand ) ;
sb . append ( " urlmaskfilter= " ) ;
sb . append ( originalUrlMask ) ;
sb . append ( ampersand ) ;
sb . append ( " prefermaskfilter= " ) ;
sb . append ( theQuery . prefer ) ;
sb . append ( ampersand ) ;
sb . append ( " cat=href " ) ;
sb . append ( ampersand ) ;
sb . append ( " constraint= " ) ;
sb . append ( ( theQuery . constraint = = null ) ? " " : theQuery . constraint . exportB64 ( ) ) ;
sb . append ( ampersand ) ;
sb . append ( " contentdom= " ) ;
2012-04-22 00:04:36 +02:00
sb . append ( theQuery . contentdom . toString ( ) ) ;
2010-07-18 11:10:46 +02:00
sb . append ( ampersand ) ;
sb . append ( " former= " ) ;
2010-08-13 17:59:52 +02:00
sb . append ( theQuery . queryStringForUrl ( ) ) ;
2010-07-18 11:10:46 +02:00
2011-06-13 23:44:03 +02:00
return sb ;
2009-06-02 00:45:28 +02:00
}
2011-06-13 23:44:03 +02:00
2005-10-10 02:33:25 +02:00
}