2011-06-13 23:44:03 +02:00
// QueryParams.java
2005-10-10 02:33:25 +02:00
// -----------------------
// part of YACY
2008-07-20 19:14:51 +02:00
// (C) by Michael Peter Christen; mc@yacy.net
2005-10-10 02:33:25 +02:00
// first published on http://www.anomic.de
// Frankfurt, Germany, 2005
// Created: 10.10.2005
//
2009-09-05 22:41:21 +02:00
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
2005-10-10 02:33:25 +02:00
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
2011-09-25 18:59:06 +02:00
package net.yacy.search.query ;
2005-10-10 02:33:25 +02:00
2012-01-17 01:53:08 +01:00
import java.util.ArrayList ;
import java.util.Collection ;
2010-06-23 13:19:32 +02:00
import java.util.HashMap ;
2006-09-30 00:27:20 +02:00
import java.util.Iterator ;
2012-12-18 02:29:03 +01:00
import java.util.List ;
2010-06-23 13:19:32 +02:00
import java.util.Map ;
2011-12-13 00:16:05 +01:00
import java.util.Set ;
2010-03-23 11:17:28 +01:00
import java.util.regex.Pattern ;
2011-03-23 01:48:19 +01:00
import java.util.regex.PatternSyntaxException ;
2005-10-10 02:33:25 +02:00
2012-10-07 07:46:55 +02:00
import org.apache.solr.client.solrj.SolrQuery ;
import org.apache.solr.client.solrj.SolrQuery.ORDER ;
2012-09-26 16:56:33 +02:00
2011-05-27 10:24:54 +02:00
import net.yacy.cora.document.ASCII ;
2012-11-21 18:46:49 +01:00
import net.yacy.cora.document.analysis.Classification ;
import net.yacy.cora.document.analysis.Classification.ContentDomain ;
2012-12-02 16:54:29 +01:00
import net.yacy.cora.federate.solr.Boost ;
2012-09-25 21:20:03 +02:00
import net.yacy.cora.federate.yacy.CacheStrategy ;
2012-09-20 19:38:22 +02:00
import net.yacy.cora.geo.GeoLocation ;
2012-06-11 23:49:30 +02:00
import net.yacy.cora.lod.vocabulary.Tagging ;
2012-09-21 16:46:57 +02:00
import net.yacy.cora.order.Base64Order ;
2012-07-27 12:13:53 +02:00
import net.yacy.cora.storage.HandleSet ;
import net.yacy.cora.util.SpaceExceededException ;
2009-10-18 02:53:43 +02:00
import net.yacy.document.Condenser ;
2012-12-18 02:29:03 +01:00
import net.yacy.document.LibraryProvider ;
2009-10-11 02:12:19 +02:00
import net.yacy.kelondro.data.meta.DigestURI ;
import net.yacy.kelondro.data.word.Word ;
2010-04-15 15:22:59 +02:00
import net.yacy.kelondro.data.word.WordReferenceRow ;
2012-07-27 12:13:53 +02:00
import net.yacy.kelondro.index.RowHandleSet ;
2010-04-15 15:22:59 +02:00
import net.yacy.kelondro.logging.Log ;
2012-09-21 16:46:57 +02:00
import net.yacy.kelondro.util.Bitfield ;
2009-10-10 03:14:19 +02:00
import net.yacy.kelondro.util.SetTools ;
2011-10-04 11:06:24 +02:00
import net.yacy.peers.Seed ;
2011-09-25 18:59:06 +02:00
import net.yacy.search.index.Segment ;
import net.yacy.search.ranking.RankingProfile ;
2013-02-21 13:23:55 +01:00
import net.yacy.search.schema.CollectionConfiguration ;
import net.yacy.search.schema.CollectionSchema ;
2005-10-10 02:33:25 +02:00
2009-07-09 00:14:57 +02:00
public final class QueryParams {
2011-06-13 23:44:03 +02:00
2011-11-17 02:05:45 +01:00
/**
 * The domain in which a search is performed.
 */
public enum Searchdom {
    LOCAL, CLUSTER, GLOBAL;

    /**
     * @return the textual name of this search domain; CLUSTER reports the same
     *         name as GLOBAL because a cluster search is a global search
     */
    @Override
    public String toString() {
        switch (this) {
            case CLUSTER: // yes, that is right: global, not cluster
            case GLOBAL:
                return " global ";
            case LOCAL:
            default:
                return " local ";
        }
    }
}
2010-10-09 10:55:57 +02:00
2013-02-21 13:23:55 +01:00
private static final CollectionSchema [ ] defaultfacetfields = new CollectionSchema [ ] {
CollectionSchema . host_s , CollectionSchema . url_protocol_s , CollectionSchema . url_file_ext_s , CollectionSchema . author_sxt } ;
2012-11-06 14:32:08 +01:00
private static final int defaultmaxfacets = 30 ;
2010-10-09 10:55:57 +02:00
private static final String ampersand = " & " ;
2009-01-30 16:33:00 +01:00
public static final Bitfield empty_constraint = new Bitfield ( 4 , " AAAAAA " ) ;
2010-03-23 14:41:41 +01:00
public static final Pattern catchall_pattern = Pattern . compile ( " .* " ) ;
2012-11-01 10:22:22 +01:00
private static final Pattern matchnothing_pattern = Pattern . compile ( " " ) ;
2011-06-13 23:44:03 +02:00
2012-12-18 02:29:03 +01:00
private final QueryGoal queryGoal ;
2012-11-05 03:19:28 +01:00
public int itemsPerPage ;
2010-03-23 14:41:41 +01:00
public int offset ;
2012-12-26 21:25:27 +01:00
public Pattern urlMask ;
public final Pattern prefer ;
2013-02-12 03:42:46 +01:00
public final String tld , inlink ;
2012-12-26 21:25:27 +01:00
boolean urlMask_isCatchall ;
2012-04-22 00:04:36 +02:00
public final Classification . ContentDomain contentdom ;
2010-03-23 14:41:41 +01:00
public final String targetlang ;
2012-11-01 10:22:22 +01:00
protected final Collection < Tagging . Metatag > metatags ;
2011-11-17 02:05:45 +01:00
public final Searchdom domType ;
2012-11-01 10:22:22 +01:00
private final int zonecode ;
2010-03-23 14:41:41 +01:00
public final int maxDistance ;
public final Bitfield constraint ;
2013-02-26 17:16:31 +01:00
public final boolean allofconstraint ;
2012-11-01 10:22:22 +01:00
protected CacheStrategy snippetCacheStrategy ;
2010-03-23 14:41:41 +01:00
public final RankingProfile ranking ;
2010-01-11 00:09:48 +01:00
private final Segment indexSegment ;
2012-11-06 14:32:08 +01:00
public final String clienthost ; // this is the client host that starts the query, not a site operator
2012-11-01 10:22:22 +01:00
protected final Set < String > siteexcludes ; // set of domain hashes that are excluded if not included by sitehash
2013-02-12 03:42:46 +01:00
public final QueryModifier modifier ;
2011-10-04 11:06:24 +02:00
public Seed remotepeer ;
2012-11-01 10:22:22 +01:00
public final long starttime ; // the time when the query started, how long it should take and the time when the timeout is reached (milliseconds)
protected final long maxtime ;
2012-12-18 02:29:03 +01:00
private final long timeout ;
2008-02-18 00:35:48 +01:00
// values that are set after a search:
2011-01-11 23:58:14 +01:00
public int transmitcount ; // number of results that had been shown to the user
2008-02-18 00:35:48 +01:00
public long searchtime , urlretrievaltime , snippetcomputationtime ; // time to perform the search, to get all the urls, and to compute the snippets
2010-10-18 10:09:59 +02:00
public final String userAgent ;
2012-12-29 17:47:34 +01:00
protected boolean filterfailurls , filterscannerfail ;
2012-11-01 10:22:22 +01:00
protected double lat , lon , radius ;
2012-12-18 02:29:03 +01:00
public List < String > facetfields ;
2012-11-06 14:32:08 +01:00
public int maxfacets ;
2013-02-02 07:21:18 +01:00
private SolrQuery cachedQuery ;
2013-02-21 13:23:55 +01:00
private CollectionConfiguration solrSchema ;
2011-06-13 23:44:03 +02:00
2011-04-12 07:02:36 +02:00
/**
 * Creates a query with default settings: local search domain, catch-all url mask,
 * content domain ALL, offset 0, no site restrictions, no location and a maximum
 * query time of 10000 milliseconds.
 *
 * @param query_original the original query string, handed to the QueryGoal
 * @param query_words the query words, handed to the QueryGoal
 * @param itemsPerPage the number of result lines per result page
 * @param constraint the constraint bitfield
 * @param indexSegment the index segment; its default solr configuration is used to select the facet fields
 * @param ranking the ranking profile
 * @param userAgent the user agent of the client
 */
public QueryParams(
        final String query_original,
        final String query_words,
        final int itemsPerPage,
        final Bitfield constraint,
        final Segment indexSegment,
        final RankingProfile ranking,
        final String userAgent) {
    this.queryGoal = new QueryGoal(query_original, query_words);
    this.ranking = ranking;
    this.modifier = new QueryModifier();
    this.maxDistance = Integer.MAX_VALUE;
    this.urlMask = catchall_pattern;
    this.urlMask_isCatchall = true;
    this.tld = null;
    this.inlink = null;
    this.prefer = matchnothing_pattern;
    this.contentdom = ContentDomain.ALL;
    this.itemsPerPage = itemsPerPage;
    this.offset = 0;
    this.targetlang = " en ";
    this.metatags = new ArrayList<Tagging.Metatag>(0);
    this.domType = Searchdom.LOCAL;
    this.zonecode = DigestURI.TLD_any_zone_filter;
    this.constraint = constraint;
    this.allofconstraint = false;
    this.snippetCacheStrategy = null;
    this.clienthost = null;
    this.siteexcludes = null;
    this.remotepeer = null;
    this.starttime = System.currentTimeMillis();
    this.maxtime = 10000;
    // the deadline is the start time plus the allowed duration; the former code added
    // this.timeout (not yet assigned, therefore 0) so that the deadline equalled the start time
    this.timeout = this.starttime + this.maxtime;
    this.indexSegment = indexSegment;
    this.userAgent = userAgent;
    this.transmitcount = 0;
    this.filterfailurls = false;
    this.filterscannerfail = false;
    this.lat = 0.0d;
    this.lon = 0.0d;
    this.radius = 0.0d;
    // facets: the default fields that the schema knows plus one field for each vocabulary
    this.facetfields = new ArrayList<String>();
    this.solrSchema = indexSegment.fulltext().getDefaultConfiguration();
    for (CollectionSchema f : defaultfacetfields) {
        if (this.solrSchema.contains(f)) this.facetfields.add(f.getSolrFieldName());
    }
    for (Tagging v : LibraryProvider.autotagging.getVocabularies()) {
        this.facetfields.add(CollectionSchema.VOCABULARY_PREFIX + v.getName() + CollectionSchema.VOCABULARY_SUFFIX);
    }
    this.maxfacets = defaultmaxfacets;
    this.cachedQuery = null;
}
2011-06-13 23:44:03 +02:00
2009-07-09 00:14:57 +02:00
public QueryParams (
2012-11-18 01:22:41 +01:00
final QueryGoal queryGoal ,
2013-02-12 03:42:46 +01:00
final QueryModifier modifier ,
final int maxDistance ,
final String prefer ,
final ContentDomain contentdom ,
2008-09-21 02:04:42 +02:00
final String language ,
2012-06-11 23:49:30 +02:00
final Collection < Tagging . Metatag > metatags ,
2011-06-13 23:44:03 +02:00
final CacheStrategy snippetCacheStrategy ,
2013-02-12 03:42:46 +01:00
final int itemsPerPage ,
final int offset ,
final String urlMask ,
final String tld ,
final String inlink ,
final Searchdom domType ,
final int domMaxTargets ,
final Bitfield constraint ,
final boolean allofconstraint ,
2011-12-13 00:16:05 +01:00
final Set < String > siteexcludes ,
2008-08-02 14:12:04 +02:00
final int domainzone ,
final String host ,
2009-11-24 12:13:11 +01:00
final boolean specialRights ,
final Segment indexSegment ,
2010-10-18 10:09:59 +02:00
final RankingProfile ranking ,
2011-01-22 10:46:00 +01:00
final String userAgent ,
2012-05-31 22:39:53 +02:00
final boolean filterfailurls ,
2012-12-29 17:47:34 +01:00
final boolean filterscannerfail ,
2013-02-12 03:42:46 +01:00
final double lat ,
final double lon ,
final double radius
) {
2012-11-18 01:22:41 +01:00
this . queryGoal = queryGoal ;
2013-02-12 03:42:46 +01:00
this . modifier = modifier ;
2010-07-18 11:10:46 +02:00
this . ranking = ranking ;
this . maxDistance = maxDistance ;
this . contentdom = contentdom ;
2011-08-11 23:37:35 +02:00
this . itemsPerPage = Math . min ( ( specialRights ) ? 10000 : 1000 , itemsPerPage ) ;
this . offset = Math . max ( 0 , Math . min ( ( specialRights ) ? 10000 - this . itemsPerPage : 1000 - this . itemsPerPage , offset ) ) ;
2011-03-23 01:48:19 +01:00
try {
this . urlMask = Pattern . compile ( urlMask . toLowerCase ( ) ) ;
} catch ( final PatternSyntaxException ex ) {
throw new IllegalArgumentException ( " Not a valid regular expression: " + urlMask , ex ) ;
}
2010-03-23 14:41:41 +01:00
this . urlMask_isCatchall = this . urlMask . toString ( ) . equals ( catchall_pattern . toString ( ) ) ;
2012-12-26 21:25:27 +01:00
if ( this . urlMask_isCatchall ) {
2013-02-12 03:42:46 +01:00
if ( modifier . protocol ! = null ) {
this . urlMask = Pattern . compile ( modifier . protocol + " .* " ) ;
2012-12-26 21:25:27 +01:00
this . urlMask_isCatchall = false ;
}
if ( tld ! = null ) {
this . urlMask = Pattern . compile ( " .* " + tld + " .* " ) ;
this . urlMask_isCatchall = false ;
}
2013-02-12 03:42:46 +01:00
if ( modifier . filetype ! = null ) {
this . urlMask = Pattern . compile ( " .* " + modifier . filetype + " .* " ) ;
2012-12-26 21:25:27 +01:00
this . urlMask_isCatchall = false ;
}
}
2012-12-19 12:45:40 +01:00
this . tld = tld ;
2013-01-14 12:50:21 +01:00
this . inlink = inlink ;
2011-03-23 01:48:19 +01:00
try {
this . prefer = Pattern . compile ( prefer ) ;
} catch ( final PatternSyntaxException ex ) {
throw new IllegalArgumentException ( " Not a valid regular expression: " + prefer , ex ) ;
}
2012-11-05 03:19:28 +01:00
this . prefer . toString ( ) . equals ( matchnothing_pattern . toString ( ) ) ;
2010-07-18 11:10:46 +02:00
assert language ! = null ;
2008-09-21 02:04:42 +02:00
this . targetlang = language ;
2012-01-17 01:53:08 +01:00
this . metatags = metatags ;
2008-09-21 02:04:42 +02:00
this . domType = domType ;
2008-03-11 12:09:38 +01:00
this . zonecode = domainzone ;
2010-07-18 11:10:46 +02:00
this . constraint = constraint ;
this . allofconstraint = allofconstraint ;
2012-07-10 22:59:03 +02:00
this . siteexcludes = siteexcludes ! = null & & siteexcludes . isEmpty ( ) ? null : siteexcludes ;
2010-07-18 11:10:46 +02:00
this . snippetCacheStrategy = snippetCacheStrategy ;
2012-11-06 14:32:08 +01:00
this . clienthost = host ;
2008-02-18 00:35:48 +01:00
this . remotepeer = null ;
2012-06-04 15:37:39 +02:00
this . starttime = Long . valueOf ( System . currentTimeMillis ( ) ) ;
this . maxtime = 10000 ;
this . timeout = this . starttime + this . timeout ;
2009-11-24 12:13:11 +01:00
this . indexSegment = indexSegment ;
2010-10-18 10:09:59 +02:00
this . userAgent = userAgent ;
2011-01-11 23:58:14 +01:00
this . transmitcount = 0 ;
2011-01-22 10:46:00 +01:00
this . filterfailurls = filterfailurls ;
2012-12-29 17:47:34 +01:00
this . filterscannerfail = filterscannerfail ;
2012-05-31 22:39:53 +02:00
// we normalize here the location and radius because that should cause a better caching
// and as surplus it will increase privacy
this . lat = Math . floor ( lat * this . kmNormal ) / this . kmNormal ;
this . lon = Math . floor ( lon * this . kmNormal ) / this . kmNormal ;
this . radius = Math . floor ( radius * this . kmNormal + 1 ) / this . kmNormal ;
2013-02-04 16:42:10 +01:00
this . facetfields = new ArrayList < String > ( ) ;
2013-02-21 13:23:55 +01:00
this . solrSchema = indexSegment . fulltext ( ) . getDefaultConfiguration ( ) ;
for ( CollectionSchema f : defaultfacetfields ) {
2013-02-15 01:38:10 +01:00
if ( solrSchema . contains ( f ) ) facetfields . add ( f . getSolrFieldName ( ) ) ;
2013-02-04 16:42:10 +01:00
}
2013-02-21 13:23:55 +01:00
for ( Tagging v : LibraryProvider . autotagging . getVocabularies ( ) ) this . facetfields . add ( CollectionSchema . VOCABULARY_PREFIX + v . getName ( ) + CollectionSchema . VOCABULARY_SUFFIX ) ;
2012-11-06 14:32:08 +01:00
this . maxfacets = defaultmaxfacets ;
2013-02-02 07:21:18 +01:00
this . cachedQuery = null ;
2009-11-24 12:13:11 +01:00
}
2011-06-13 23:44:03 +02:00
2012-11-01 10:22:22 +01:00
// Grid factor used by the constructor to normalize lat, lon and radius: each value is
// multiplied by this factor, floored and divided by the factor again.
// 100 is roughly 40000.d / 360.d == 111.11, so a location is normalized to a grid of about 1km.
private double kmNormal = 100 . d ;
2012-05-31 22:39:53 +02:00
2009-11-24 12:13:11 +01:00
public Segment getSegment ( ) {
return this . indexSegment ;
2005-10-10 02:33:25 +02:00
}
2011-06-13 23:44:03 +02:00
2007-09-04 01:43:55 +02:00
public int neededResults ( ) {
// the number of result lines that must be computed
2009-08-30 12:28:23 +02:00
return this . offset + this . itemsPerPage ;
2007-09-04 01:43:55 +02:00
}
2011-06-13 23:44:03 +02:00
2012-05-21 01:58:29 +02:00
public int itemsPerPage ( ) {
2007-09-08 13:50:19 +02:00
// the number of result lines that are displayed at once (size of result page)
2009-08-30 12:28:23 +02:00
return this . itemsPerPage ;
2007-09-04 01:43:55 +02:00
}
2012-11-05 03:19:28 +01:00
2008-08-02 14:12:04 +02:00
public void setOffset ( final int newOffset ) {
2007-09-04 01:43:55 +02:00
this . offset = newOffset ;
2005-10-10 02:33:25 +02:00
}
2011-06-13 23:44:03 +02:00
2008-04-24 10:42:08 +02:00
public boolean isLocal ( ) {
2011-11-17 02:05:45 +01:00
return this . domType = = Searchdom . LOCAL ;
2007-09-04 01:43:55 +02:00
}
2011-06-13 23:44:03 +02:00
2010-04-15 15:22:59 +02:00
public static HandleSet hashes2Set ( final String query ) {
2012-07-27 12:13:53 +02:00
final HandleSet keyhashes = new RowHandleSet ( WordReferenceRow . urlEntryRow . primaryKeyLength , WordReferenceRow . urlEntryRow . objectOrder , 0 ) ;
2010-07-18 11:10:46 +02:00
if ( query ! = null ) {
for ( int i = 0 ; i < ( query . length ( ) / Word . commonHashLength ) ; i + + ) try {
2011-05-27 10:24:54 +02:00
keyhashes . put ( ASCII . getBytes ( query . substring ( i * Word . commonHashLength , ( i + 1 ) * Word . commonHashLength ) ) ) ;
2012-07-27 12:13:53 +02:00
} catch ( final SpaceExceededException e ) {
2010-07-18 11:10:46 +02:00
Log . logException ( e ) ;
}
2009-04-16 17:29:00 +02:00
}
return keyhashes ;
}
2011-06-13 23:44:03 +02:00
2010-04-15 15:22:59 +02:00
public static HandleSet hashes2Handles ( final String query ) {
2012-07-27 12:13:53 +02:00
final HandleSet keyhashes = new RowHandleSet ( WordReferenceRow . urlEntryRow . primaryKeyLength , WordReferenceRow . urlEntryRow . objectOrder , 0 ) ;
2010-07-18 11:10:46 +02:00
if ( query ! = null ) {
for ( int i = 0 ; i < ( query . length ( ) / Word . commonHashLength ) ; i + + ) try {
2011-05-27 10:24:54 +02:00
keyhashes . put ( ASCII . getBytes ( query . substring ( i * Word . commonHashLength , ( i + 1 ) * Word . commonHashLength ) ) ) ;
2012-07-27 12:13:53 +02:00
} catch ( final SpaceExceededException e ) {
2010-07-18 11:10:46 +02:00
Log . logException ( e ) ;
}
2010-04-15 15:22:59 +02:00
}
return keyhashes ;
}
2011-06-13 23:44:03 +02:00
2010-04-15 15:22:59 +02:00
public static String hashSet2hashString ( final HandleSet hashes ) {
final byte [ ] bb = new byte [ hashes . size ( ) * Word . commonHashLength ] ;
int p = 0 ;
2010-07-18 11:10:46 +02:00
for ( final byte [ ] b : hashes ) {
2011-05-27 10:24:54 +02:00
assert b . length = = Word . commonHashLength : " hash = " + ASCII . String ( b ) ;
2010-04-15 15:22:59 +02:00
System . arraycopy ( b , 0 , bb , p , Word . commonHashLength ) ;
p + = Word . commonHashLength ;
2009-06-02 00:45:28 +02:00
}
2011-05-27 10:24:54 +02:00
return ASCII . String ( bb ) ;
2006-09-13 19:13:28 +02:00
}
2007-12-12 19:57:43 +01:00
2012-08-19 13:17:03 +02:00
public static String hashSet2hashString ( final Set < String > hashes ) {
final byte [ ] bb = new byte [ hashes . size ( ) * Word . commonHashLength ] ;
int p = 0 ;
for ( final String s : hashes ) {
assert s . length ( ) = = Word . commonHashLength : " hash = " + s ;
System . arraycopy ( ASCII . getBytes ( s ) , 0 , bb , p , Word . commonHashLength ) ;
p + = Word . commonHashLength ;
}
return ASCII . String ( bb ) ;
}
2010-04-15 15:22:59 +02:00
public static String anonymizedQueryHashes ( final HandleSet hashes ) {
2008-08-28 23:15:59 +02:00
// create a more anonymized representation of a query hashes for logging
2009-04-16 17:29:00 +02:00
final Iterator < byte [ ] > i = hashes . iterator ( ) ;
2009-10-11 02:12:19 +02:00
final StringBuilder sb = new StringBuilder ( hashes . size ( ) * ( Word . commonHashLength + 2 ) + 2 ) ;
2007-12-12 19:57:43 +01:00
sb . append ( " [ " ) ;
2009-04-16 17:29:00 +02:00
byte [ ] hash ;
2007-12-12 19:57:43 +01:00
if ( i . hasNext ( ) ) {
2008-01-08 21:12:31 +01:00
hash = i . next ( ) ;
2011-05-27 10:24:54 +02:00
sb . append ( ASCII . String ( hash ) . substring ( 0 , 3 ) ) . append ( " ......... " ) ;
2007-12-12 19:57:43 +01:00
}
while ( i . hasNext ( ) ) {
2008-01-08 21:12:31 +01:00
hash = i . next ( ) ;
2011-05-27 10:24:54 +02:00
sb . append ( " , " ) . append ( ASCII . String ( hash ) . substring ( 0 , 3 ) ) . append ( " ......... " ) ;
2007-12-12 19:57:43 +01:00
}
sb . append ( " ] " ) ;
2011-03-07 21:36:40 +01:00
return sb . toString ( ) ;
2007-12-12 19:57:43 +01:00
}
2011-06-13 23:44:03 +02:00
2010-06-22 14:28:53 +02:00
/ * *
* check if the given text matches with the query
* this checks inclusion and exclusion words
* @param text
* @return true if the query matches with the given text
* /
2012-11-01 10:22:22 +01:00
private final boolean matchesText ( final String text ) {
2010-07-18 11:10:46 +02:00
boolean ret = false ;
2010-10-18 13:35:09 +02:00
final HandleSet wordhashes = Word . words2hashesHandles ( Condenser . getWords ( text , null ) . keySet ( ) ) ;
2012-11-18 01:22:41 +01:00
if ( ! SetTools . anymatch ( wordhashes , this . queryGoal . getExcludeHashes ( ) ) ) {
ret = SetTools . totalInclusion ( this . queryGoal . getIncludeHashes ( ) , wordhashes ) ;
2010-07-18 11:10:46 +02:00
}
return ret ;
2010-06-22 14:28:53 +02:00
}
2011-06-13 23:44:03 +02:00
2012-11-01 10:22:22 +01:00
protected static final boolean anymatch ( final String text , final HandleSet keyhashes ) {
2007-04-05 12:14:48 +02:00
// returns true if any of the word hashes in keyhashes appear in the String text
// to do this, all words in the string must be recognized and transcoded to word hashes
2012-06-04 15:37:39 +02:00
if ( keyhashes = = null | | keyhashes . isEmpty ( ) ) return false ;
2010-10-18 13:35:09 +02:00
final HandleSet wordhashes = Word . words2hashesHandles ( Condenser . getWords ( text , null ) . keySet ( ) ) ;
2009-01-30 16:33:00 +01:00
return SetTools . anymatch ( wordhashes , keyhashes ) ;
2007-04-05 12:14:48 +02:00
}
2011-06-13 23:44:03 +02:00
2012-10-07 07:46:55 +02:00
public SolrQuery solrQuery ( ) {
2013-02-02 07:21:18 +01:00
if ( this . cachedQuery ! = null ) {
this . cachedQuery . setStart ( this . offset ) ;
return this . cachedQuery ;
}
2012-11-18 16:03:34 +01:00
if ( this . queryGoal . getIncludeStrings ( ) . size ( ) = = 0 ) return null ;
2012-12-19 01:00:57 +01:00
// construct query
final SolrQuery params = new SolrQuery ( ) ;
2013-02-21 13:23:55 +01:00
params . setQuery ( this . queryGoal . solrQueryString ( this . indexSegment . fulltext ( ) . getDefaultConfiguration ( ) ) . toString ( ) ) ;
2012-12-19 01:56:33 +01:00
params . setParam ( " defType " , " edismax " ) ;
params . setParam ( " bq " , Boost . RANKING . getBoostQuery ( ) ) ; // a boost query that moves double content to the back
params . setParam ( " bf " , Boost . RANKING . getBoostFunction ( ) ) ; // a boost function extension
params . setStart ( this . offset ) ;
params . setRows ( this . itemsPerPage ) ;
params . setFacet ( false ) ;
2012-12-19 01:00:57 +01:00
2012-12-19 01:56:33 +01:00
// add site facets
2012-12-19 01:00:57 +01:00
final StringBuilder fq = new StringBuilder ( ) ;
2013-02-12 03:42:46 +01:00
if ( this . modifier . sitehash = = null & & this . modifier . sitehost = = null ) {
2012-08-27 14:41:33 +02:00
if ( this . siteexcludes ! = null ) {
for ( String ex : this . siteexcludes ) {
2013-02-21 13:23:55 +01:00
fq . append ( " AND - " ) . append ( CollectionSchema . host_id_s . getSolrFieldName ( ) ) . append ( ':' ) . append ( ex ) ;
2012-08-27 14:41:33 +02:00
}
}
} else {
2013-02-12 03:42:46 +01:00
if ( this . modifier . sitehost ! = null ) {
2013-01-16 14:54:35 +01:00
// consider to search for hosts with 'www'-prefix, if not already part of the host name
2013-02-12 03:42:46 +01:00
if ( this . modifier . sitehost . startsWith ( " www. " ) ) {
2013-02-21 13:23:55 +01:00
fq . append ( " AND ( " ) . append ( CollectionSchema . host_s . getSolrFieldName ( ) ) . append ( " : \" " ) . append ( this . modifier . sitehost . substring ( 4 ) ) . append ( '\"' ) ;
fq . append ( " OR " ) . append ( CollectionSchema . host_s . getSolrFieldName ( ) ) . append ( " : \" " ) . append ( this . modifier . sitehost ) . append ( " \" ) " ) ;
2013-01-16 14:54:35 +01:00
} else {
2013-02-21 13:23:55 +01:00
fq . append ( " AND ( " ) . append ( CollectionSchema . host_s . getSolrFieldName ( ) ) . append ( " : \" " ) . append ( this . modifier . sitehost ) . append ( '\"' ) ;
fq . append ( " OR " ) . append ( CollectionSchema . host_s . getSolrFieldName ( ) ) . append ( " : \" www. " ) . append ( this . modifier . sitehost ) . append ( " \" ) " ) ;
2013-01-16 14:54:35 +01:00
}
} else
2013-02-21 13:23:55 +01:00
fq . append ( " AND " ) . append ( CollectionSchema . host_id_s . getSolrFieldName ( ) ) . append ( " : \" " ) . append ( this . modifier . sitehash ) . append ( '\"' ) ;
2012-08-27 14:41:33 +02:00
}
2012-11-07 13:53:29 +01:00
2012-12-18 02:29:03 +01:00
// add vocabulary facets
for ( Tagging . Metatag tag : this . metatags ) {
2013-02-21 13:23:55 +01:00
fq . append ( " AND " ) . append ( CollectionSchema . VOCABULARY_PREFIX ) . append ( tag . getVocabularyName ( ) ) . append ( CollectionSchema . VOCABULARY_SUFFIX ) . append ( " : \" " ) . append ( tag . getObject ( ) ) . append ( '\"' ) ;
2012-12-18 02:29:03 +01:00
}
2012-12-19 01:56:33 +01:00
// add author facets
2013-02-21 13:23:55 +01:00
if ( this . modifier . author ! = null & & this . modifier . author . length ( ) > 0 & & this . solrSchema . contains ( CollectionSchema . author_sxt ) ) {
fq . append ( " AND " ) . append ( CollectionSchema . author_sxt . getSolrFieldName ( ) ) . append ( " : \" " ) . append ( this . modifier . author ) . append ( '\"' ) ;
2012-12-19 01:56:33 +01:00
}
2012-10-07 07:46:55 +02:00
2013-02-12 03:42:46 +01:00
if ( this . modifier . protocol ! = null ) {
2013-02-21 13:23:55 +01:00
fq . append ( " AND " ) . append ( CollectionSchema . url_protocol_s . getSolrFieldName ( ) ) . append ( ':' ) . append ( this . modifier . protocol ) ;
2012-12-19 12:45:40 +01:00
}
if ( this . tld ! = null ) {
2013-02-21 13:23:55 +01:00
fq . append ( " AND " ) . append ( CollectionSchema . host_dnc_s . getSolrFieldName ( ) ) . append ( " : \" " ) . append ( this . tld ) . append ( '\"' ) ;
2012-12-19 12:45:40 +01:00
}
2013-02-12 03:42:46 +01:00
if ( this . modifier . filetype ! = null ) {
2013-02-21 13:23:55 +01:00
fq . append ( " AND " ) . append ( CollectionSchema . url_file_ext_s . getSolrFieldName ( ) ) . append ( " : \" " ) . append ( this . modifier . filetype ) . append ( '\"' ) ;
2012-12-19 12:45:40 +01:00
}
2013-03-04 21:18:54 +01:00
if ( this . contentdom = = ContentDomain . IMAGE ) {
fq . append ( " AND ( " ) . append ( CollectionSchema . url_file_ext_s . getSolrFieldName ( ) ) . append ( " : \" jpg \" " ) ;
fq . append ( " OR " ) . append ( CollectionSchema . url_file_ext_s . getSolrFieldName ( ) ) . append ( " : \" tif \" " ) ;
fq . append ( " OR " ) . append ( CollectionSchema . url_file_ext_s . getSolrFieldName ( ) ) . append ( " : \" tiff \" " ) ;
fq . append ( " OR " ) . append ( CollectionSchema . url_file_ext_s . getSolrFieldName ( ) ) . append ( " : \" png \" ) " ) ;
}
if ( this . contentdom = = ContentDomain . AUDIO ) {
fq . append ( " AND ( " ) . append ( CollectionSchema . url_file_ext_s . getSolrFieldName ( ) ) . append ( " : \" aif \" " ) ;
fq . append ( " OR " ) . append ( CollectionSchema . url_file_ext_s . getSolrFieldName ( ) ) . append ( " : \" aiff \" " ) ;
fq . append ( " OR " ) . append ( CollectionSchema . url_file_ext_s . getSolrFieldName ( ) ) . append ( " : \" mp3 \" " ) ;
fq . append ( " OR " ) . append ( CollectionSchema . url_file_ext_s . getSolrFieldName ( ) ) . append ( " : \" ogg \" ) " ) ;
}
if ( this . contentdom = = ContentDomain . VIDEO ) {
fq . append ( " AND ( " ) . append ( CollectionSchema . url_file_ext_s . getSolrFieldName ( ) ) . append ( " : \" mpg \" " ) ;
fq . append ( " OR " ) . append ( CollectionSchema . url_file_ext_s . getSolrFieldName ( ) ) . append ( " : \" avi \" " ) ;
fq . append ( " OR " ) . append ( CollectionSchema . url_file_ext_s . getSolrFieldName ( ) ) . append ( " : \" mp4 \" " ) ;
fq . append ( " OR " ) . append ( CollectionSchema . url_file_ext_s . getSolrFieldName ( ) ) . append ( " : \" mkv \" ) " ) ;
}
if ( this . contentdom = = ContentDomain . APP ) {
fq . append ( " AND ( " ) . append ( CollectionSchema . url_file_ext_s . getSolrFieldName ( ) ) . append ( " : \" apk \" " ) ;
fq . append ( " OR " ) . append ( CollectionSchema . url_file_ext_s . getSolrFieldName ( ) ) . append ( " : \" exe \" " ) ;
fq . append ( " OR " ) . append ( CollectionSchema . url_file_ext_s . getSolrFieldName ( ) ) . append ( " : \" dmg \" " ) ;
fq . append ( " OR " ) . append ( CollectionSchema . url_file_ext_s . getSolrFieldName ( ) ) . append ( " : \" gz \" ) " ) ;
}
2013-01-14 12:50:21 +01:00
if ( this . inlink ! = null ) {
2013-02-21 13:23:55 +01:00
fq . append ( " AND " ) . append ( CollectionSchema . outboundlinks_urlstub_txt . getSolrFieldName ( ) ) . append ( " : \" " ) . append ( this . inlink ) . append ( '\"' ) ;
2013-01-14 12:50:21 +01:00
}
2012-11-13 11:45:56 +01:00
if ( ! this . urlMask_isCatchall ) {
2012-12-19 12:45:40 +01:00
// add a filter query on urls
2012-11-13 11:45:56 +01:00
String urlMaskPattern = this . urlMask . pattern ( ) ;
2012-12-19 01:00:57 +01:00
// solr doesn't like slashes, backslashes or doublepoints; remove them // urlmask = ".*\\." + ft + "(\\?.*)?";
2012-11-13 11:45:56 +01:00
int p ;
while ( ( p = urlMaskPattern . indexOf ( ':' ) ) > = 0 ) urlMaskPattern = urlMaskPattern . substring ( 0 , p ) + " . " + urlMaskPattern . substring ( p + 1 ) ;
while ( ( p = urlMaskPattern . indexOf ( '/' ) ) > = 0 ) urlMaskPattern = urlMaskPattern . substring ( 0 , p ) + " . " + urlMaskPattern . substring ( p + 1 ) ;
2012-12-19 01:00:57 +01:00
while ( ( p = urlMaskPattern . indexOf ( '\\' ) ) > = 0 ) urlMaskPattern = urlMaskPattern . substring ( 0 , p ) + " . " + urlMaskPattern . substring ( p + 2 ) ;
2013-02-21 13:23:55 +01:00
fq . append ( " AND " ) . append ( CollectionSchema . sku . getSolrFieldName ( ) + " :/ " + urlMaskPattern + " / " ) ;
2012-11-13 11:45:56 +01:00
}
2012-08-27 14:41:33 +02:00
if ( this . radius > 0 . 0d & & this . lat ! = 0 . 0d & & this . lon ! = 0 . 0d ) {
2012-08-27 15:25:25 +02:00
// localtion search, no special ranking
2012-10-07 07:46:55 +02:00
// try http://localhost:8090/solr/select?q=*:*&fq={!bbox sfield=coordinate_p pt=50.17,8.65 d=1}
//params.setQuery("!bbox " + q.toString());
//params.set("sfield", YaCySchema.coordinate_p.name());
//params.set("pt", Double.toString(this.lat) + "," + Double.toString(this.lon));
//params.set("d", GeoLocation.degreeToKm(this.radius));
2013-02-21 13:23:55 +01:00
fq . append ( " AND " ) . append ( " {!bbox sfield= " + CollectionSchema . coordinate_p . getSolrFieldName ( ) + " pt= " + Double . toString ( this . lat ) + " , " + Double . toString ( this . lon ) + " d= " + GeoLocation . degreeToKm ( this . radius ) + " } " ) ;
2012-10-07 07:46:55 +02:00
//params.setRows(Integer.MAX_VALUE);
2012-08-27 15:25:25 +02:00
} else {
2012-09-26 16:56:33 +02:00
// set ranking
if ( this . ranking . coeff_date = = RankingProfile . COEFF_MAX ) {
// set a most-recent ordering
2013-02-21 13:23:55 +01:00
params . setSortField ( CollectionSchema . last_modified . getSolrFieldName ( ) , ORDER . desc ) ;
2012-09-25 23:59:30 +02:00
}
2012-08-27 14:41:33 +02:00
}
2013-02-02 07:21:18 +01:00
if ( fq . length ( ) > 0 ) {
params . setFilterQueries ( fq . substring ( 5 ) ) ;
}
2012-09-26 16:56:33 +02:00
2012-08-27 14:41:33 +02:00
// prepare result
2012-10-07 07:46:55 +02:00
Log . logInfo ( " Protocol " , " SOLR QUERY: " + params . toString ( ) ) ;
2013-02-02 07:21:18 +01:00
this . cachedQuery = params ;
2012-10-07 07:46:55 +02:00
return params ;
2005-12-06 17:15:21 +01:00
}
2012-10-02 14:29:45 +02:00
2012-11-18 01:22:41 +01:00
public QueryGoal getQueryGoal ( ) {
return this . queryGoal ;
2012-10-02 14:29:45 +02:00
}
2013-02-22 15:45:15 +01:00
public final Map < DigestURI , String > separateMatches ( final Map < DigestURI , String > links ) {
final Map < DigestURI , String > matcher = new HashMap < DigestURI , String > ( ) ;
final Iterator < Map . Entry < DigestURI , String > > i = links . entrySet ( ) . iterator ( ) ;
Map . Entry < DigestURI , String > entry ;
DigestURI url ;
2010-06-23 13:19:32 +02:00
String anchorText ;
while ( i . hasNext ( ) ) {
entry = i . next ( ) ;
url = entry . getKey ( ) ;
anchorText = entry . getValue ( ) ;
2011-06-13 23:44:03 +02:00
if ( matchesText ( anchorText ) ) {
2010-06-23 13:19:32 +02:00
matcher . put ( url , anchorText ) ;
i . remove ( ) ;
}
}
return matcher ;
}
2011-06-13 23:44:03 +02:00
2012-06-05 12:06:26 +02:00
private volatile String idCacheAnon = null , idCache = null ;
2010-09-06 12:00:07 +02:00
final static private char asterisk = '*' ;
2008-08-02 14:12:04 +02:00
public String id ( final boolean anonymized ) {
2010-09-06 12:00:07 +02:00
if ( anonymized ) {
2011-06-13 23:44:03 +02:00
if ( this . idCacheAnon ! = null ) return this . idCacheAnon ;
2010-09-06 12:00:07 +02:00
} else {
2011-06-13 23:44:03 +02:00
if ( this . idCache ! = null ) return this . idCache ;
2010-09-06 12:00:07 +02:00
}
2012-06-05 12:06:26 +02:00
synchronized ( this ) {
// do a Double-Checked Locking
if ( anonymized ) {
if ( this . idCacheAnon ! = null ) return this . idCacheAnon ;
} else {
if ( this . idCache ! = null ) return this . idCache ;
}
// generate a string that identifies a search so results can be re-used in a cache
final StringBuilder context = new StringBuilder ( 180 ) ;
if ( anonymized ) {
2012-11-18 01:22:41 +01:00
context . append ( anonymizedQueryHashes ( this . queryGoal . getIncludeHashes ( ) ) ) ;
2012-06-05 12:06:26 +02:00
context . append ( '-' ) ;
2012-11-18 01:22:41 +01:00
context . append ( anonymizedQueryHashes ( this . queryGoal . getExcludeHashes ( ) ) ) ;
2012-06-05 12:06:26 +02:00
} else {
2012-11-18 01:22:41 +01:00
context . append ( hashSet2hashString ( this . queryGoal . getIncludeHashes ( ) ) ) ;
2012-06-05 12:06:26 +02:00
context . append ( '-' ) ;
2012-11-18 01:22:41 +01:00
context . append ( hashSet2hashString ( this . queryGoal . getExcludeHashes ( ) ) ) ;
2012-06-05 12:06:26 +02:00
}
//context.append(asterisk);
//context.append(this.domType);
context . append ( asterisk ) ;
context . append ( this . contentdom ) . append ( asterisk ) ;
context . append ( this . zonecode ) . append ( asterisk ) ;
context . append ( ASCII . String ( Word . word2hash ( this . ranking . toExternalString ( ) ) ) ) . append ( asterisk ) ;
context . append ( Base64Order . enhancedCoder . encodeString ( this . prefer . toString ( ) ) ) . append ( asterisk ) ;
context . append ( Base64Order . enhancedCoder . encodeString ( this . urlMask . toString ( ) ) ) . append ( asterisk ) ;
2013-02-12 03:42:46 +01:00
context . append ( this . modifier . sitehash ) . append ( asterisk ) ;
2012-06-05 12:06:26 +02:00
context . append ( this . siteexcludes ) . append ( asterisk ) ;
2013-02-12 03:42:46 +01:00
context . append ( this . modifier . author ) . append ( asterisk ) ;
2012-06-05 12:06:26 +02:00
context . append ( this . targetlang ) . append ( asterisk ) ;
context . append ( this . constraint ) . append ( asterisk ) ;
context . append ( this . maxDistance ) . append ( asterisk ) ;
2013-02-12 03:42:46 +01:00
context . append ( this . modifier . toString ( ) ) . append ( asterisk ) ;
context . append ( this . modifier . protocol ) . append ( asterisk ) ;
2013-01-14 12:50:21 +01:00
context . append ( this . tld ) . append ( asterisk ) ;
2013-02-12 03:42:46 +01:00
context . append ( this . modifier . filetype ) . append ( asterisk ) ;
2013-01-14 12:50:21 +01:00
context . append ( this . inlink ) . append ( asterisk ) ;
2012-06-05 12:06:26 +02:00
context . append ( this . lat ) . append ( asterisk ) . append ( this . lon ) . append ( asterisk ) . append ( this . radius ) . append ( asterisk ) ;
context . append ( this . snippetCacheStrategy = = null ? " null " : this . snippetCacheStrategy . name ( ) ) ;
String result = context . toString ( ) ;
if ( anonymized ) {
this . idCacheAnon = result ;
} else {
this . idCache = result ;
}
return result ;
2010-07-18 11:10:46 +02:00
}
2007-08-25 01:12:59 +02:00
}
2011-06-13 23:44:03 +02:00
2009-06-02 00:45:28 +02:00
/ * *
* make a query anchor tag
* @param page
* @param theQuery
* @param originalUrlMask
* @param addToQuery
* @return
* /
2012-11-13 11:45:56 +01:00
public static StringBuilder navurl ( final String ext , final int page , final QueryParams theQuery , final String newQueryString ) {
2011-06-13 23:44:03 +02:00
2012-11-13 11:45:56 +01:00
final StringBuilder sb = navurlBase ( ext , theQuery , newQueryString ) ;
2011-06-13 23:44:03 +02:00
sb . append ( ampersand ) ;
sb . append ( " startRecord= " ) ;
2012-05-21 01:58:29 +02:00
sb . append ( page * theQuery . itemsPerPage ( ) ) ;
2011-06-13 23:44:03 +02:00
return sb ;
}
2012-11-13 11:45:56 +01:00
public static StringBuilder navurlBase ( final String ext , final QueryParams theQuery , final String newQueryString ) {
2010-07-18 11:10:46 +02:00
2011-03-09 10:29:05 +01:00
final StringBuilder sb = new StringBuilder ( 120 ) ;
2010-07-18 11:10:46 +02:00
sb . append ( " /yacysearch. " ) ;
sb . append ( ext ) ;
2010-12-08 11:50:23 +01:00
sb . append ( " ?query= " ) ;
2012-12-15 00:05:46 +01:00
sb . append ( newQueryString = = null ? theQuery . getQueryGoal ( ) . getOriginalQueryString ( true ) : newQueryString ) ;
2010-07-18 11:10:46 +02:00
sb . append ( ampersand ) ;
sb . append ( " maximumRecords= " ) ;
2012-05-21 01:58:29 +02:00
sb . append ( theQuery . itemsPerPage ( ) ) ;
2010-07-18 11:10:46 +02:00
sb . append ( ampersand ) ;
sb . append ( " resource= " ) ;
sb . append ( ( theQuery . isLocal ( ) ) ? " local " : " global " ) ;
sb . append ( ampersand ) ;
sb . append ( " verify= " ) ;
2010-10-09 10:55:57 +02:00
sb . append ( theQuery . snippetCacheStrategy = = null ? " false " : theQuery . snippetCacheStrategy . toName ( ) ) ;
2010-07-18 11:10:46 +02:00
sb . append ( ampersand ) ;
sb . append ( " prefermaskfilter= " ) ;
sb . append ( theQuery . prefer ) ;
sb . append ( ampersand ) ;
sb . append ( " cat=href " ) ;
sb . append ( ampersand ) ;
sb . append ( " constraint= " ) ;
sb . append ( ( theQuery . constraint = = null ) ? " " : theQuery . constraint . exportB64 ( ) ) ;
sb . append ( ampersand ) ;
sb . append ( " contentdom= " ) ;
2012-04-22 00:04:36 +02:00
sb . append ( theQuery . contentdom . toString ( ) ) ;
2010-07-18 11:10:46 +02:00
sb . append ( ampersand ) ;
sb . append ( " former= " ) ;
2012-12-15 00:05:46 +01:00
sb . append ( theQuery . getQueryGoal ( ) . getOriginalQueryString ( true ) ) ;
2010-07-18 11:10:46 +02:00
2011-06-13 23:44:03 +02:00
return sb ;
2009-06-02 00:45:28 +02:00
}
2011-06-13 23:44:03 +02:00
2005-10-10 02:33:25 +02:00
}