2012-09-21 15:48:16 +02:00
package net.yacy.data ;
2009-06-12 22:36:03 +02:00
2014-02-03 12:44:52 +01:00
import java.io.IOException ;
2014-02-04 00:18:11 +01:00
import java.util.Collection ;
2009-06-14 01:16:14 +02:00
import java.util.Collections ;
2009-06-27 09:23:34 +02:00
import java.util.Comparator ;
2012-05-25 16:21:44 +02:00
import java.util.ConcurrentModificationException ;
2014-02-03 12:44:52 +01:00
import java.util.Iterator ;
2015-02-09 18:45:07 +01:00
import java.util.LinkedHashSet ;
2014-02-03 12:44:52 +01:00
import java.util.List ;
import java.util.Map ;
2009-06-12 22:36:03 +02:00
import java.util.Set ;
2009-10-05 23:51:02 +02:00
import java.util.SortedSet ;
2009-06-27 09:23:34 +02:00
import java.util.TreeSet ;
2009-06-12 22:36:03 +02:00
2014-02-03 12:44:52 +01:00
import org.apache.solr.client.solrj.SolrQuery ;
import org.apache.solr.client.solrj.response.QueryResponse ;
import org.apache.solr.common.SolrException ;
2011-12-16 23:59:29 +01:00
import net.yacy.cora.sorting.ClusteredScoreMap ;
2014-02-03 12:44:52 +01:00
import net.yacy.cora.sorting.OrderedScoreMap ;
2011-12-16 23:59:29 +01:00
import net.yacy.cora.sorting.ReversibleScoreMap ;
2015-01-29 02:28:03 +01:00
import net.yacy.cora.util.CommonPattern ;
2013-07-09 14:28:25 +02:00
import net.yacy.cora.util.ConcurrentLog ;
2012-09-20 19:38:22 +02:00
import net.yacy.cora.util.StringBuilderComparator ;
2011-02-12 01:01:40 +01:00
import net.yacy.document.LibraryProvider ;
2012-08-31 13:03:00 +02:00
import net.yacy.search.index.Segment ;
2014-02-03 12:44:52 +01:00
import net.yacy.search.schema.CollectionSchema ;
2009-10-10 01:13:30 +02:00
2009-06-12 22:36:03 +02:00
2009-06-27 09:23:34 +02:00
/**
 * People make mistakes when they type words.
 * The most common mistakes are the four categories listed below:
 * <ol>
 * <li>Changing one letter: bat / cat;</li>
 * <li>Adding one letter: bat / boat;</li>
 * <li>Deleting one letter: frog / fog; or</li>
 * <li>Reversing two consecutive letters: two / tow.</li>
 * </ol>
 * DidYouMean provides producer threads that feed a blocking queue with word variations according to
 * the four categories mentioned above. Consumer threads then check the generated word variations against a term index.
 * Only words contained in the term index are returned by the getSuggestions method.
 * @author apfelmaennchen
 * @author orbiter (extensions for multi-language support + multi-word suggestions)
 */
2009-06-12 22:36:03 +02:00
public class DidYouMean {
2010-10-13 00:02:10 +02:00
/** Input words shorter than this get no suggestions at all. */
private static final int MinimumInputWordLength = 2;
/** Library recommendations shorter than this are not accepted as suggestions. */
private static final int MinimumOutputWordLength = 4;

/** Latin lower-case letters plus the accented letters of the Latin-1 block (U+00F7, the division sign, is left out). */
private static final char[] ALPHABET_LATIN = {
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p',
    'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
    '\u00df',
    '\u00e0', '\u00e1', '\u00e2', '\u00e3', '\u00e4', '\u00e5', '\u00e6', '\u00e7',
    '\u00e8', '\u00e9', '\u00ea', '\u00eb', '\u00ec', '\u00ed', '\u00ee', '\u00ef',
    '\u00f0', '\u00f1', '\u00f2', '\u00f3', '\u00f4', '\u00f5', '\u00f6',
    '\u00f8', '\u00f9', '\u00fa', '\u00fb', '\u00fc', '\u00fd', '\u00fe', '\u00ff'};
private static final char[] ALPHABET_KANJI = new char[512]; // U+3400-U+34FF followed by U+4E00-U+4EFF
private static final char[] ALPHABET_HIRAGANA = new char[96]; // U+3040-U+309F
private static final char[] ALPHABET_KATAKANA = new char[96]; // U+30A0-U+30FF
private static final char[] ALPHABET_CJK_UNIFIED_IDEOGRAPHS_Part1 = new char[5376]; // U+4E00-U+62FF
private static final char[] ALPHABET_CJK_UNIFIED_IDEOGRAPHS_Part2 = new char[5376]; // U+6300-U+77FF
private static final char[] ALPHABET_CJK_UNIFIED_IDEOGRAPHS_Part3 = new char[5376]; // U+7800-U+8CFF
private static final char[] ALPHABET_CJK_UNIFIED_IDEOGRAPHS_Part4 = new char[4864]; // U+8D00-U+9FFF

static {
    // this is very experimental: a very small subset of Kanji
    fillRange(ALPHABET_KANJI, 0, '\u3400', '\u34ff');
    fillRange(ALPHABET_KANJI, 256, '\u4e00', '\u4eff');
    fillRange(ALPHABET_HIRAGANA, 0, '\u3040', '\u309F');
    fillRange(ALPHABET_KATAKANA, 0, '\u30A0', '\u30FF');
    fillRange(ALPHABET_CJK_UNIFIED_IDEOGRAPHS_Part1, 0, '\u4E00', '\u62FF');
    fillRange(ALPHABET_CJK_UNIFIED_IDEOGRAPHS_Part2, 0, '\u6300', '\u77FF');
    fillRange(ALPHABET_CJK_UNIFIED_IDEOGRAPHS_Part3, 0, '\u7800', '\u8CFF');
    fillRange(ALPHABET_CJK_UNIFIED_IDEOGRAPHS_Part4, 0, '\u8D00', '\u9FFF');
}

/**
 * Writes the consecutive characters {@code first..last} (both inclusive) into {@code target},
 * starting at index {@code offset}.
 * <p>
 * The previous initializer masked every index with {@code 0xff}, so only the first 256 slots of
 * each array were ever written (the rest stayed {@code '\0'}), and the second Kanji range
 * overwrote the first one because {@code +} binds tighter than {@code &}.
 *
 * @param target the array to fill; must hold at least {@code offset + (last - first + 1)} elements
 * @param offset index in {@code target} that receives {@code first}
 * @param first first character of the range
 * @param last last character of the range
 */
private static void fillRange(final char[] target, final int offset, final char first, final char last) {
    // iterate with an int so that the loop variable cannot wrap around like a char would
    for (int c = first; c <= last; c++) {
        target[offset + (c - first)] = (char) c;
    }
}
2011-11-09 15:42:55 +01:00
2013-10-09 16:17:50 +02:00
private static final char [ ] [ ] ALPHABETS = {
ALPHABET_LATIN , ALPHABET_KANJI , ALPHABET_HIRAGANA , ALPHABET_KATAKANA ,
ALPHABET_CJK_UNIFIED_IDEOGRAPHS_Part1 , ALPHABET_CJK_UNIFIED_IDEOGRAPHS_Part2 , ALPHABET_CJK_UNIFIED_IDEOGRAPHS_Part3 , ALPHABET_CJK_UNIFIED_IDEOGRAPHS_Part4 } ;
2012-01-05 08:35:44 +01:00
public static final int AVAILABLE_CPU = Runtime . getRuntime ( ) . availableProcessors ( ) ;
2010-10-12 10:36:33 +02:00
private static final wordLengthComparator WORD_LENGTH_COMPARATOR = new wordLengthComparator ( ) ;
2011-11-09 15:42:55 +01:00
2012-08-31 13:03:00 +02:00
private final Segment segment ;
2011-11-09 15:42:55 +01:00
private final StringBuilder word ;
2015-04-10 15:59:18 +02:00
private final boolean endsWithSpace ;
2010-10-12 10:36:33 +02:00
private final int wordLen ;
private long timeLimit ;
2011-11-09 15:42:55 +01:00
private final SortedSet < StringBuilder > resultSet ;
2012-01-05 08:35:44 +01:00
private char [ ] alphabet ;
2014-02-03 12:44:52 +01:00
private boolean more ;
2011-11-09 15:42:55 +01:00
2010-04-13 03:16:09 +02:00
/**
 * Prepares the computation of suggestions for one input string.
 * The input is trimmed (a trailing space is remembered in {@code endsWithSpace}), an empty
 * result set is created, and the alphabet used for word variations is chosen from the first
 * character of the trimmed input. For an empty input no alphabet is chosen.
 *
 * @param segment the index segment to work on; its RWI state decides whether the
 *                producer threads and the count-based sorting are used
 * @param word0 the raw input string
 */
public DidYouMean(final Segment segment, final String word0) {
    final int rawLength = word0.length();
    this.endsWithSpace = rawLength > 0 && word0.charAt(rawLength - 1) == ' ';
    this.word = new StringBuilder(word0.trim());
    this.resultSet = Collections.synchronizedSortedSet(
            new TreeSet<StringBuilder>(new headMatchingComparator(this.word, WORD_LENGTH_COMPARATOR)));
    this.wordLen = this.word.length();
    this.segment = segment;
    this.more = segment.connectedRWI() && segment.RWICount() > 0; // with RWIs connected the guessing is super-fast

    // identify language
    if (this.wordLen > 0) {
        this.alphabet = selectAlphabet(this.word.charAt(0));
    }
}

/**
 * Chooses the alphabet for a word that starts with the given character.
 * ASCII upper-case letters are lower-cased first. The first predefined alphabet that contains
 * the character wins; characters below 'A' fall back to the Latin alphabet; for everything
 * else a generic alphabet made of the 256-character block around the character is generated.
 *
 * @param firstChar first character of the trimmed input
 * @return a private copy of the selected alphabet
 */
private static char[] selectAlphabet(final char firstChar) {
    char probe = firstChar;
    if (probe >= 'A' && probe <= 'Z') {
        probe = (char) (probe + 32);
    }
    for (final char[] candidate : ALPHABETS) {
        if (isAlphabet(candidate, probe)) {
            return candidate.clone();
        }
    }
    if (probe < 'A') {
        return ALPHABET_LATIN.clone();
    }
    // generate generic alphabet using simply a character block of 256 characters
    // test this with /suggest.json?q=%EF%BD%84
    final int blockStart = (0xff & (probe / 256)) * 256;
    final char[] generic = new char[256];
    for (int k = 0; k < generic.length; k++) {
        generic[k] = (char) (blockStart + k);
    }
    return generic;
}
2011-11-09 15:42:55 +01:00
2010-11-21 04:39:53 +01:00
private static final boolean isAlphabet ( final char [ ] alpha , final char testchar ) {
2012-01-05 08:35:44 +01:00
for ( final char a : alpha ) {
if ( a = = testchar ) {
return true ;
}
}
2010-10-12 10:36:33 +02:00
return false ;
}
2011-11-09 15:42:55 +01:00
2010-04-13 03:16:09 +02:00
/**
 * Removes all suggestions collected so far from the result set.
 */
public void reset() {
    final SortedSet<StringBuilder> collected = this.resultSet;
    collected.clear();
}
2011-11-09 15:42:55 +01:00
2010-04-13 03:16:09 +02:00
/ * *
* get suggestions for a given word . The result is first ordered using a term size ordering ,
* and a subset of the result is sorted again with a IO - intensive order based on the index size
2010-10-12 10:36:33 +02:00
* @param word0
2010-04-13 03:16:09 +02:00
* @param timeout
* @param preSortSelection the number of words that participate in the IO - intensive sort
* @return
* /
2014-02-04 00:18:11 +01:00
public Collection < StringBuilder > getSuggestions ( final long timeout , final int preSortSelection ) {
2012-11-19 17:24:34 +01:00
if ( this . word . length ( ) < MinimumInputWordLength ) {
2012-01-05 08:35:44 +01:00
return this . resultSet ; // return nothing if input is too short
}
2010-11-21 04:39:53 +01:00
final long startTime = System . currentTimeMillis ( ) ;
final long timelimit = startTime + timeout ;
2014-02-03 12:44:52 +01:00
int lastIndexOfSpace = this . word . lastIndexOf ( " " ) ;
2015-02-09 18:45:07 +01:00
final Collection < StringBuilder > preSorted ;
2014-02-03 12:44:52 +01:00
if ( lastIndexOfSpace > 0 ) {
2015-02-09 18:45:07 +01:00
// several words
preSorted = getSuggestions ( this . word . substring ( 0 , lastIndexOfSpace ) , this . word . substring ( lastIndexOfSpace + 1 ) , timeout , preSortSelection , this . segment ) ;
} else {
2015-04-10 15:59:18 +02:00
if ( this . endsWithSpace ) {
preSorted = getSuggestions ( this . word . toString ( ) , " " , timeout , preSortSelection , this . segment ) ;
} else {
preSorted = getSuggestions ( timeout ) ;
}
2012-01-05 08:35:44 +01:00
}
2011-11-09 15:42:55 +01:00
final ReversibleScoreMap < StringBuilder > scored = new ClusteredScoreMap < StringBuilder > ( StringBuilderComparator . CASE_INSENSITIVE_ORDER ) ;
2015-02-09 18:45:07 +01:00
LinkedHashSet < StringBuilder > countSorted = new LinkedHashSet < StringBuilder > ( ) ;
2014-02-04 00:18:11 +01:00
if ( this . more ) {
final int wc = this . segment . getWordCountGuess ( this . word . toString ( ) ) ; // all counts must be greater than this
try {
for ( final StringBuilder s : preSorted ) {
if ( System . currentTimeMillis ( ) > timelimit ) break ;
if ( ! ( scored . sizeSmaller ( 2 * preSortSelection ) ) ) break ;
String s0 = s . toString ( ) ;
int wcg = s0 . indexOf ( ' ' ) > 0 ? s0 . length ( ) * 100 : this . segment . getWordCountGuess ( s0 ) ;
if ( wcg > wc ) scored . inc ( s , wcg ) ;
}
} catch ( final ConcurrentModificationException e ) {
2012-01-05 08:35:44 +01:00
}
2014-02-04 00:18:11 +01:00
Iterator < StringBuilder > i = scored . keys ( false ) ;
while ( i . hasNext ( ) ) countSorted . add ( i . next ( ) ) ;
} else {
try {
for ( final StringBuilder s : preSorted ) {
if ( StringBuilderComparator . CASE_INSENSITIVE_ORDER . startsWith ( s , this . word ) | |
2015-02-09 18:45:07 +01:00
StringBuilderComparator . CASE_INSENSITIVE_ORDER . endsWith ( this . word , s ) ) countSorted . add ( this . word ) ;
}
for ( final StringBuilder s : preSorted ) {
if ( ! StringBuilderComparator . CASE_INSENSITIVE_ORDER . equals ( s , this . word ) ) countSorted . add ( s ) ;
2014-02-04 00:18:11 +01:00
}
} catch ( final ConcurrentModificationException e ) {
2012-01-05 08:35:44 +01:00
}
2010-04-13 03:16:09 +02:00
}
2010-10-13 00:02:10 +02:00
// finished
2013-07-09 14:28:25 +02:00
ConcurrentLog . info ( " DidYouMean " , " found " + preSorted . size ( ) + " unsorted terms, returned " + countSorted . size ( ) + " sorted suggestions; execution time: "
2010-11-04 01:25:19 +01:00
+ ( System . currentTimeMillis ( ) - startTime ) + " ms " ) ;
2010-10-13 00:02:10 +02:00
2010-04-13 03:16:09 +02:00
return countSorted ;
}
2011-11-09 15:42:55 +01:00
2010-04-13 03:16:09 +02:00
/ * *
* return a string that is a suggestion list for the list of given words
2015-02-09 18:45:07 +01:00
* @param head - the sequence of words before the last space in the sequence , fixed ( not to be corrected ) ; possibly empty
* @param tail - the word after the last space , possibly empty or misspelled
* @param timeout for operation
* @param preSortSelection - number of suggestions to be computed
2010-04-13 03:16:09 +02:00
* @return
* /
2014-02-04 00:18:11 +01:00
private static Collection < StringBuilder > getSuggestions ( final String head , final String tail , final long timeout , final int preSortSelection , final Segment segment ) {
2011-11-09 15:42:55 +01:00
final SortedSet < StringBuilder > result = new TreeSet < StringBuilder > ( StringBuilderComparator . CASE_INSENSITIVE_ORDER ) ;
2014-02-03 18:49:03 +01:00
int count = 30 ;
2014-02-03 12:44:52 +01:00
final SolrQuery solrQuery = new SolrQuery ( ) ;
solrQuery . setParam ( " defType " , " edismax " ) ;
solrQuery . setFacet ( false ) ;
2015-04-10 15:59:18 +02:00
String q = " " , fq = " " ;
if ( head . length ( ) = = 0 & & tail . length ( ) > 0 ) {
// head == "", tail != "" -> only one word was entered, no space at end
q = CollectionSchema . title . getSolrFieldName ( ) + " : \" " + tail + " \" ^1000.0 " + CollectionSchema . text_t . getSolrFieldName ( ) + " : " + tail + " ~ " ;
fq = null ;
}
if ( head . length ( ) > 0 & & tail . length ( ) = = 0 ) {
// head != "", tail == "" -> only one word was entered and ends on space
q = CollectionSchema . title . getSolrFieldName ( ) + " : \" " + head + " \" ^1000.0 " + CollectionSchema . text_t . getSolrFieldName ( ) + " : \" " + head + " \" " ;
fq = CollectionSchema . text_t . getSolrFieldName ( ) + " : \" " + head + " \" " ;
}
if ( head . length ( ) > 0 & & tail . length ( ) > 0 ) {
// head != "", tail != "" -> several words were entered, last one is in tail, everything before in head.
q = CollectionSchema . text_t . getSolrFieldName ( ) + " :( " + head + " " + tail + " )~ " ; // for a fuzzy search we cannot apply fuzzyness on the tail only
fq = CollectionSchema . text_t . getSolrFieldName ( ) + " : \" " + head + " \" " ;
}
2015-02-09 18:45:07 +01:00
solrQuery . setQuery ( q ) ;
if ( head . length ( ) > 0 & & fq ! = null ) solrQuery . setFilterQueries ( fq ) ;
2014-02-03 12:44:52 +01:00
solrQuery . setStart ( 0 ) ;
solrQuery . setRows ( count ) ;
solrQuery . setHighlight ( true ) ;
2015-04-10 15:59:18 +02:00
//solrQuery.setHighlightFragsize(head.length() + tail.length() + 180);
2014-02-03 12:44:52 +01:00
solrQuery . setHighlightSimplePre ( " <b> " ) ;
solrQuery . setHighlightSimplePost ( " </b> " ) ;
2015-04-10 15:59:18 +02:00
solrQuery . setHighlightSnippets ( 5 ) ;
//solrQuery.addHighlightField(CollectionSchema.title.getSolrFieldName());
2014-02-03 12:44:52 +01:00
solrQuery . addHighlightField ( CollectionSchema . text_t . getSolrFieldName ( ) ) ;
solrQuery . setFields ( ) ; // no fields wanted! only snippets
OrderedScoreMap < String > snippets = new OrderedScoreMap < String > ( null ) ;
try {
QueryResponse response = segment . fulltext ( ) . getDefaultConnector ( ) . getResponseByParams ( solrQuery ) ;
2015-04-10 15:59:18 +02:00
/ *
SolrQuery query = new SolrQuery ( ) ;
query . setRequestHandler ( " /suggest " ) ;
//query.setQueryType(suggestHandler);
query . setQuery ( ( head + " " + tail ) . trim ( ) ) ;
Map < String , String > params = new HashMap < String , String > ( ) ;
params . put ( CommonParams . ROWS , Integer . toString ( count ) ) ;
params . put ( SpellingParams . SPELLCHECK_PREFIX + " field " , dictionary ) ;
params . put ( SpellingParams . SPELLCHECK_PREFIX + " dictionary " , dictionary ) ;
params . put ( SpellingParams . SPELLCHECK_ONLY_MORE_POPULAR , Boolean . toString ( onlyMorePopular ) ) ;
params . put ( SpellingParams . SPELLCHECK_MAX_COLLATION_TRIES , Integer . toString ( 1 ) ) ;
params . put ( SpellingParams . SPELLCHECK_COLLATE_EXTENDED_RESULTS , Boolean . toString ( collate ) ) ;
params . put ( SpellingParams . SPELLCHECK_COLLATE , Boolean . toString ( collate ) ) ;
query . add ( new MapSolrParams ( params ) ) ;
response = segment . fulltext ( ) . getDefaultConnector ( ) . getResponseByParams ( query ) ;
SpellCheckResponse spellCheckResponse = response . getSpellCheckResponse ( ) ;
if ( spellCheckResponse ! = null ) {
Map < String , Suggestion > suggestionMapInternal = spellCheckResponse . getSuggestionMap ( ) ;
if ( suggestionMapInternal ! = null ) {
Map < String , Suggestion > suggestionMap = spellCheckResponse . getSuggestionMap ( ) ;
}
if ( spellCheckResponse . getCollatedResult ( ) ! = null ) {
String collatedResult = spellCheckResponse . getCollatedResult ( ) . trim ( ) ;
}
List < Suggestion > suggestions = spellCheckResponse . getSuggestions ( ) ;
if ( suggestions . size ( ) ! = 0 ) {
StringBuffer sb = new StringBuffer ( ) ;
for ( Suggestion suggestion : suggestions ) {
sb . append ( suggestion . getSuggestions ( ) . get ( 0 ) ) . append ( " " ) ;
}
String spellCheckProposal = sb . toString ( ) . trim ( ) ;
}
}
* /
2014-02-03 12:44:52 +01:00
Map < String , Map < String , List < String > > > rawsnippets = response . getHighlighting ( ) ; // a map from the urlhash to a map with key=field and value = list of snippets
if ( rawsnippets ! = null ) {
for ( Map < String , List < String > > re : rawsnippets . values ( ) ) {
for ( List < String > sl : re . values ( ) ) {
for ( String s : sl ) {
2015-02-09 18:45:07 +01:00
// the suggestion for the tail is in the snippet
2014-02-03 18:49:03 +01:00
s = s . replaceAll ( " </b> <b> " , " " ) ;
2015-02-09 18:45:07 +01:00
int snippetOpen = s . indexOf ( " <b> " ) ;
int snippetClose = s . indexOf ( " </b> " ) ;
if ( snippetOpen > = 0 & & snippetClose > snippetOpen ) {
String snippet = s . substring ( snippetOpen + 3 , snippetClose ) ;
String afterSnippet = s . substring ( snippetClose + 4 ) . trim ( ) ;
s = snippet + ( afterSnippet . length ( ) > 0 ? " " + afterSnippet : " " ) ;
for ( int i = 0 ; i < s . length ( ) ; i + + ) { char c = s . charAt ( i ) ; if ( c < 'A' ) s = s . replace ( c , ' ' ) ; } // remove funny symbols
s = s . replaceAll ( " <b> " , " " ) . replaceAll ( " </b> " , " " ) . replaceAll ( " " , " " ) . trim ( ) ; // wipe superfluous whitespace
2015-01-29 02:28:03 +01:00
String [ ] sx = CommonPattern . SPACE . split ( s ) ;
2014-02-03 12:44:52 +01:00
StringBuilder sb = new StringBuilder ( s . length ( ) ) ;
for ( String x : sx ) if ( x . length ( ) > 1 & & sb . length ( ) < 28 ) sb . append ( x ) . append ( ' ' ) ; else break ;
s = sb . toString ( ) . trim ( ) ;
2015-02-09 18:45:07 +01:00
if ( s . length ( ) > 0 ) snippets . inc ( s , count - - ) ;
2014-02-03 12:44:52 +01:00
}
}
}
2012-01-05 08:35:44 +01:00
}
2009-10-05 23:51:02 +02:00
}
2014-02-03 12:44:52 +01:00
} catch ( SolrException e ) {
2015-04-10 15:59:18 +02:00
e . printStackTrace ( ) ;
2014-02-03 12:44:52 +01:00
} catch ( IOException e ) {
2015-04-10 15:59:18 +02:00
e . printStackTrace ( ) ;
2014-02-03 12:44:52 +01:00
}
2015-04-10 15:59:18 +02:00
// delete all snippets which occur double-times, i.e. one that is a substring of another: remove longer snippet
2014-02-03 12:44:52 +01:00
Iterator < String > si = snippets . keys ( false ) ;
2015-04-10 15:59:18 +02:00
while ( si . hasNext ( ) ) {
String testsnippet = si . next ( ) . toLowerCase ( ) ;
if ( testsnippet . length ( ) > head . length ( ) + tail . length ( ) + 1 ) {
Iterator < String > sin = snippets . keys ( false ) ;
while ( sin . hasNext ( ) ) {
String snippetx = sin . next ( ) ;
if ( snippetx . length ( ) ! = testsnippet . length ( ) & & snippetx . toLowerCase ( ) . startsWith ( testsnippet ) ) {
snippets . delete ( snippetx ) ;
}
}
}
}
si = snippets . keys ( false ) ;
2015-02-09 18:45:07 +01:00
while ( si . hasNext ( ) & & result . size ( ) < preSortSelection ) {
result . add ( new StringBuilder ( si . next ( ) ) ) ;
2009-10-05 23:51:02 +02:00
}
return result ;
}
2011-11-09 15:42:55 +01:00
2010-04-13 03:16:09 +02:00
/ * *
* This method triggers the producer and consumer threads of the DidYouMean object .
* @param word a String with a single word
* @param timeout execution time in ms .
* @return a Set & lt ; String & gt ; with word variations contained in term index .
* /
2015-02-09 18:45:07 +01:00
private Collection < StringBuilder > getSuggestions ( final long timeout ) {
2011-11-09 15:42:55 +01:00
final long startTime = System . currentTimeMillis ( ) ;
2010-04-13 03:16:09 +02:00
this . timeLimit = startTime + timeout ;
2014-02-04 00:18:11 +01:00
Thread [ ] producers = null ;
if ( this . more ) {
// create and start producers
// the CPU load to create the guessed words is very low, but the testing
// against the library may be CPU intensive. Since it is possible to test
// words in the library concurrently, it is a good idea to start separate threads
producers = new Thread [ 4 ] ;
producers [ 0 ] = new ChangingOneLetter ( ) ;
producers [ 1 ] = new AddingOneLetter ( ) ;
producers [ 2 ] = new DeletingOneLetter ( ) ;
producers [ 3 ] = new ReversingTwoConsecutiveLetters ( ) ;
for ( final Thread t : producers ) {
t . start ( ) ;
2012-01-05 08:35:44 +01:00
}
2009-09-02 15:41:56 +02:00
}
2011-11-09 15:42:55 +01:00
2014-02-04 00:18:11 +01:00
test ( this . word ) ;
2015-02-09 18:45:07 +01:00
this . resultSet . addAll ( getSuggestions ( " " , this . word . toString ( ) , timeout , 10 , this . segment ) ) ;
2014-02-04 00:18:11 +01:00
if ( this . more ) {
// finish the producer
for ( final Thread t : producers ) {
long wait = this . timeLimit - System . currentTimeMillis ( ) ;
if ( wait > 0 ) try {
t . join ( wait ) ;
} catch ( final InterruptedException e ) { }
2012-01-05 08:35:44 +01:00
}
2009-09-02 15:41:56 +02:00
}
2014-02-04 00:18:11 +01:00
2010-04-13 03:16:09 +02:00
// we don't want the given word in the result
2010-10-12 10:36:33 +02:00
this . resultSet . remove ( this . word ) ;
2010-04-13 03:16:09 +02:00
return this . resultSet ;
}
2011-11-09 15:42:55 +01:00
2014-02-04 00:18:11 +01:00
private void test ( final StringBuilder s ) {
2011-11-09 15:42:55 +01:00
final Set < StringBuilder > libr = LibraryProvider . dymLib . recommend ( s ) ;
2010-05-16 01:49:30 +02:00
libr . addAll ( LibraryProvider . geoLoc . recommend ( s ) ) ;
2011-11-09 15:42:55 +01:00
for ( final StringBuilder t : libr ) {
2014-02-04 00:18:11 +01:00
if ( t . length ( ) > = MinimumOutputWordLength ) this . resultSet . add ( t ) ;
2010-04-13 03:25:15 +02:00
}
2010-04-13 03:16:09 +02:00
}
2014-02-04 00:18:11 +01:00
2010-04-13 03:16:09 +02:00
/ * *
2009-06-27 09:23:34 +02:00
* DidYouMean ' s producer thread that changes one letter ( e . g . bat / cat ) for a given term
* based on the given alphabet and puts it on the blocking queue , to be ' consumed ' by a consumer thread . < p / >
* < b > Note : < / b > the loop runs ( alphabet . length * len ) tests .
2010-04-13 03:16:09 +02:00
* /
public class ChangingOneLetter extends Thread {
@Override
public void run ( ) {
2010-10-12 11:45:15 +02:00
char m ;
2012-01-05 08:35:44 +01:00
for ( int i = 0 ; i < DidYouMean . this . wordLen ; i + + ) {
2014-02-04 00:18:11 +01:00
m = DidYouMean . this . word . charAt ( i ) ;
for ( final char c : DidYouMean . this . alphabet ) {
if ( m ! = c ) {
final StringBuilder ts = new StringBuilder ( DidYouMean . this . word . length ( ) + 1 ) . append ( DidYouMean . this . word . substring ( 0 , i ) ) . append ( c ) . append ( DidYouMean . this . word . substring ( i + 1 ) ) ;
test ( ts ) ;
2011-11-09 15:42:55 +01:00
}
2014-02-04 00:18:11 +01:00
if ( System . currentTimeMillis ( ) > DidYouMean . this . timeLimit ) return ;
}
2012-01-05 08:35:44 +01:00
}
2010-04-13 03:16:09 +02:00
}
}
2011-11-09 15:42:55 +01:00
2009-06-27 09:23:34 +02:00
/ * *
* DidYouMean ' s producer thread that deletes extra letters ( e . g . frog / fog ) for a given term
* and puts it on the blocking queue , to be ' consumed ' by a consumer thread . < p / >
* < b > Note : < / b > the loop runs ( len ) tests .
* /
2010-10-12 10:36:33 +02:00
private class DeletingOneLetter extends Thread {
2014-02-04 00:18:11 +01:00
@Override
public void run ( ) {
for ( int i = 0 ; i < DidYouMean . this . wordLen ; i + + ) {
final StringBuilder ts = new StringBuilder ( DidYouMean . this . word . length ( ) + 1 ) . append ( DidYouMean . this . word . substring ( 0 , i ) ) . append ( DidYouMean . this . word . substring ( i + 1 ) ) ;
test ( ts ) ;
if ( System . currentTimeMillis ( ) > DidYouMean . this . timeLimit ) return ;
2010-04-13 03:16:09 +02:00
}
2014-02-04 00:18:11 +01:00
}
2009-06-12 22:36:03 +02:00
}
2011-11-09 15:42:55 +01:00
2009-06-27 09:23:34 +02:00
/ * *
* DidYouMean ' s producer thread that adds missing letters ( e . g . bat / boat ) for a given term
* based on the given alphabet and puts it on the blocking queue , to be ' consumed ' by a consumer thread . < p / >
* < b > Note : < / b > the loop runs ( alphabet . length * len ) tests .
* /
2010-10-12 10:36:33 +02:00
private class AddingOneLetter extends Thread {
2014-02-04 00:18:11 +01:00
@Override
public void run ( ) {
for ( int i = 0 ; i < = DidYouMean . this . wordLen ; i + + ) {
for ( final char c : DidYouMean . this . alphabet ) {
final StringBuilder ts = new StringBuilder ( DidYouMean . this . word . length ( ) + 1 ) . append ( DidYouMean . this . word . substring ( 0 , i ) ) . append ( c ) . append ( DidYouMean . this . word . substring ( i ) ) ;
test ( ts ) ;
if ( System . currentTimeMillis ( ) > DidYouMean . this . timeLimit ) return ;
2012-01-05 08:35:44 +01:00
}
2010-04-13 03:16:09 +02:00
}
2014-02-04 00:18:11 +01:00
}
2009-06-12 22:36:03 +02:00
}
2011-11-09 15:42:55 +01:00
2009-06-27 09:23:34 +02:00
/ * *
* DidYouMean ' s producer thread that reverses any two consecutive letters ( e . g . two / tow ) for a given term
* and puts it on the blocking queue , to be ' consumed ' by a consumer thread . < p / >
* < b > Note : < / b > the loop runs ( len - 1 ) tests .
* /
2010-10-12 10:36:33 +02:00
private class ReversingTwoConsecutiveLetters extends Thread {
2012-01-05 08:35:44 +01:00
@Override
2014-02-04 00:18:11 +01:00
public void run ( ) {
for ( int i = 0 ; i < DidYouMean . this . wordLen - 1 ; i + + ) {
final StringBuilder ts = new StringBuilder ( DidYouMean . this . word . length ( ) + 1 ) . append ( DidYouMean . this . word . substring ( 0 , i ) ) . append ( DidYouMean . this . word . charAt ( i + 1 ) ) . append ( DidYouMean . this . word . charAt ( i ) ) . append ( DidYouMean . this . word . substring ( i + 2 ) ) ;
test ( ts ) ;
if ( System . currentTimeMillis ( ) > DidYouMean . this . timeLimit ) return ;
2012-01-05 08:35:44 +01:00
}
2011-11-09 15:42:55 +01:00
}
2014-02-04 00:18:11 +01:00
}
2011-11-09 15:42:55 +01:00
2009-09-02 15:41:56 +02:00
/ * *
2010-10-18 13:35:09 +02:00
* wordLengthComparator is used by DidYouMean to order terms by the term length
2009-09-02 15:41:56 +02:00
* This is the default order if the indexSizeComparator is not used
* /
2011-11-09 15:42:55 +01:00
private static class wordLengthComparator implements Comparator < StringBuilder > {
2012-01-05 08:35:44 +01:00
@Override
2011-11-09 15:42:55 +01:00
public int compare ( final StringBuilder o1 , final StringBuilder o2 ) {
2009-09-02 15:41:56 +02:00
final int i1 = o1 . length ( ) ;
final int i2 = o2 . length ( ) ;
2012-01-05 08:35:44 +01:00
if ( i1 = = i2 ) {
return StringBuilderComparator . CASE_INSENSITIVE_ORDER . compare ( o1 , o2 ) ;
}
2010-10-18 13:35:09 +02:00
return ( i1 < i2 ) ? 1 : - 1 ; // '<' is correct, because the longest word shall be first
2010-04-13 03:16:09 +02:00
}
2009-09-02 15:41:56 +02:00
}
2009-06-27 09:23:34 +02:00
2010-10-18 13:35:09 +02:00
/ * *
* headMatchingComparator is used to sort results in such a way that words that match with the given words are sorted first
* /
2011-11-09 15:42:55 +01:00
private static class headMatchingComparator implements Comparator < StringBuilder > {
private final StringBuilder head ;
private final Comparator < StringBuilder > secondaryComparator ;
public headMatchingComparator ( final StringBuilder head , final Comparator < StringBuilder > secondaryComparator ) {
this . head = head ;
2010-10-18 13:35:09 +02:00
this . secondaryComparator = secondaryComparator ;
}
2011-11-09 15:42:55 +01:00
2012-01-05 08:35:44 +01:00
@Override
2011-11-09 15:42:55 +01:00
public int compare ( final StringBuilder o1 , final StringBuilder o2 ) {
final boolean o1m = StringBuilderComparator . CASE_INSENSITIVE_ORDER . startsWith ( o1 , this . head ) ;
final boolean o2m = StringBuilderComparator . CASE_INSENSITIVE_ORDER . startsWith ( o2 , this . head ) ;
2012-01-05 08:35:44 +01:00
if ( ( o1m & & o2m ) | | ( ! o1m & & ! o2m ) ) {
return this . secondaryComparator . compare ( o1 , o2 ) ;
}
2010-10-18 13:35:09 +02:00
return o1m ? - 1 : 1 ;
}
}
2011-11-09 15:42:55 +01:00
2009-06-12 22:36:03 +02:00
}
2009-06-14 13:53:09 +02:00