yacy_search_server/source/de/anomic/data/DidYouMean.java

260 lines
8.1 KiB
Java
Raw Normal View History

package de.anomic.data;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.LinkedBlockingQueue;
import de.anomic.document.Word;
import de.anomic.kelondro.text.IndexCell;
import de.anomic.kelondro.text.referencePrototype.WordReference;
import de.anomic.yacy.logging.Log;
/**
* People make mistakes when they type words.
* The most common mistakes are the four categories listed below:
* <ol>
* <li>Changing one letter: bat / cat;</li>
* <li>Adding one letter: bat / boat;</li>
* <li>Deleting one letter: frog / fog; or</li>
* <li>Reversing two consecutive letters: two / tow.</li>
* </ol>
* DidYouMean provides producer threads that feed a blocking queue with word variations according to
* the four categories mentioned above. Consumer threads then check the generated word variations against a term index.
* Only words contained in the term index are returned by the getSuggestion method.<p/>
* @author apfelmaennchen
*/
public class DidYouMean {

    /** Letters used for substitutions and insertions: a-z plus the characters U+00E4, U+00F6, U+00FC and U+00DF. */
    protected static final char[] alphabet = {'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p',
        'q','r','s','t','u','v','w','x','y','z','\u00e4','\u00f6','\u00fc','\u00df'};

    /** Number of processors reported by the runtime; one consumer thread is started per processor. */
    public static final int availableCPU = Runtime.getRuntime().availableProcessors();

    /** Pause in ms between two checks of the queue/producer state while waiting for the workers. */
    private static final long POLL_INTERVAL_MS = 1;

    /** Upper bound in ms for waiting on interrupted consumers to finish the lookup they are currently in. */
    private static final long CONSUMER_GRACE_MS = 50;

    /** Word variations handed from the producer threads to the consumer threads. */
    final LinkedBlockingQueue<String> queue = new LinkedBlockingQueue<String>();

    /**
     * Variations that were found in the index. This class never clears the set, so calling
     * getSuggestion several times on the same instance accumulates the results of all calls.
     */
    protected final Set<String> set;

    /** Term index the generated variations are checked against. */
    protected final IndexCell<WordReference> index;

    /** Lower-cased word of the current getSuggestion call; read by the producer threads. */
    protected String word;

    /** Length of {@link #word}; read by the producer threads. */
    protected int len;

    /**
     * @param index a termIndex - most likely retrieved from a switchboard object.
     * @param sort true/false - sorts the resulting TreeSet by index.count(); <b>Warning:</b> this causes heavy i/o.
     */
    public DidYouMean(final IndexCell<WordReference> index, boolean sort) {
        if (sort) {
            this.set = Collections.synchronizedSortedSet(new TreeSet<String>(new wordSizeComparator()));
        } else {
            this.set = Collections.synchronizedSet(new HashSet<String>());
        }
        this.word = "";
        this.len = 0;
        this.index = index;
    }

    /**
     * Creates an instance whose result set is unsorted.
     * @param index a termIndex - most likely retrieved from a switchboard object.
     */
    public DidYouMean(final IndexCell<WordReference> index) {
        this(index, false);
    }

    /**
     * This method triggers the 4 producer threads and the consumer threads (one per available processor)
     * of DidYouMean.
     * <p/><b>Note:</b> the default timeout is 500ms
     * @param word a String with a single word
     * @return a Set&lt;String&gt; with word variations contained in index.
     */
    public Set<String> getSuggestion(final String word) {
        return getSuggestion(word, 500);
    }

    /**
     * This method triggers the 4 producer threads and the consumer threads (one per available processor)
     * of the DidYouMean object. It returns as soon as the queue is drained and all producers have finished,
     * or when the timeout has elapsed, whichever happens first. After that the workers are interrupted and
     * the consumers get a short, bounded grace period to finish the lookup they are currently executing.
     * @param word a String with a single word
     * @param timeout execution time in ms.
     * @return a Set&lt;String&gt; with word variations contained in term index; the given word itself is removed.
     */
    public Set<String> getSuggestion(final String word, long timeout) {
        final long startTime = System.currentTimeMillis();
        this.word = word.toLowerCase();
        // Measure the lower-cased word: the producers index into this.word, not into the argument,
        // and lower-casing may change the number of chars.
        this.len = this.word.length();

        // create producers
        // the intention of the 4 producers is to mix results, as there
        // is currently no default sorting or ranking due to the i/o performance of index.count()
        final Thread[] producers = new Thread[] {
            new ChangingOneLetter(),
            new AddingOneLetter(),
            new DeletingOneLetter(),
            new ReversingTwoConsecutiveLetters()
        };
        for (int i = 0; i < producers.length; i++) {
            producers[i].start();
        }

        // create and start consumer threads
        final Thread[] consumers = new Thread[availableCPU];
        for (int i = 0; i < consumers.length; i++) {
            consumers[i] = new Consumer();
            consumers[i].start();
        }

        // Wait until the timeout is reached or there is provably nothing left to do.
        // The producer state is re-evaluated on every pass so that the early exit can actually fire.
        while ((System.currentTimeMillis() - startTime) < timeout) {
            if (queue.isEmpty() && !anyAlive(producers)) {
                break;
            }
            try {
                // do not burn a CPU core that the worker threads could use
                Thread.sleep(POLL_INTERVAL_MS);
            } catch (InterruptedException e) {
                // keep the interrupt visible to the caller and stop waiting
                Thread.currentThread().interrupt();
                break;
            }
        }

        // interrupt all consumer threads
        for (int i = 0; i < consumers.length; i++) {
            consumers[i].interrupt();
        }
        // interrupt all remaining producer threads
        for (int i = 0; i < producers.length; i++) {
            producers[i].interrupt();
        }

        // An empty queue only means the last terms were taken, not that they were already checked.
        awaitExit(consumers, CONSUMER_GRACE_MS);

        this.set.remove(this.word);
        Log.logInfo("DidYouMean", "found "+this.set.size()+" terms; execution time: "
            +(System.currentTimeMillis()-startTime)+"ms"+ " - remaining queue size: "+queue.size());
        return this.set;
    }

    /** Returns true if at least one of the given threads is still alive. */
    private static boolean anyAlive(final Thread[] threads) {
        for (int i = 0; i < threads.length; i++) {
            if (threads[i].isAlive()) {
                return true;
            }
        }
        return false;
    }

    /** Waits at most graceMillis in total for the given threads to terminate. */
    private static void awaitExit(final Thread[] threads, final long graceMillis) {
        final long deadline = System.currentTimeMillis() + graceMillis;
        for (int i = 0; i < threads.length; i++) {
            final long remaining = deadline - System.currentTimeMillis();
            if (remaining <= 0) {
                // never call join(0): that would wait without any bound
                return;
            }
            try {
                threads[i].join(remaining);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                return;
            }
        }
    }

    /**
     * DidYouMean's producer thread that changes one letter (e.g. bat/cat) for a given term
     * based on the given alphabet and puts it on the blocking queue, to be 'consumed' by a consumer thread.<p/>
     * <b>Note:</b> the loop runs (alphabet.length * len) tests.
     */
    public class ChangingOneLetter extends Thread {
        @Override
        public void run() {
            String s;
            for (int i = 0; i < len; i++) {
                for (int j = 0; j < alphabet.length; j++) {
                    s = word.substring(0, i) + alphabet[j] + word.substring(i + 1);
                    try {
                        queue.put(s);
                    } catch (InterruptedException e) {
                        // interrupted by getSuggestion(): stop producing
                        return;
                    }
                }
            }
        }
    }

    /**
     * DidYouMean's producer thread that deletes extra letters (e.g. frog/fog) for a given term
     * and puts it on the blocking queue, to be 'consumed' by a consumer thread.<p/>
     * <b>Note:</b> the loop runs (len) tests.
     */
    protected class DeletingOneLetter extends Thread {
        @Override
        public void run() {
            String s;
            for (int i = 0; i < len; i++) {
                s = word.substring(0, i) + word.substring(i + 1);
                try {
                    queue.put(s);
                } catch (InterruptedException e) {
                    // interrupted by getSuggestion(): stop producing
                    return;
                }
            }
        }
    }

    /**
     * DidYouMean's producer thread that adds missing letters (e.g. bat/boat) for a given term
     * based on the given alphabet and puts it on the blocking queue, to be 'consumed' by a consumer thread.<p/>
     * <b>Note:</b> the loop runs (alphabet.length * (len + 1)) tests, as a letter can also be appended
     * after the last position.
     */
    protected class AddingOneLetter extends Thread {
        @Override
        public void run() {
            String s;
            for (int i = 0; i <= len; i++) {
                for (int j = 0; j < alphabet.length; j++) {
                    s = word.substring(0, i) + alphabet[j] + word.substring(i);
                    try {
                        queue.put(s);
                    } catch (InterruptedException e) {
                        // interrupted by getSuggestion(): stop producing
                        return;
                    }
                }
            }
        }
    }

    /**
     * DidYouMean's producer thread that reverses any two consecutive letters (e.g. two/tow) for a given term
     * and puts it on the blocking queue, to be 'consumed' by a consumer thread.<p/>
     * <b>Note:</b> the loop runs (len-1) tests.
     */
    protected class ReversingTwoConsecutiveLetters extends Thread {
        @Override
        public void run() {
            String s;
            for (int i = 0; i < len - 1; i++) {
                s = word.substring(0, i) + word.charAt(i + 1) + word.charAt(i) + word.substring(i + 2);
                try {
                    queue.put(s);
                } catch (InterruptedException e) {
                    // interrupted by getSuggestion(): stop producing
                    return;
                }
            }
        }
    }

    /**
     * DidYouMean's consumer thread takes a String object (term) from the blocking queue
     * and checks via index.has() whether it is contained in the index. The thread recognizes "\n"
     * as poison pill and terminates when it takes one from the queue; it also terminates when it is
     * interrupted while waiting for the next term.
     */
    class Consumer extends Thread {
        @Override
        public void run() {
            try {
                while (true) {
                    final String s = queue.take();
                    if (s.equals("\n")) {
                        // poison pill: stop this consumer
                        return;
                    }
                    consume(s);
                }
            } catch (InterruptedException e) {
                // interrupted by getSuggestion(): regular shutdown signal
                return;
            }
        }

        /**
         * Adds the given term to the result set if the index contains it.
         * @param s a generated word variation
         */
        void consume(String s) {
            if (index.has(Word.word2hash(s))) {
                set.add(s);
            }
        }
    }

    /**
     * wordSizeComparator is used by DidYouMean to order terms by index.count(), highest count first.
     * Different terms with the same count are ordered lexicographically, so that the sorted set keeps
     * both of them instead of treating them as duplicates.<p/>
     * <b>Warning:</b> this causes heavy i/o
     */
    protected class wordSizeComparator implements Comparator<String> {
        public int compare(final String o1, final String o2) {
            if (o1.equals(o2)) {
                // same term: no need to ask the index
                return 0;
            }
            final int c1 = index.count(Word.word2hash(o1));
            final int c2 = index.count(Word.word2hash(o2));
            if (c1 != c2) {
                // descending by count
                return (c1 > c2) ? -1 : 1;
            }
            // Tie-break: returning 0 here would make a TreeSet drop one of two different terms.
            return o1.compareTo(o2);
        }
    }
}