yacy_search_server/source/net/yacy/data/DidYouMean.java
Michael Peter Christen 535f1ebe3b added a new way of content browsing in search results:
- date navigation

The date is taken from the CONTENT of the documents / web pages, NOT
from a date submitted in the context of metadata (i.e. http header or
html head form). This makes it possible to search for documents in the
future, i.e. when documents contain event descriptions for future
events.

The date is written to an index field which is now enabled by default.
All documents are scanned for contained date mentions.
To visualize the dates for a specific search result, a histogram
showing the number of documents for each day is displayed. To render
these histograms the morris.js library is used. Morris.js requires also
raphael.js which is now also integrated in YaCy.

The histogram is now also displayed in the index browser by default.

To select a specific range from a search result, the following modifiers
have been introduced:
from:<date>
to:<date>
These modifiers can be used separately (i.e. only 'from' or only 'to')
to describe an open interval or combined to have a closed interval. Both
dates are inclusive. To select a specific single date only, use the
'to:' - modifier.

The histogram shows blue and green lines; the green lines denote weekend
days (Saturday and Sunday).

Clicking on bars in the histogram has the following reaction:
1st click: add a from:<date> modifier for the date of the bar
2nd click: add a to:<date> modifier for the date of the bar
3rd click: remove the from and to modifiers and set an on:<date> modifier for the bar
When the on:<date> modifier is used, the histogram shows an unlimited
time period. This makes it possible to click again (4th click) which is
then interpreted as a 1st click again (sets a from modifier).

The display feature is NOT switched on by default; to switch it on use
the /ConfigSearchPage_p.html servlet.
2015-03-02 04:30:10 +01:00

438 lines
21 KiB
Java

package net.yacy.data;
import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.ConcurrentModificationException;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrException;
import net.yacy.cora.sorting.ClusteredScoreMap;
import net.yacy.cora.sorting.OrderedScoreMap;
import net.yacy.cora.sorting.ReversibleScoreMap;
import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.StringBuilderComparator;
import net.yacy.document.LibraryProvider;
import net.yacy.search.index.Segment;
import net.yacy.search.schema.CollectionSchema;
/**
* People make mistakes when they type words.
* The most common mistakes are the four categories listed below:
* <ol>
* <li>Changing one letter: bat / cat;</li>
* <li>Adding one letter: bat / boat;</li>
* <li>Deleting one letter: frog / fog; or</li>
* <li>Reversing two consecutive letters: two / tow.</li>
* </ol>
* DidYouMean provides producer threads, that feed a blocking queue with word variations according to
* the above mentioned four categories. Consumer threads check then the generated word variations against a term index.
* Only words contained in the term index are returned by the getSuggestions method.<p/>
* @author apfelmaennchen
* @author orbiter (extensions for multi-language support + multi-word suggestions)
*/
public class DidYouMean {
/** Input shorter than this many characters never produces suggestions. */
private static final int MinimumInputWordLength = 2;
/** Library recommendations shorter than this many characters are dropped. */
private static final int MinimumOutputWordLength = 4;
/** Latin letters a-z plus the lower-case Latin-1 letters (sharp s, accented vowels, ...). */
private static final char[] ALPHABET_LATIN = {
    'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p',
    'q','r','s','t','u','v','w','x','y','z',
    '\u00df',
    '\u00e0','\u00e1','\u00e2','\u00e3','\u00e4','\u00e5','\u00e6','\u00e7',
    '\u00e8','\u00e9','\u00ea','\u00eb','\u00ec','\u00ed','\u00ee','\u00ef',
    '\u00f0','\u00f1','\u00f2','\u00f3','\u00f4','\u00f5','\u00f6',
    '\u00f8','\u00f9','\u00fa','\u00fb','\u00fc','\u00fd','\u00fe','\u00ff'};
private static final char[] ALPHABET_KANJI = new char[512]; // \u3400-\u34ff + \u4e00-\u4eff
private static final char[] ALPHABET_HIRAGANA = new char[96]; // \u3040-\u309F
private static final char[] ALPHABET_KATAKANA = new char[96]; // \u30A0-\u30FF
private static final char[] ALPHABET_CJK_UNIFIED_IDEOGRAPHS_Part1 = new char[5376]; // \u4E00-\u62FF
private static final char[] ALPHABET_CJK_UNIFIED_IDEOGRAPHS_Part2 = new char[5376]; // \u6300-\u77FF
private static final char[] ALPHABET_CJK_UNIFIED_IDEOGRAPHS_Part3 = new char[5376]; // \u7800-\u8CFF
private static final char[] ALPHABET_CJK_UNIFIED_IDEOGRAPHS_Part4 = new char[4864]; // \u8D00-\u9FFF
static {
    // Every array slot is addressed by the plain offset from the start of its
    // code point range. The offsets must NOT be masked with 0xff: the CJK ranges
    // span many 256-character pages and masking would fold all of them onto the
    // first 256 slots, leaving the rest of the array filled with NUL characters.

    // this is very experimental: a very small subset of Kanji
    for (char a = '\u3400'; a <= '\u34ff'; a++) ALPHABET_KANJI[a - '\u3400'] = a;
    // second Kanji page goes into the upper half (slots 256..511) of the array
    for (char a = '\u4e00'; a <= '\u4eff'; a++) ALPHABET_KANJI[256 + (a - '\u4e00')] = a;
    for (char a = '\u3040'; a <= '\u309F'; a++) ALPHABET_HIRAGANA[a - '\u3040'] = a;
    for (char a = '\u30A0'; a <= '\u30FF'; a++) ALPHABET_KATAKANA[a - '\u30A0'] = a;
    for (char a = '\u4E00'; a <= '\u62FF'; a++) ALPHABET_CJK_UNIFIED_IDEOGRAPHS_Part1[a - '\u4E00'] = a;
    for (char a = '\u6300'; a <= '\u77FF'; a++) ALPHABET_CJK_UNIFIED_IDEOGRAPHS_Part2[a - '\u6300'] = a;
    for (char a = '\u7800'; a <= '\u8CFF'; a++) ALPHABET_CJK_UNIFIED_IDEOGRAPHS_Part3[a - '\u7800'] = a;
    for (char a = '\u8D00'; a <= '\u9FFF'; a++) ALPHABET_CJK_UNIFIED_IDEOGRAPHS_Part4[a - '\u8D00'] = a;
}
/** All known alphabets in the order in which the constructor probes them. */
private static final char[][] ALPHABETS = {
    ALPHABET_LATIN, ALPHABET_KANJI, ALPHABET_HIRAGANA, ALPHABET_KATAKANA,
    ALPHABET_CJK_UNIFIED_IDEOGRAPHS_Part1, ALPHABET_CJK_UNIFIED_IDEOGRAPHS_Part2, ALPHABET_CJK_UNIFIED_IDEOGRAPHS_Part3, ALPHABET_CJK_UNIFIED_IDEOGRAPHS_Part4};
/** Number of processors reported by the runtime when this class was loaded. */
public static final int AVAILABLE_CPU = Runtime.getRuntime().availableProcessors();
private static final wordLengthComparator WORD_LENGTH_COMPARATOR = new wordLengthComparator();
/** Index segment used for word count guesses and for the Solr based suggestion query. */
private final Segment segment;
/** The trimmed input for which suggestions are computed. */
private final StringBuilder word;
/** Length of {@link #word}, cached for the producer threads. */
private final int wordLen;
/** Absolute deadline (as in System.currentTimeMillis()) after which the producer threads stop. */
private long timeLimit;
/** Collected suggestions; created in the constructor as a synchronized sorted set. */
private final SortedSet<StringBuilder> resultSet;
/** Alphabet used to generate word variations; stays null when the input word is empty. */
private char[] alphabet;
/** True if the constructor found a connected, non-empty RWI index on the segment. */
private boolean more;
/**
 * Prepares a suggestion run for one input word or phrase and selects the
 * alphabet that is used to generate word variations.
 * The alphabet is chosen from the first character of the trimmed input
 * (upper-case A-Z is treated like lower-case): the first known alphabet that
 * contains the character wins; characters below 'A' fall back to the Latin
 * alphabet; everything else gets a generic alphabet made of the 256-character
 * block that contains the character.
 * @param segment the index segment that is queried for word counts and snippets
 * @param word0 the word or phrase to find suggestions for; surrounding whitespace is trimmed
 */
public DidYouMean(final Segment segment, final String word0) {
    this.segment = segment;
    this.word = new StringBuilder(word0.trim());
    this.wordLen = this.word.length();
    this.resultSet = Collections.synchronizedSortedSet(new TreeSet<StringBuilder>(new headMatchingComparator(this.word, WORD_LENGTH_COMPARATOR)));
    this.more = segment.connectedRWI() && segment.RWICount() > 0; // with RWIs connected the guessing is super-fast
    if (this.wordLen == 0) {
        return; // nothing to identify; the alphabet stays unset
    }

    // identify language
    char probe = this.word.charAt(0);
    if (probe >= 'A' && probe <= 'Z') {
        probe = (char) (probe - 'A' + 'a');
    }
    char[] chosen = null;
    for (int k = 0; k < ALPHABETS.length && chosen == null; k++) {
        if (isAlphabet(ALPHABETS[k], probe)) {
            chosen = ALPHABETS[k].clone(); // private copy of the matching alphabet
        }
    }
    if (chosen == null) {
        if (probe < 'A') {
            chosen = ALPHABET_LATIN.clone();
        } else {
            // generate generic alphabet using simply a character block of 256 characters
            // test this with /suggest.json?q=%EF%BD%84
            final int blockStart = probe & 0xff00;
            chosen = new char[256];
            for (int offset = 0; offset < 256; offset++) {
                chosen[offset] = (char) (blockStart + offset);
            }
        }
    }
    this.alphabet = chosen;
}
/**
 * Tells whether a character belongs to an alphabet.
 * @param alpha the alphabet to search
 * @param testchar the character to look for
 * @return true if testchar occurs anywhere in alpha
 */
private static final boolean isAlphabet(final char[] alpha, final char testchar) {
    int pos = 0;
    while (pos < alpha.length && alpha[pos] != testchar) {
        pos++;
    }
    // the scan stopped early exactly when the character was found
    return pos < alpha.length;
}
/**
 * Removes all suggestions collected so far from the internal result set.
 */
public void reset() {
this.resultSet.clear();
}
/**
 * Get suggestions for the word given to the constructor. The candidates are first
 * ordered using a term size ordering, and a subset of them is sorted again with an
 * IO-intensive order based on the index size.
 * If the input contains a space, everything before the last space is treated as a
 * fixed head and only the last word is corrected; otherwise the single-word
 * producers are used.
 * @param timeout time in milliseconds that may be spent on the computation
 * @param preSortSelection the number of words that participate in the IO-intensive sort
 * @return the suggestions in presentation order; if the input is shorter than
 *         the minimum input word length the internal result set is returned unchanged
 */
public Collection<StringBuilder> getSuggestions(final long timeout, final int preSortSelection) {
    if (this.word.length() < MinimumInputWordLength) {
        return this.resultSet; // return nothing if input is too short
    }
    final long startTime = System.currentTimeMillis();
    final long timelimit = startTime + timeout;
    final int lastIndexOfSpace = this.word.lastIndexOf(" ");
    final Collection<StringBuilder> preSorted;
    if (lastIndexOfSpace > 0) {
        // several words
        preSorted = getSuggestions(this.word.substring(0, lastIndexOfSpace), this.word.substring(lastIndexOfSpace + 1), timeout, preSortSelection, this.segment);
    } else {
        preSorted = getSuggestions(timeout);
    }

    // Producer threads that did not finish within the time limit may still add
    // entries to the (synchronized) result set. Iterating that set directly is
    // not safe, so take a snapshot: toArray() of the synchronized wrapper copies
    // the content while holding the set's lock.
    final StringBuilder[] candidates = preSorted.toArray(new StringBuilder[0]);

    final LinkedHashSet<StringBuilder> countSorted = new LinkedHashSet<StringBuilder>();
    if (this.more) {
        final ReversibleScoreMap<StringBuilder> scored = new ClusteredScoreMap<StringBuilder>(StringBuilderComparator.CASE_INSENSITIVE_ORDER);
        final int wc = this.segment.getWordCountGuess(this.word.toString()); // all counts must be greater than this
        for (final StringBuilder s: candidates) {
            if (System.currentTimeMillis() > timelimit) break;
            if (!(scored.sizeSmaller(2 * preSortSelection))) break;
            final String s0 = s.toString();
            // phrases cannot be counted in the word index; rate them by their length instead
            final int wcg = s0.indexOf(' ') > 0 ? s0.length() * 100 : this.segment.getWordCountGuess(s0);
            if (wcg > wc) scored.inc(s, wcg);
        }
        final Iterator<StringBuilder> i = scored.keys(false);
        while (i.hasNext()) countSorted.add(i.next());
    } else {
        // if any candidate starts with the input (or the input ends with a candidate),
        // the input itself is placed first
        for (final StringBuilder s: candidates) {
            if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(s, this.word) ||
                StringBuilderComparator.CASE_INSENSITIVE_ORDER.endsWith(this.word, s)) {
                countSorted.add(this.word);
                break; // adding the same instance again would not change the set
            }
        }
        for (final StringBuilder s: candidates) {
            if (!StringBuilderComparator.CASE_INSENSITIVE_ORDER.equals(s, this.word)) countSorted.add(s);
        }
    }

    // finished
    ConcurrentLog.info("DidYouMean", "found " + candidates.length + " unsorted terms, returned " + countSorted.size() + " sorted suggestions; execution time: "
        + (System.currentTimeMillis() - startTime) + "ms");
    return countSorted;
}
/**
 * Computes suggestions for a sequence of words by running a fuzzy Solr query and
 * harvesting the highlighted snippets of the answer.
 * @param head - the sequence of words before the last space in the sequence, fixed (not to be corrected); possibly empty
 * @param tail - the word after the last space, possibly misspelled; must not contain a space
 * @param timeout for operation; NOTE(review): currently not applied to the Solr request
 * @param preSortSelection - maximum number of suggestions to be returned
 * @param segment the index segment whose default Solr connector is queried
 * @return the suggestions in case-insensitive order; empty if the query failed or nothing was highlighted
 */
private static Collection<StringBuilder> getSuggestions(final String head, final String tail, final long timeout, final int preSortSelection, final Segment segment) {
    final SortedSet<StringBuilder> result = new TreeSet<StringBuilder>(StringBuilderComparator.CASE_INSENSITIVE_ORDER);
    final int rows = 30;
    final SolrQuery solrQuery = new SolrQuery();
    solrQuery.setParam("defType", "edismax");
    solrQuery.setFacet(false);
    assert tail.length() > 0 && tail.indexOf(' ') < 0; // if there would be a space it should be part of head

    final String textField = CollectionSchema.text_t.getSolrFieldName();
    final String titleField = CollectionSchema.title.getSolrFieldName();
    final boolean hasHead = head.length() > 0;
    // NOTE(review): head and tail are inserted into the query unescaped; input that
    // contains Solr query syntax characters may fail to parse or change the query.
    final String q = hasHead
        ? titleField + ":\"" + head + "\"^10 " + textField + ":(" + head + " " + tail + ")~" // for a fuzzy search we cannot apply fuzzyness on the tail only
        : textField + ":" + tail + "~";
    solrQuery.setQuery(q);
    if (hasHead) {
        // in all cases there must be that portion, but that is not part of the snippet that we are searching
        solrQuery.setFilterQueries(textField + ":\"" + head + "\"");
    }
    solrQuery.setStart(0);
    solrQuery.setRows(rows);
    solrQuery.setHighlight(true);
    solrQuery.setHighlightFragsize(head.length() + tail.length() + 80);
    solrQuery.setHighlightSimplePre("<b>");
    solrQuery.setHighlightSimplePost("</b>");
    solrQuery.setHighlightSnippets(1);
    solrQuery.addHighlightField(titleField);
    solrQuery.addHighlightField(textField);
    solrQuery.setFields(); // no fields wanted! only snippets

    final OrderedScoreMap<String> snippets = new OrderedScoreMap<String>(null);
    int score = rows; // earlier snippets get the higher score
    try {
        final QueryResponse response = segment.fulltext().getDefaultConnector().getResponseByParams(solrQuery);
        final Map<String, Map<String, List<String>>> rawsnippets = response.getHighlighting(); // a map from the urlhash to a map with key=field and value = list of snippets
        if (rawsnippets != null) {
            for (final Map<String, List<String>> re: rawsnippets.values()) {
                for (final List<String> sl: re.values()) {
                    for (final String raw: sl) {
                        // the suggestion for the tail is in the snippet
                        final String suggestion = extractSuggestion(raw);
                        if (suggestion.length() > 0) snippets.inc(suggestion, score--);
                    }
                }
            }
        }
    } catch (final SolrException e) {
        // best effort: a failed query only means that there are no snippet based suggestions
        ConcurrentLog.warn("DidYouMean", "suggestion query failed: " + e.getMessage());
    } catch (final IOException e) {
        ConcurrentLog.warn("DidYouMean", "suggestion query failed: " + e.getMessage());
    }

    final Iterator<String> si = snippets.keys(false);
    while (si.hasNext() && result.size() < preSortSelection) {
        result.add(new StringBuilder(si.next()));
    }
    return result;
}

/**
 * Turns one highlighted snippet into a suggestion: the text starting at the first
 * highlighted term, reduced to words of at least two characters and cut off once
 * 28 characters have been collected.
 * @param rawSnippet a snippet in which matches are enclosed in &lt;b&gt; tags
 * @return the suggestion, or an empty string if the snippet has no highlighted term
 */
private static String extractSuggestion(final String rawSnippet) {
    String s = rawSnippet.replaceAll("</b> <b>", " "); // join directly adjacent highlights
    final int snippetOpen = s.indexOf("<b>");
    final int snippetClose = s.indexOf("</b>");
    if (snippetOpen < 0 || snippetClose <= snippetOpen) return "";

    final String snippet = s.substring(snippetOpen + 3, snippetClose);
    final String afterSnippet = s.substring(snippetClose + 4).trim();
    s = snippet + (afterSnippet.length() > 0 ? " " + afterSnippet : "");

    // The remaining highlight tags must go BEFORE the symbols are wiped: wiping
    // also blanks '<', '/' and '>' and would leave stray "b" tokens behind.
    s = s.replaceAll("<b>", " ").replaceAll("</b>", " ");
    final StringBuilder wiped = new StringBuilder(s.length());
    for (int i = 0; i < s.length(); i++) {
        final char c = s.charAt(i);
        wiped.append(c < 'A' ? ' ' : c); // remove funny symbols
    }
    s = wiped.toString().replaceAll(" +", " ").trim(); // wipe superfluous whitespace

    final StringBuilder sb = new StringBuilder(s.length());
    for (final String x: CommonPattern.SPACE.split(s)) {
        if (x.length() <= 1 || sb.length() >= 28) break;
        sb.append(x).append(' ');
    }
    return sb.toString().trim();
}
/**
 * Collects single-word suggestions for the word given to the constructor.
 * If the RWI index is available, four producer threads generate word variations
 * (changed, added, deleted and swapped letters) and test them against the
 * libraries. Independently of that the word itself is tested and a fuzzy Solr
 * query is evaluated. The method waits for the producers no longer than until
 * the time limit is reached.
 * @param timeout execution time in ms.
 * @return the internal result set with the found variations, without the given word itself
 */
private Collection<StringBuilder> getSuggestions(final long timeout) {
    final long startTime = System.currentTimeMillis();
    this.timeLimit = startTime + timeout;

    Thread[] producers = null;
    if (this.more) {
        // create and start producers
        // the CPU load to create the guessed words is very low, but the testing
        // against the library may be CPU intensive. Since it is possible to test
        // words in the library concurrently, it is a good idea to start separate threads
        producers = new Thread[4];
        producers[0] = new ChangingOneLetter();
        producers[1] = new AddingOneLetter();
        producers[2] = new DeletingOneLetter();
        producers[3] = new ReversingTwoConsecutiveLetters();
        for (final Thread t: producers) {
            t.start();
        }
    }

    test(this.word);
    this.resultSet.addAll(getSuggestions("", this.word.toString(), timeout, 10, this.segment));

    if (producers != null) {
        // finish the producer
        for (final Thread t: producers) {
            final long wait = this.timeLimit - System.currentTimeMillis();
            if (wait <= 0) break; // deadline reached, do not wait for the remaining producers
            try {
                t.join(wait);
            } catch (final InterruptedException e) {
                // keep the interrupt visible to the caller and stop waiting;
                // the producers terminate on their own when the time limit is reached
                Thread.currentThread().interrupt();
                break;
            }
        }
    }

    // we don't want the given word in the result
    this.resultSet.remove(this.word);
    return this.resultSet;
}
/**
 * Asks the dym library and the geo location library for recommendations for a
 * word variation and stores every recommendation that is long enough in the
 * result set.
 * @param s the word variation to look up
 */
private void test(final StringBuilder s) {
    final Set<StringBuilder> recommendations = LibraryProvider.dymLib.recommend(s);
    recommendations.addAll(LibraryProvider.geoLoc.recommend(s));
    final Iterator<StringBuilder> it = recommendations.iterator();
    while (it.hasNext()) {
        final StringBuilder candidate = it.next();
        if (candidate.length() < MinimumOutputWordLength) {
            continue; // too short to be a useful suggestion
        }
        this.resultSet.add(candidate);
    }
}
/**
 * DidYouMean's producer thread that changes one letter (e.g. bat/cat) of the given term
 * using every character of the selected alphabet and tests each variation.
 * The thread ends as soon as the time limit is exceeded.<p/>
 * <b>Note:</b> the loop runs (alphabet.length * len) tests.
 */
public class ChangingOneLetter extends Thread {
    @Override
    public void run() {
        for (int pos = 0; pos < DidYouMean.this.wordLen; pos++) {
            final char current = DidYouMean.this.word.charAt(pos);
            for (final char replacement: DidYouMean.this.alphabet) {
                if (replacement != current) {
                    // copy of the word with the letter at pos exchanged
                    final StringBuilder variation = new StringBuilder(DidYouMean.this.word);
                    variation.setCharAt(pos, replacement);
                    test(variation);
                }
                if (System.currentTimeMillis() > DidYouMean.this.timeLimit) {
                    return;
                }
            }
        }
    }
}
/**
 * DidYouMean's producer thread that deletes one letter (e.g. frog/fog) of the given term
 * at every position and tests each variation.
 * The thread ends as soon as the time limit is exceeded.<p/>
 * <b>Note:</b> the loop runs (len) tests.
 */
private class DeletingOneLetter extends Thread {
    @Override
    public void run() {
        int pos = 0;
        while (pos < DidYouMean.this.wordLen) {
            // copy of the word without the letter at pos
            final StringBuilder variation = new StringBuilder(DidYouMean.this.word);
            variation.deleteCharAt(pos);
            test(variation);
            if (System.currentTimeMillis() > DidYouMean.this.timeLimit) {
                return;
            }
            pos++;
        }
    }
}
/**
 * DidYouMean's producer thread that adds one letter (e.g. bat/boat) to the given term,
 * inserting every character of the selected alphabet at every position
 * (including the end of the word), and tests each variation.
 * The thread ends as soon as the time limit is exceeded.<p/>
 * <b>Note:</b> the loop runs (alphabet.length * len) tests.
 */
private class AddingOneLetter extends Thread {
    @Override
    public void run() {
        for (int pos = 0; pos <= DidYouMean.this.wordLen; pos++) {
            for (final char extra: DidYouMean.this.alphabet) {
                // copy of the word with the extra letter placed in front of position pos
                final StringBuilder variation = new StringBuilder(DidYouMean.this.word);
                variation.insert(pos, extra);
                test(variation);
                if (System.currentTimeMillis() > DidYouMean.this.timeLimit) {
                    return;
                }
            }
        }
    }
}
/**
 * DidYouMean's producer thread that swaps two consecutive letters (e.g. two/tow) of the
 * given term at every position and tests each variation.
 * The thread ends as soon as the time limit is exceeded.<p/>
 * <b>Note:</b> the loop runs (len-1) tests.
 */
private class ReversingTwoConsecutiveLetters extends Thread {
    @Override
    public void run() {
        final int lastPair = DidYouMean.this.wordLen - 1;
        for (int pos = 0; pos < lastPair; pos++) {
            // copy of the word with the letters at pos and pos + 1 exchanged
            final StringBuilder variation = new StringBuilder(DidYouMean.this.word);
            variation.setCharAt(pos, DidYouMean.this.word.charAt(pos + 1));
            variation.setCharAt(pos + 1, DidYouMean.this.word.charAt(pos));
            test(variation);
            if (System.currentTimeMillis() > DidYouMean.this.timeLimit) {
                return;
            }
        }
    }
}
/**
 * wordLengthComparator orders terms by their length, longest term first.
 * Terms of equal length are ordered case-insensitively.
 */
private static class wordLengthComparator implements Comparator<StringBuilder> {
    @Override
    public int compare(final StringBuilder o1, final StringBuilder o2) {
        final int lengthDelta = o2.length() - o1.length();
        if (lengthDelta != 0) {
            // a positive delta means o1 is the shorter word and has to come later,
            // because the longest word shall be first
            return lengthDelta > 0 ? 1 : -1;
        }
        return StringBuilderComparator.CASE_INSENSITIVE_ORDER.compare(o1, o2);
    }
}
/**
 * headMatchingComparator sorts words that start (case-insensitively) with a given head
 * in front of all other words. Within each of the two groups the order is
 * decided by a secondary comparator.
 */
private static class headMatchingComparator implements Comparator<StringBuilder> {
    private final StringBuilder prefix;
    private final Comparator<StringBuilder> fallback;

    public headMatchingComparator(final StringBuilder head, final Comparator<StringBuilder> secondaryComparator) {
        this.prefix = head;
        this.fallback = secondaryComparator;
    }

    @Override
    public int compare(final StringBuilder o1, final StringBuilder o2) {
        final boolean firstMatches = StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(o1, this.prefix);
        final boolean secondMatches = StringBuilderComparator.CASE_INSENSITIVE_ORDER.startsWith(o2, this.prefix);
        if (firstMatches != secondMatches) {
            // exactly one of the words matches the head: that one goes first
            return firstMatches ? -1 : 1;
        }
        return this.fallback.compare(o1, o2);
    }
}
}