Fewer word hash computations (removing some overhead caused by MD5 calculations) by using the clear word in a normalized form.
orbiter 2013-11-25 15:20:54 +01:00
parent f23471c471
commit 61409788eb
5 changed files with 112 additions and 47 deletions
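
In outline, the change replaces membership tests on MD5-derived word hashes with tests on lower-cased clear words held in sorted sets, so word lookups no longer pay for a digest per word. A minimal sketch of the trade-off in plain JDK types (class name and sample words are illustrative; YaCy's real hash comes from Word.word2hash):

    import java.nio.charset.StandardCharsets;
    import java.security.MessageDigest;
    import java.security.NoSuchAlgorithmException;
    import java.util.Locale;
    import java.util.Set;
    import java.util.TreeSet;

    public class WordMatchSketch {

        // old style: every membership test first computes a digest of the word
        static byte[] md5Hash(String word) throws NoSuchAlgorithmException {
            return MessageDigest.getInstance("MD5").digest(word.getBytes(StandardCharsets.UTF_8));
        }

        public static void main(String[] args) throws NoSuchAlgorithmException {
            byte[] h = md5Hash("yacy");   // old path: hash first, then look up the hash
            System.out.println(h.length); // 16 bytes of MD5 work per word

            // new style: normalize once, then compare plain strings in a sorted set
            Set<String> queryWords = new TreeSet<String>();
            queryWords.add("yacy".toLowerCase(Locale.ENGLISH));
            System.out.println(queryWords.contains("YaCy".toLowerCase(Locale.ENGLISH))); // true
        }
    }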

htroot/yacysearch.java

@@ -524,9 +524,9 @@ public class yacysearch {
final int maxDistance = (querystring.indexOf('"', 0) >= 0) ? qg.getIncludeHashes().size() - 1 : Integer.MAX_VALUE;
// filter out stopwords
- final SortedSet<String> filtered = SetTools.joinConstructiveByTest(qg.getIncludeStrings(), Switchboard.stopwords); //find matching stopwords
+ final SortedSet<String> filtered = SetTools.joinConstructiveByTest(qg.getIncludeWords(), Switchboard.stopwords); //find matching stopwords
if ( !filtered.isEmpty() ) {
- SetTools.excludeDestructiveByTestSmallInLarge(qg.getIncludeStrings(), filtered); //remove stopwords
+ SetTools.excludeDestructiveByTestSmallInLarge(qg.getIncludeWords(), filtered); //remove stopwords
}
// if a minus-button was hit, remove a special reference first
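
The stopword handling above is an intersection (find the stopwords that occur among the include words) followed by a difference (remove them). SetTools.joinConstructiveByTest and SetTools.excludeDestructiveByTestSmallInLarge are YaCy helpers; a plain-JDK sketch of the same effect, with made-up sample words:

    import java.util.Arrays;
    import java.util.SortedSet;
    import java.util.TreeSet;

    public class StopwordFilterSketch {
        public static void main(String[] args) {
            SortedSet<String> includeWords = new TreeSet<String>(Arrays.asList("the", "yacy", "search"));
            SortedSet<String> stopwords = new TreeSet<String>(Arrays.asList("the", "and", "for"));

            SortedSet<String> filtered = new TreeSet<String>(includeWords);
            filtered.retainAll(stopwords);                             // find matching stopwords: [the]
            if (!filtered.isEmpty()) includeWords.removeAll(filtered); // remove stopwords

            System.out.println(includeWords); // [search, yacy]
        }
    }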

source/net/yacy/document/Condenser.java

@@ -34,6 +34,7 @@ import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
+ import java.util.SortedMap;
import java.util.SortedSet;
import java.util.TreeMap;
@@ -48,6 +49,7 @@ import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.solr.Ranking;
import net.yacy.cora.language.synonyms.SynonymLibrary;
import net.yacy.cora.lod.vocabulary.Tagging;
+ import net.yacy.cora.order.NaturalOrder;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.language.Identificator;
import net.yacy.document.parser.html.ImageEntry;
@@ -73,7 +75,7 @@ public final class Condenser {
public static final int flag_cat_hasapp = 23; // the page refers to (at least one) application file
//private Properties analysis;
- private final Map<String, Word> words; // a string (the words) to (indexWord) - relation
+ private final SortedMap<String, Word> words; // a string (the words) to (indexWord) - relation
private final Map<String, Set<Tagging.Metatag>> tags = new HashMap<String, Set<Tagging.Metatag>>(); // a set of tags, discovered from Autotagging
private final Set<String> synonyms; // a set of synonyms to the words
private long fuzzy_signature = 0, exact_signature = 0; // signatures for double-check detection
@@ -97,7 +99,7 @@
Thread.currentThread().setName("condenser-" + document.dc_identifier()); // for debugging
// if addMedia == true, then all the media links are also parsed and added to the words
// added media words are flagged with the appropriate media flag
- this.words = new HashMap<String, Word>();
+ this.words = new TreeMap<String, Word>(NaturalOrder.naturalComparator);
this.synonyms = new LinkedHashSet<String>();
this.RESULT_FLAGS = new Bitfield(4);
@@ -297,7 +299,7 @@
return oldsize - this.words.size();
}
- public Map<String, Word> words() {
+ public SortedMap<String, Word> words() {
// returns the words as word/indexWord relation map
return this.words;
}
@@ -458,7 +460,7 @@
this.RESULT_DIFF_SENTENCES = sentenceHandleCount;
}
- public static Map<String, Word> getWords(final String text, final WordCache meaningLib) {
+ public static SortedMap<String, Word> getWords(final String text, final WordCache meaningLib) {
// returns a word/indexWord relation map
if (text == null) return null;
return new Condenser(text, meaningLib, false).words();
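
The Condenser now keeps its word/indexWord relation in a TreeMap ordered by NaturalOrder.naturalComparator, so keySet() yields the condensed words as an ordered set that later set operations can consume without any hashing. A sketch with the JDK's natural String order standing in for the YaCy comparator (sample words are illustrative):

    import java.util.NavigableSet;
    import java.util.TreeMap;

    public class SortedWordsSketch {
        public static void main(String[] args) {
            TreeMap<String, Integer> words = new TreeMap<String, Integer>();
            words.put("search", 3);
            words.put("engine", 1);
            words.put("yacy", 2);

            // a TreeMap's key set is a NavigableSet (and thus a SortedSet),
            // so the words can be treated as an ordered set directly
            NavigableSet<String> ordered = words.navigableKeySet();
            System.out.println(ordered); // [engine, search, yacy]
        }
    }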

source/net/yacy/search/query/QueryGoal.java

@@ -24,13 +24,17 @@ package net.yacy.search.query;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.ArrayList;
+ import java.util.Collection;
+ import java.util.Locale;
import java.util.Map;
import java.util.SortedSet;
+ import java.util.TreeSet;
import net.yacy.cora.document.WordCache;
import net.yacy.cora.federate.solr.Ranking;
import net.yacy.cora.federate.solr.SchemaDeclaration;
import net.yacy.cora.federate.solr.SolrType;
+ import net.yacy.cora.order.NaturalOrder;
import net.yacy.cora.storage.HandleSet;
import net.yacy.document.parser.html.AbstractScraper;
import net.yacy.document.parser.html.CharacterCoding;
@@ -49,14 +53,43 @@ public class QueryGoal {
private String query_original;
private HandleSet include_hashes, exclude_hashes;
- private final ArrayList<String> include_words, exclude_words;
+ private final NormalizedWords include_words, exclude_words;
private final ArrayList<String> include_strings, exclude_strings;
+ public static class NormalizedWords extends TreeSet<String> {
+ private static final long serialVersionUID = -3050851079671868007L;
+ public NormalizedWords() {
+ super(NaturalOrder.naturalComparator);
+ }
+ public NormalizedWords(String[] rawWords) {
+ super(NaturalOrder.naturalComparator);
+ for (String word: rawWords) super.add(word.toLowerCase(Locale.ENGLISH));
+ }
+ public NormalizedWords(Collection<String> rawWords) {
+ super(NaturalOrder.naturalComparator);
+ for (String word: rawWords) super.add(word.toLowerCase(Locale.ENGLISH));
+ }
+ @Override
+ public boolean add(String word) {
+ return super.add(word.toLowerCase(Locale.ENGLISH));
+ }
+ @Override
+ public boolean contains(Object word) {
+ if (!(word instanceof String)) return false;
+ return super.contains(((String) word).toLowerCase(Locale.ENGLISH));
+ }
+ }
public QueryGoal(HandleSet include_hashes, HandleSet exclude_hashes) {
this.query_original = null;
- this.include_words = new ArrayList<String>();
- this.exclude_words = new ArrayList<String>();
+ this.include_words = new NormalizedWords();
+ this.exclude_words = new NormalizedWords();
this.include_strings = new ArrayList<String>();
this.exclude_strings = new ArrayList<String>();
this.include_hashes = include_hashes;
@@ -67,8 +100,8 @@
assert query_original != null;
assert query_words != null;
this.query_original = query_original;
- this.include_words = new ArrayList<String>();
- this.exclude_words = new ArrayList<String>();
+ this.include_words = new NormalizedWords();
+ this.exclude_words = new NormalizedWords();
this.include_strings = new ArrayList<String>();
this.exclude_strings = new ArrayList<String>();
@@ -88,8 +121,8 @@
for (String s: this.include_strings) parseQuery(s, this.include_words, this.include_words);
for (String s: this.exclude_strings) parseQuery(s, this.exclude_words, this.exclude_words);
- WordCache.learn(this.include_strings);
- WordCache.learn(this.exclude_strings);
+ WordCache.learn(this.include_words);
+ WordCache.learn(this.exclude_words);
this.include_hashes = null;
this.exclude_hashes = null;
@@ -107,7 +140,7 @@
* sq = '\''
* dq = '"'
*/
- private static void parseQuery(String s, ArrayList<String> include_string, ArrayList<String> exclude_string) {
+ private static void parseQuery(String s, Collection<String> include_string, Collection<String> exclude_string) {
while (s.length() > 0) {
// parse query
int p = 0;
@@ -155,15 +188,51 @@
return ret;
}
+ /**
+ * @return a set of hashes of words to be included in the search result.
+ * if possible, use getIncludeWords instead
+ */
public HandleSet getIncludeHashes() {
if (include_hashes == null) include_hashes = Word.words2hashesHandles(include_words);
return include_hashes;
}
+ /**
+ * @return a set of hashes of words to be excluded in the search result
+ * if possible, use getExcludeWords instead
+ */
public HandleSet getExcludeHashes() {
if (exclude_hashes == null) exclude_hashes = Word.words2hashesHandles(exclude_words);
return exclude_hashes;
}
+ /**
+ * @return a set of words to be included in the search result
+ */
+ public NormalizedWords getIncludeWords() {
+ return include_words;
+ }
+ /**
+ * @return a set of words to be excluded in the search result
+ */
+ public NormalizedWords getExcludeWords() {
+ return exclude_words;
+ }
+ /**
+ * @return a list of include strings which reproduces the original order of the search words and quotation
+ */
+ public ArrayList<String> getIncludeStrings() {
+ return include_strings;
+ }
+ /**
+ * @return a list of exclude strings which reproduces the original order of the search words and quotation
+ */
+ public ArrayList<String> getExcludeStrings() {
+ return exclude_strings;
+ }
/**
* the include string may be useful (and better) for highlight/snippet computation
@@ -175,14 +244,6 @@
for (String s: this.include_strings) sb.append(s).append(' ');
return sb.toString().substring(0, sb.length() - 1);
}
- public ArrayList<String> getIncludeStrings() {
- return include_strings;
- }
- public ArrayList<String> getExcludeStrings() {
- return exclude_strings;
- }
public boolean isCatchall() {
if (include_strings.size() != 1 || exclude_strings.size() != 0) return false;
@@ -205,6 +266,10 @@
public void filterOut(final SortedSet<String> blueList) {
// filter out words that appear in this set
// this is applied to the queryHashes
+ for (String word: blueList) {
+ this.include_words.remove(word);
+ this.include_strings.remove(word);
+ }
final HandleSet blues = Word.words2hashesHandles(blueList);
for (final byte[] b: blues) this.include_hashes.remove(b);
}
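
A usage note on the NormalizedWords class introduced above: add() and contains() both lower-case their argument, so lookups are case-insensitive no matter how the word was written, and contains() simply returns false for non-String arguments. For example (sample words are illustrative):

    QueryGoal.NormalizedWords words = new QueryGoal.NormalizedWords();
    words.add("YaCy");
    words.add("Search");

    System.out.println(words.contains("yacy"));   // true: both sides are normalized
    System.out.println(words.contains("SEARCH")); // true
    System.out.println(words.first());            // "search": kept in sorted order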

source/net/yacy/search/query/QueryParams.java

@@ -32,6 +32,7 @@ import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;
+ import java.util.SortedSet;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
@@ -318,19 +319,17 @@
*/
private final boolean matchesText(final String text) {
boolean ret = false;
- final HandleSet wordhashes = Word.words2hashesHandles(Condenser.getWords(text, null).keySet());
- if (!SetTools.anymatch(wordhashes, this.queryGoal.getExcludeHashes())) {
- ret = SetTools.totalInclusion(this.queryGoal.getIncludeHashes(), wordhashes);
+ QueryGoal.NormalizedWords words = new QueryGoal.NormalizedWords(Condenser.getWords(text, null).keySet());
+ if (!SetTools.anymatch(words, this.queryGoal.getExcludeWords())) {
+ ret = SetTools.totalInclusion(this.queryGoal.getIncludeWords(), words);
}
return ret;
}
- protected static final boolean anymatch(final String text, final HandleSet keyhashes) {
- // returns true if any of the word hashes in keyhashes appear in the String text
- // to do this, all words in the string must be recognized and transcoded to word hashes
- if (keyhashes == null || keyhashes.isEmpty()) return false;
- final HandleSet wordhashes = Word.words2hashesHandles(Condenser.getWords(text, null).keySet());
- return SetTools.anymatch(wordhashes, keyhashes);
+ protected static final boolean anymatch(final String text, final QueryGoal.NormalizedWords keywords) {
+ if (keywords == null || keywords.isEmpty()) return false;
+ final SortedSet<String> textwords = (SortedSet<String>) Condenser.getWords(text, null).keySet();
+ return SetTools.anymatch(textwords, keywords);
}
public SolrQuery solrQuery(final ContentDomain cd, final boolean getFacets, final boolean excludeintext_image) {
@@ -344,7 +343,7 @@
if (!getFacets) this.cachedQuery.setFacet(false);
return this.cachedQuery;
}
- if (this.queryGoal.getIncludeStrings().size() == 0) return null;
+ if (this.queryGoal.getIncludeWords().size() == 0) return null;
// construct query
final SolrQuery params = getBasicParams(getFacets);
@@ -369,7 +368,7 @@
if (!getFacets) this.cachedQuery.setFacet(false);
return this.cachedQuery;
}
- if (this.queryGoal.getIncludeStrings().size() == 0) return null;
+ if (this.queryGoal.getIncludeWords().size() == 0) return null;
// construct query
final SolrQuery params = getBasicParams(getFacets);
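
Two details in this file are worth noting. The cast (SortedSet<String>) Condenser.getWords(text, null).keySet() succeeds at runtime because getWords() now returns a TreeMap, whose key set implements NavigableSet; calling navigableKeySet() would express the same thing without the unchecked cast. Further, because both word sets are ordered by the same comparator, an any-overlap test can run as a single linear merge; the sketch below illustrates that idea and is not necessarily how YaCy's SetTools.anymatch is implemented:

    import java.util.Arrays;
    import java.util.Comparator;
    import java.util.Iterator;
    import java.util.SortedSet;
    import java.util.TreeSet;

    public class AnymatchSketch {

        // true if the two sorted sets share any element; O(n + m) by merging
        static boolean anymatch(SortedSet<String> a, SortedSet<String> b) {
            if (a == null || b == null || a.isEmpty() || b.isEmpty()) return false;
            Comparator<? super String> cmp = a.comparator();
            Iterator<String> ia = a.iterator(), ib = b.iterator();
            String x = ia.next(), y = ib.next();
            while (true) {
                int c = (cmp != null) ? cmp.compare(x, y) : x.compareTo(y);
                if (c == 0) return true;
                if (c < 0) { if (!ia.hasNext()) return false; x = ia.next(); }
                else       { if (!ib.hasNext()) return false; y = ib.next(); }
            }
        }

        public static void main(String[] args) {
            SortedSet<String> textWords = new TreeSet<String>(Arrays.asList("open", "search", "web"));
            SortedSet<String> excludeWords = new TreeSet<String>(Arrays.asList("spam"));
            System.out.println(anymatch(textWords, excludeWords)); // false: no overlap
        }
    }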

source/net/yacy/search/query/SearchEvent.java

@@ -71,7 +71,6 @@ import net.yacy.document.LibraryProvider;
import net.yacy.document.TextParser;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.data.meta.URIMetadataRow;
- import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.data.word.WordReferenceFactory;
import net.yacy.kelondro.data.word.WordReferenceVars;
@@ -1080,10 +1079,10 @@
final String pagetitle = page.dc_title().toLowerCase();
// check exclusion
- if (!this.query.getQueryGoal().getExcludeHashes().isEmpty() &&
- ((QueryParams.anymatch(pagetitle, this.query.getQueryGoal().getExcludeHashes()))
- || (QueryParams.anymatch(pageurl.toLowerCase(), this.query.getQueryGoal().getExcludeHashes()))
- || (QueryParams.anymatch(pageauthor.toLowerCase(), this.query.getQueryGoal().getExcludeHashes())))) {
+ if (!this.query.getQueryGoal().getExcludeWords().isEmpty() &&
+ ((QueryParams.anymatch(pagetitle, this.query.getQueryGoal().getExcludeWords()))
+ || (QueryParams.anymatch(pageurl.toLowerCase(), this.query.getQueryGoal().getExcludeWords()))
+ || (QueryParams.anymatch(pageauthor.toLowerCase(), this.query.getQueryGoal().getExcludeWords())))) {
if (log.isFine()) log.fine("dropped RWI: no match with query goal exclusion");
if (page.word().local()) this.local_rwi_available.decrementAndGet(); else this.remote_rwi_available.decrementAndGet();
continue;
@@ -1294,14 +1293,14 @@
}
// apply query-in-result matching
- final HandleSet urlcomph = Word.words2hashesHandles(urlcomps);
- final HandleSet descrcomph = Word.words2hashesHandles(descrcomps);
- final Iterator<byte[]> shi = this.query.getQueryGoal().getIncludeHashes().iterator();
- byte[] queryhash;
+ final QueryGoal.NormalizedWords urlcomph = new QueryGoal.NormalizedWords(urlcomps);
+ final QueryGoal.NormalizedWords descrcomph = new QueryGoal.NormalizedWords(descrcomps);
+ final Iterator<String> shi = this.query.getQueryGoal().getIncludeWords().iterator();
+ String queryword;
while (shi.hasNext()) {
- queryhash = shi.next();
- if (urlcomph.has(queryhash)) r += 256 << this.query.ranking.coeff_appurl;
- if (descrcomph.has(queryhash)) r += 256 << this.query.ranking.coeff_app_dc_title;
+ queryword = shi.next();
+ if (urlcomph.contains(queryword)) r += 256 << this.query.ranking.coeff_appurl;
+ if (descrcomph.contains(queryword)) r += 256 << this.query.ranking.coeff_app_dc_title;
}
return r;
}
@@ -1642,7 +1641,7 @@
if ( word.length() > 2
&& "http_html_php_ftp_www_com_org_net_gov_edu_index_home_page_for_usage_the_and_zum_der_die_das_und_the_zur_bzw_mit_blog_wiki_aus_bei_off"
.indexOf(word) < 0
- && !this.query.getQueryGoal().getIncludeHashes().has(Word.word2hash(word))
+ && !this.query.getQueryGoal().getIncludeWords().contains(word)
&& lettermatch.matcher(word).matches()
&& !Switchboard.badwords.contains(word)
&& !Switchboard.stopwords.contains(word) ) {