From 61409788eb321d6af3bb691ad5f2c77de91e3aa6 Mon Sep 17 00:00:00 2001 From: orbiter Date: Mon, 25 Nov 2013 15:20:54 +0100 Subject: [PATCH] less word hash computations (removing some overhead because of MD5 calcs) using the clear word in a normalized form. --- htroot/yacysearch.java | 4 +- source/net/yacy/document/Condenser.java | 10 +- source/net/yacy/search/query/QueryGoal.java | 97 ++++++++++++++++--- source/net/yacy/search/query/QueryParams.java | 23 +++-- source/net/yacy/search/query/SearchEvent.java | 25 +++-- 5 files changed, 112 insertions(+), 47 deletions(-) diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index 9f2ba85e6..54b4cc6e9 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -524,9 +524,9 @@ public class yacysearch { final int maxDistance = (querystring.indexOf('"', 0) >= 0) ? qg.getIncludeHashes().size() - 1 : Integer.MAX_VALUE; // filter out stopwords - final SortedSet filtered = SetTools.joinConstructiveByTest(qg.getIncludeStrings(), Switchboard.stopwords); //find matching stopwords + final SortedSet filtered = SetTools.joinConstructiveByTest(qg.getIncludeWords(), Switchboard.stopwords); //find matching stopwords if ( !filtered.isEmpty() ) { - SetTools.excludeDestructiveByTestSmallInLarge(qg.getIncludeStrings(), filtered); //remove stopwords + SetTools.excludeDestructiveByTestSmallInLarge(qg.getIncludeWords(), filtered); //remove stopwords } // if a minus-button was hit, remove a special reference first diff --git a/source/net/yacy/document/Condenser.java b/source/net/yacy/document/Condenser.java index 2d19abdbe..76f432e9a 100644 --- a/source/net/yacy/document/Condenser.java +++ b/source/net/yacy/document/Condenser.java @@ -34,6 +34,7 @@ import java.util.Locale; import java.util.Map; import java.util.Properties; import java.util.Set; +import java.util.SortedMap; import java.util.SortedSet; import java.util.TreeMap; @@ -48,6 +49,7 @@ import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.federate.solr.Ranking; import net.yacy.cora.language.synonyms.SynonymLibrary; import net.yacy.cora.lod.vocabulary.Tagging; +import net.yacy.cora.order.NaturalOrder; import net.yacy.cora.util.ConcurrentLog; import net.yacy.document.language.Identificator; import net.yacy.document.parser.html.ImageEntry; @@ -73,7 +75,7 @@ public final class Condenser { public static final int flag_cat_hasapp = 23; // the page refers to (at least one) application file //private Properties analysis; - private final Map words; // a string (the words) to (indexWord) - relation + private final SortedMap words; // a string (the words) to (indexWord) - relation private final Map> tags = new HashMap>(); // a set of tags, discovered from Autotagging private final Set synonyms; // a set of synonyms to the words private long fuzzy_signature = 0, exact_signature = 0; // signatures for double-check detection @@ -97,7 +99,7 @@ public final class Condenser { Thread.currentThread().setName("condenser-" + document.dc_identifier()); // for debugging // if addMedia == true, then all the media links are also parsed and added to the words // added media words are flagged with the appropriate media flag - this.words = new HashMap(); + this.words = new TreeMap(NaturalOrder.naturalComparator); this.synonyms = new LinkedHashSet(); this.RESULT_FLAGS = new Bitfield(4); @@ -297,7 +299,7 @@ public final class Condenser { return oldsize - this.words.size(); } - public Map words() { + public SortedMap words() { // returns the words as word/indexWord relation map return this.words; } @@ -458,7 +460,7 @@ public final class Condenser { this.RESULT_DIFF_SENTENCES = sentenceHandleCount; } - public static Map getWords(final String text, final WordCache meaningLib) { + public static SortedMap getWords(final String text, final WordCache meaningLib) { // returns a word/indexWord relation map if (text == null) return null; return new Condenser(text, meaningLib, false).words(); diff --git a/source/net/yacy/search/query/QueryGoal.java b/source/net/yacy/search/query/QueryGoal.java index 51687a581..c77c50532 100644 --- a/source/net/yacy/search/query/QueryGoal.java +++ b/source/net/yacy/search/query/QueryGoal.java @@ -24,13 +24,17 @@ package net.yacy.search.query; import java.io.UnsupportedEncodingException; import java.net.URLEncoder; import java.util.ArrayList; +import java.util.Collection; +import java.util.Locale; import java.util.Map; import java.util.SortedSet; +import java.util.TreeSet; import net.yacy.cora.document.WordCache; import net.yacy.cora.federate.solr.Ranking; import net.yacy.cora.federate.solr.SchemaDeclaration; import net.yacy.cora.federate.solr.SolrType; +import net.yacy.cora.order.NaturalOrder; import net.yacy.cora.storage.HandleSet; import net.yacy.document.parser.html.AbstractScraper; import net.yacy.document.parser.html.CharacterCoding; @@ -49,14 +53,43 @@ public class QueryGoal { private String query_original; private HandleSet include_hashes, exclude_hashes; - private final ArrayList include_words, exclude_words; + private final NormalizedWords include_words, exclude_words; private final ArrayList include_strings, exclude_strings; + public static class NormalizedWords extends TreeSet { + + private static final long serialVersionUID = -3050851079671868007L; + + public NormalizedWords() { + super(NaturalOrder.naturalComparator); + } + + public NormalizedWords(String[] rawWords) { + super(NaturalOrder.naturalComparator); + for (String word: rawWords) super.add(word.toLowerCase(Locale.ENGLISH)); + } + + public NormalizedWords(Collection rawWords) { + super(NaturalOrder.naturalComparator); + for (String word: rawWords) super.add(word.toLowerCase(Locale.ENGLISH)); + } + + @Override + public boolean add(String word) { + return super.add(word.toLowerCase(Locale.ENGLISH)); + } + + @Override + public boolean contains(Object word) { + if (!(word instanceof String)) return false; + return super.contains(((String) word).toLowerCase(Locale.ENGLISH)); + } + } public QueryGoal(HandleSet include_hashes, HandleSet exclude_hashes) { this.query_original = null; - this.include_words = new ArrayList(); - this.exclude_words = new ArrayList(); + this.include_words = new NormalizedWords(); + this.exclude_words = new NormalizedWords(); this.include_strings = new ArrayList(); this.exclude_strings = new ArrayList(); this.include_hashes = include_hashes; @@ -67,8 +100,8 @@ public class QueryGoal { assert query_original != null; assert query_words != null; this.query_original = query_original; - this.include_words = new ArrayList(); - this.exclude_words = new ArrayList(); + this.include_words = new NormalizedWords(); + this.exclude_words = new NormalizedWords(); this.include_strings = new ArrayList(); this.exclude_strings = new ArrayList(); @@ -88,8 +121,8 @@ public class QueryGoal { for (String s: this.include_strings) parseQuery(s, this.include_words, this.include_words); for (String s: this.exclude_strings) parseQuery(s, this.exclude_words, this.exclude_words); - WordCache.learn(this.include_strings); - WordCache.learn(this.exclude_strings); + WordCache.learn(this.include_words); + WordCache.learn(this.exclude_words); this.include_hashes = null; this.exclude_hashes = null; @@ -107,7 +140,7 @@ public class QueryGoal { * sq = '\'' * dq = '"' */ - private static void parseQuery(String s, ArrayList include_string, ArrayList exclude_string) { + private static void parseQuery(String s, Collection include_string, Collection exclude_string) { while (s.length() > 0) { // parse query int p = 0; @@ -155,15 +188,51 @@ public class QueryGoal { return ret; } + /** + * @return a set of hashes of words to be included in the search result. + * if possible, use getIncludeWords instead + */ public HandleSet getIncludeHashes() { if (include_hashes == null) include_hashes = Word.words2hashesHandles(include_words); return include_hashes; } + /** + * @return a set of hashes of words to be excluded in the search result + * if possible, use getExcludeWords instead + */ public HandleSet getExcludeHashes() { if (exclude_hashes == null) exclude_hashes = Word.words2hashesHandles(exclude_words); return exclude_hashes; } + + /** + * @return a set of words to be included in the search result + */ + public NormalizedWords getIncludeWords() { + return include_words; + } + + /** + * @return a set of words to be excluded in the search result + */ + public NormalizedWords getExcludeWords() { + return exclude_words; + } + + /** + * @return a list of include strings which reproduces the original order of the search words and quotation + */ + public ArrayList getIncludeStrings() { + return include_strings; + } + + /** + * @return a list of exclude strings which reproduces the original order of the search words and quotation + */ + public ArrayList getExcludeStrings() { + return exclude_strings; + } /** * the include string may be useful (and better) for highlight/snippet computation @@ -175,14 +244,6 @@ public class QueryGoal { for (String s: this.include_strings) sb.append(s).append(' '); return sb.toString().substring(0, sb.length() - 1); } - - public ArrayList getIncludeStrings() { - return include_strings; - } - - public ArrayList getExcludeStrings() { - return exclude_strings; - } public boolean isCatchall() { if (include_strings.size() != 1 || exclude_strings.size() != 0) return false; @@ -205,6 +266,10 @@ public class QueryGoal { public void filterOut(final SortedSet blueList) { // filter out words that appear in this set // this is applied to the queryHashes + for (String word: blueList) { + this.include_words.remove(word); + this.include_strings.remove(word); + } final HandleSet blues = Word.words2hashesHandles(blueList); for (final byte[] b: blues) this.include_hashes.remove(b); } diff --git a/source/net/yacy/search/query/QueryParams.java b/source/net/yacy/search/query/QueryParams.java index fcc10de9f..dce1bb532 100644 --- a/source/net/yacy/search/query/QueryParams.java +++ b/source/net/yacy/search/query/QueryParams.java @@ -32,6 +32,7 @@ import java.util.Iterator; import java.util.LinkedHashSet; import java.util.Map; import java.util.Set; +import java.util.SortedSet; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; @@ -318,19 +319,17 @@ public final class QueryParams { */ private final boolean matchesText(final String text) { boolean ret = false; - final HandleSet wordhashes = Word.words2hashesHandles(Condenser.getWords(text, null).keySet()); - if (!SetTools.anymatch(wordhashes, this.queryGoal.getExcludeHashes())) { - ret = SetTools.totalInclusion(this.queryGoal.getIncludeHashes(), wordhashes); + QueryGoal.NormalizedWords words = new QueryGoal.NormalizedWords(Condenser.getWords(text, null).keySet()); + if (!SetTools.anymatch(words, this.queryGoal.getExcludeWords())) { + ret = SetTools.totalInclusion(this.queryGoal.getIncludeWords(), words); } return ret; } - - protected static final boolean anymatch(final String text, final HandleSet keyhashes) { - // returns true if any of the word hashes in keyhashes appear in the String text - // to do this, all words in the string must be recognized and transcoded to word hashes - if (keyhashes == null || keyhashes.isEmpty()) return false; - final HandleSet wordhashes = Word.words2hashesHandles(Condenser.getWords(text, null).keySet()); - return SetTools.anymatch(wordhashes, keyhashes); + + protected static final boolean anymatch(final String text, final QueryGoal.NormalizedWords keywords) { + if (keywords == null || keywords.isEmpty()) return false; + final SortedSet textwords = (SortedSet) Condenser.getWords(text, null).keySet(); + return SetTools.anymatch(textwords, keywords); } public SolrQuery solrQuery(final ContentDomain cd, final boolean getFacets, final boolean excludeintext_image) { @@ -344,7 +343,7 @@ public final class QueryParams { if (!getFacets) this.cachedQuery.setFacet(false); return this.cachedQuery; } - if (this.queryGoal.getIncludeStrings().size() == 0) return null; + if (this.queryGoal.getIncludeWords().size() == 0) return null; // construct query final SolrQuery params = getBasicParams(getFacets); @@ -369,7 +368,7 @@ public final class QueryParams { if (!getFacets) this.cachedQuery.setFacet(false); return this.cachedQuery; } - if (this.queryGoal.getIncludeStrings().size() == 0) return null; + if (this.queryGoal.getIncludeWords().size() == 0) return null; // construct query final SolrQuery params = getBasicParams(getFacets); diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java index 310fc9072..4ba335e96 100644 --- a/source/net/yacy/search/query/SearchEvent.java +++ b/source/net/yacy/search/query/SearchEvent.java @@ -71,7 +71,6 @@ import net.yacy.document.LibraryProvider; import net.yacy.document.TextParser; import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.data.meta.URIMetadataRow; -import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.WordReference; import net.yacy.kelondro.data.word.WordReferenceFactory; import net.yacy.kelondro.data.word.WordReferenceVars; @@ -1080,10 +1079,10 @@ public final class SearchEvent { final String pagetitle = page.dc_title().toLowerCase(); // check exclusion - if (!this.query.getQueryGoal().getExcludeHashes().isEmpty() && - ((QueryParams.anymatch(pagetitle, this.query.getQueryGoal().getExcludeHashes())) - || (QueryParams.anymatch(pageurl.toLowerCase(), this.query.getQueryGoal().getExcludeHashes())) - || (QueryParams.anymatch(pageauthor.toLowerCase(), this.query.getQueryGoal().getExcludeHashes())))) { + if (!this.query.getQueryGoal().getExcludeWords().isEmpty() && + ((QueryParams.anymatch(pagetitle, this.query.getQueryGoal().getExcludeWords())) + || (QueryParams.anymatch(pageurl.toLowerCase(), this.query.getQueryGoal().getExcludeWords())) + || (QueryParams.anymatch(pageauthor.toLowerCase(), this.query.getQueryGoal().getExcludeWords())))) { if (log.isFine()) log.fine("dropped RWI: no match with query goal exclusion"); if (page.word().local()) this.local_rwi_available.decrementAndGet(); else this.remote_rwi_available.decrementAndGet(); continue; @@ -1294,14 +1293,14 @@ public final class SearchEvent { } // apply query-in-result matching - final HandleSet urlcomph = Word.words2hashesHandles(urlcomps); - final HandleSet descrcomph = Word.words2hashesHandles(descrcomps); - final Iterator shi = this.query.getQueryGoal().getIncludeHashes().iterator(); - byte[] queryhash; + final QueryGoal.NormalizedWords urlcomph = new QueryGoal.NormalizedWords(urlcomps); + final QueryGoal.NormalizedWords descrcomph = new QueryGoal.NormalizedWords(descrcomps); + final Iterator shi = this.query.getQueryGoal().getIncludeWords().iterator(); + String queryword; while (shi.hasNext()) { - queryhash = shi.next(); - if (urlcomph.has(queryhash)) r += 256 << this.query.ranking.coeff_appurl; - if (descrcomph.has(queryhash)) r += 256 << this.query.ranking.coeff_app_dc_title; + queryword = shi.next(); + if (urlcomph.contains(queryword)) r += 256 << this.query.ranking.coeff_appurl; + if (descrcomph.contains(queryword)) r += 256 << this.query.ranking.coeff_app_dc_title; } return r; } @@ -1642,7 +1641,7 @@ public final class SearchEvent { if ( word.length() > 2 && "http_html_php_ftp_www_com_org_net_gov_edu_index_home_page_for_usage_the_and_zum_der_die_das_und_the_zur_bzw_mit_blog_wiki_aus_bei_off" .indexOf(word) < 0 - && !this.query.getQueryGoal().getIncludeHashes().has(Word.word2hash(word)) + && !this.query.getQueryGoal().getIncludeWords().contains(word) && lettermatch.matcher(word).matches() && !Switchboard.badwords.contains(word) && !Switchboard.stopwords.contains(word) ) {