Fewer word hash computations (removing some overhead caused by MD5 calculations) by using the clear word in a normalized form.
orbiter 2013-11-25 15:20:54 +01:00
parent f23471c471
commit 61409788eb
5 changed files with 112 additions and 47 deletions
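
In outline, the change replaces membership tests on MD5-derived word hashes with tests on lower-cased clear words held in sorted sets, so word lookups no longer pay for a digest per word. A minimal sketch of the trade-off in plain JDK types (class name and sample words are illustrative; YaCy's real hash comes from Word.word2hash):

    import java.nio.charset.StandardCharsets;
    import java.security.MessageDigest;
    import java.security.NoSuchAlgorithmException;
    import java.util.Locale;
    import java.util.Set;
    import java.util.TreeSet;

    public class WordMatchSketch {

        // old style: every membership test first computes a digest of the word
        static byte[] md5Hash(String word) throws NoSuchAlgorithmException {
            return MessageDigest.getInstance("MD5").digest(word.getBytes(StandardCharsets.UTF_8));
        }

        public static void main(String[] args) throws NoSuchAlgorithmException {
            byte[] h = md5Hash("yacy");   // old path: hash first, then look up the hash
            System.out.println(h.length); // 16 bytes of MD5 work per word

            // new style: normalize once, then compare plain strings in a sorted set
            Set<String> queryWords = new TreeSet<String>();
            queryWords.add("yacy".toLowerCase(Locale.ENGLISH));
            System.out.println(queryWords.contains("YaCy".toLowerCase(Locale.ENGLISH))); // true
        }
    }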

htroot/yacysearch.java

@@ -524,9 +524,9 @@ public class yacysearch {
final int maxDistance = (querystring.indexOf('"', 0) >= 0) ? qg.getIncludeHashes().size() - 1 : Integer.MAX_VALUE;
// filter out stopwords
- final SortedSet<String> filtered = SetTools.joinConstructiveByTest(qg.getIncludeStrings(), Switchboard.stopwords); //find matching stopwords
+ final SortedSet<String> filtered = SetTools.joinConstructiveByTest(qg.getIncludeWords(), Switchboard.stopwords); //find matching stopwords
if ( !filtered.isEmpty() ) {
- SetTools.excludeDestructiveByTestSmallInLarge(qg.getIncludeStrings(), filtered); //remove stopwords
+ SetTools.excludeDestructiveByTestSmallInLarge(qg.getIncludeWords(), filtered); //remove stopwords
}
// if a minus-button was hit, remove a special reference first
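
The stopword handling above is an intersection (find the stopwords that occur among the include words) followed by a difference (remove them). SetTools.joinConstructiveByTest and SetTools.excludeDestructiveByTestSmallInLarge are YaCy helpers; a plain-JDK sketch of the same effect, with made-up sample words:

    import java.util.Arrays;
    import java.util.SortedSet;
    import java.util.TreeSet;

    public class StopwordFilterSketch {
        public static void main(String[] args) {
            SortedSet<String> includeWords = new TreeSet<String>(Arrays.asList("the", "yacy", "search"));
            SortedSet<String> stopwords = new TreeSet<String>(Arrays.asList("the", "and", "for"));

            SortedSet<String> filtered = new TreeSet<String>(includeWords);
            filtered.retainAll(stopwords);                             // find matching stopwords: [the]
            if (!filtered.isEmpty()) includeWords.removeAll(filtered); // remove stopwords

            System.out.println(includeWords); // [search, yacy]
        }
    }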

source/net/yacy/document/Condenser.java

@@ -34,6 +34,7 @@ import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
+ import java.util.SortedMap;
import java.util.SortedSet;
import java.util.TreeMap;
@@ -48,6 +49,7 @@ import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.solr.Ranking;
import net.yacy.cora.language.synonyms.SynonymLibrary;
import net.yacy.cora.lod.vocabulary.Tagging;
+ import net.yacy.cora.order.NaturalOrder;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.language.Identificator;
import net.yacy.document.parser.html.ImageEntry;
@@ -73,7 +75,7 @@ public final class Condenser {
public static final int flag_cat_hasapp = 23; // the page refers to (at least one) application file
//private Properties analysis;
- private final Map<String, Word> words; // a string (the words) to (indexWord) - relation
+ private final SortedMap<String, Word> words; // a string (the words) to (indexWord) - relation
private final Map<String, Set<Tagging.Metatag>> tags = new HashMap<String, Set<Tagging.Metatag>>(); // a set of tags, discovered from Autotagging
private final Set<String> synonyms; // a set of synonyms to the words
private long fuzzy_signature = 0, exact_signature = 0; // signatures for double-check detection
@@ -97,7 +99,7 @@
Thread.currentThread().setName("condenser-" + document.dc_identifier()); // for debugging
// if addMedia == true, then all the media links are also parsed and added to the words
// added media words are flagged with the appropriate media flag
- this.words = new HashMap<String, Word>();
+ this.words = new TreeMap<String, Word>(NaturalOrder.naturalComparator);
this.synonyms = new LinkedHashSet<String>();
this.RESULT_FLAGS = new Bitfield(4);
@@ -297,7 +299,7 @@
return oldsize - this.words.size();
}
- public Map<String, Word> words() {
+ public SortedMap<String, Word> words() {
// returns the words as word/indexWord relation map
return this.words;
}
@@ -458,7 +460,7 @@
this.RESULT_DIFF_SENTENCES = sentenceHandleCount;
}
- public static Map<String, Word> getWords(final String text, final WordCache meaningLib) {
+ public static SortedMap<String, Word> getWords(final String text, final WordCache meaningLib) {
// returns a word/indexWord relation map
if (text == null) return null;
return new Condenser(text, meaningLib, false).words();
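
The Condenser now keeps its word/indexWord relation in a TreeMap ordered by NaturalOrder.naturalComparator, so keySet() yields the condensed words as an ordered set that later set operations can consume without any hashing. A sketch with the JDK's natural String order standing in for the YaCy comparator (sample words are illustrative):

    import java.util.NavigableSet;
    import java.util.TreeMap;

    public class SortedWordsSketch {
        public static void main(String[] args) {
            TreeMap<String, Integer> words = new TreeMap<String, Integer>();
            words.put("search", 3);
            words.put("engine", 1);
            words.put("yacy", 2);

            // a TreeMap's key set is a NavigableSet (and thus a SortedSet),
            // so the words can be treated as an ordered set directly
            NavigableSet<String> ordered = words.navigableKeySet();
            System.out.println(ordered); // [engine, search, yacy]
        }
    }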

source/net/yacy/search/query/QueryGoal.java

@@ -24,13 +24,17 @@ package net.yacy.search.query;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.ArrayList;
+ import java.util.Collection;
+ import java.util.Locale;
import java.util.Map;
import java.util.SortedSet;
+ import java.util.TreeSet;
import net.yacy.cora.document.WordCache;
import net.yacy.cora.federate.solr.Ranking;
import net.yacy.cora.federate.solr.SchemaDeclaration;
import net.yacy.cora.federate.solr.SolrType;
+ import net.yacy.cora.order.NaturalOrder;
import net.yacy.cora.storage.HandleSet;
import net.yacy.document.parser.html.AbstractScraper;
import net.yacy.document.parser.html.CharacterCoding;
@@ -49,14 +53,43 @@ public class QueryGoal {
private String query_original;
private HandleSet include_hashes, exclude_hashes;
- private final ArrayList<String> include_words, exclude_words;
+ private final NormalizedWords include_words, exclude_words;
private final ArrayList<String> include_strings, exclude_strings;
+ public static class NormalizedWords extends TreeSet<String> {
+ private static final long serialVersionUID = -3050851079671868007L;
+ public NormalizedWords() {
+ super(NaturalOrder.naturalComparator);
+ }
+ public NormalizedWords(String[] rawWords) {
+ super(NaturalOrder.naturalComparator);
+ for (String word: rawWords) super.add(word.toLowerCase(Locale.ENGLISH));
+ }
+ public NormalizedWords(Collection<String> rawWords) {
+ super(NaturalOrder.naturalComparator);
+ for (String word: rawWords) super.add(word.toLowerCase(Locale.ENGLISH));
+ }
+ @Override
+ public boolean add(String word) {
+ return super.add(word.toLowerCase(Locale.ENGLISH));
+ }
+ @Override
+ public boolean contains(Object word) {
+ if (!(word instanceof String)) return false;
+ return super.contains(((String) word).toLowerCase(Locale.ENGLISH));
+ }
+ }
public QueryGoal(HandleSet include_hashes, HandleSet exclude_hashes) {
this.query_original = null;
- this.include_words = new ArrayList<String>();
- this.exclude_words = new ArrayList<String>();
+ this.include_words = new NormalizedWords();
+ this.exclude_words = new NormalizedWords();
this.include_strings = new ArrayList<String>();
this.exclude_strings = new ArrayList<String>();
this.include_hashes = include_hashes;
@@ -67,8 +100,8 @@
assert query_original != null;
assert query_words != null;
this.query_original = query_original;
- this.include_words = new ArrayList<String>();
- this.exclude_words = new ArrayList<String>();
+ this.include_words = new NormalizedWords();
+ this.exclude_words = new NormalizedWords();
this.include_strings = new ArrayList<String>();
this.exclude_strings = new ArrayList<String>();
@@ -88,8 +121,8 @@
for (String s: this.include_strings) parseQuery(s, this.include_words, this.include_words);
for (String s: this.exclude_strings) parseQuery(s, this.exclude_words, this.exclude_words);
- WordCache.learn(this.include_strings);
- WordCache.learn(this.exclude_strings);
+ WordCache.learn(this.include_words);
+ WordCache.learn(this.exclude_words);
this.include_hashes = null;
this.exclude_hashes = null;
@@ -107,7 +140,7 @@
* sq = '\''
* dq = '"'
*/
- private static void parseQuery(String s, ArrayList<String> include_string, ArrayList<String> exclude_string) {
+ private static void parseQuery(String s, Collection<String> include_string, Collection<String> exclude_string) {
while (s.length() > 0) {
// parse query
int p = 0;
@@ -155,15 +188,51 @@
return ret;
}
+ /**
+ * @return a set of hashes of words to be included in the search result.
+ * if possible, use getIncludeWords instead
+ */
public HandleSet getIncludeHashes() {
if (include_hashes == null) include_hashes = Word.words2hashesHandles(include_words);
return include_hashes;
}
+ /**
+ * @return a set of hashes of words to be excluded in the search result
+ * if possible, use getExcludeWords instead
+ */
public HandleSet getExcludeHashes() {
if (exclude_hashes == null) exclude_hashes = Word.words2hashesHandles(exclude_words);
return exclude_hashes;
}
+ /**
+ * @return a set of words to be included in the search result
+ */
+ public NormalizedWords getIncludeWords() {
+ return include_words;
+ }
+ /**
+ * @return a set of words to be excluded in the search result
+ */
+ public NormalizedWords getExcludeWords() {
+ return exclude_words;
+ }
+ /**
+ * @return a list of include strings which reproduces the original order of the search words and quotation
+ */
+ public ArrayList<String> getIncludeStrings() {
+ return include_strings;
+ }
+ /**
+ * @return a list of exclude strings which reproduces the original order of the search words and quotation
+ */
+ public ArrayList<String> getExcludeStrings() {
+ return exclude_strings;
+ }
/**
* the include string may be useful (and better) for highlight/snippet computation
@@ -175,14 +244,6 @@
for (String s: this.include_strings) sb.append(s).append(' ');
return sb.toString().substring(0, sb.length() - 1);
}
- public ArrayList<String> getIncludeStrings() {
- return include_strings;
- }
- public ArrayList<String> getExcludeStrings() {
- return exclude_strings;
- }
public boolean isCatchall() {
if (include_strings.size() != 1 || exclude_strings.size() != 0) return false;
@@ -205,6 +266,10 @@
public void filterOut(final SortedSet<String> blueList) {
// filter out words that appear in this set
// this is applied to the queryHashes
+ for (String word: blueList) {
+ this.include_words.remove(word);
+ this.include_strings.remove(word);
+ }
final HandleSet blues = Word.words2hashesHandles(blueList);
for (final byte[] b: blues) this.include_hashes.remove(b);
}
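
A usage note on the NormalizedWords class introduced above: add() and contains() both lower-case their argument, so lookups are case-insensitive no matter how the word was written, and contains() simply returns false for non-String arguments. For example (sample words are illustrative):

    QueryGoal.NormalizedWords words = new QueryGoal.NormalizedWords();
    words.add("YaCy");
    words.add("Search");

    System.out.println(words.contains("yacy"));   // true: both sides are normalized
    System.out.println(words.contains("SEARCH")); // true
    System.out.println(words.first());            // "search": kept in sorted order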

source/net/yacy/search/query/QueryParams.java

@@ -32,6 +32,7 @@ import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;
+ import java.util.SortedSet;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
@@ -318,19 +319,17 @@
*/
private final boolean matchesText(final String text) {
boolean ret = false;
- final HandleSet wordhashes = Word.words2hashesHandles(Condenser.getWords(text, null).keySet());
- if (!SetTools.anymatch(wordhashes, this.queryGoal.getExcludeHashes())) {
- ret = SetTools.totalInclusion(this.queryGoal.getIncludeHashes(), wordhashes);
+ QueryGoal.NormalizedWords words = new QueryGoal.NormalizedWords(Condenser.getWords(text, null).keySet());
+ if (!SetTools.anymatch(words, this.queryGoal.getExcludeWords())) {
+ ret = SetTools.totalInclusion(this.queryGoal.getIncludeWords(), words);
}
return ret;
}
- protected static final boolean anymatch(final String text, final HandleSet keyhashes) {
- // returns true if any of the word hashes in keyhashes appear in the String text
- // to do this, all words in the string must be recognized and transcoded to word hashes
- if (keyhashes == null || keyhashes.isEmpty()) return false;
- final HandleSet wordhashes = Word.words2hashesHandles(Condenser.getWords(text, null).keySet());
- return SetTools.anymatch(wordhashes, keyhashes);
+ protected static final boolean anymatch(final String text, final QueryGoal.NormalizedWords keywords) {
+ if (keywords == null || keywords.isEmpty()) return false;
+ final SortedSet<String> textwords = (SortedSet<String>) Condenser.getWords(text, null).keySet();
+ return SetTools.anymatch(textwords, keywords);
}
public SolrQuery solrQuery(final ContentDomain cd, final boolean getFacets, final boolean excludeintext_image) {
@@ -344,7 +343,7 @@
if (!getFacets) this.cachedQuery.setFacet(false);
return this.cachedQuery;
}
- if (this.queryGoal.getIncludeStrings().size() == 0) return null;
+ if (this.queryGoal.getIncludeWords().size() == 0) return null;
// construct query
final SolrQuery params = getBasicParams(getFacets);
@@ -369,7 +368,7 @@
if (!getFacets) this.cachedQuery.setFacet(false);
return this.cachedQuery;
}
- if (this.queryGoal.getIncludeStrings().size() == 0) return null;
+ if (this.queryGoal.getIncludeWords().size() == 0) return null;
// construct query
final SolrQuery params = getBasicParams(getFacets);
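
Two details in this file are worth noting. The cast (SortedSet<String>) Condenser.getWords(text, null).keySet() succeeds at runtime because getWords() now returns a TreeMap, whose key set implements NavigableSet; calling navigableKeySet() would express the same thing without the unchecked cast. Further, because both word sets are ordered by the same comparator, an any-overlap test can run as a single linear merge; the sketch below illustrates that idea and is not necessarily how YaCy's SetTools.anymatch is implemented:

    import java.util.Arrays;
    import java.util.Comparator;
    import java.util.Iterator;
    import java.util.SortedSet;
    import java.util.TreeSet;

    public class AnymatchSketch {

        // true if the two sorted sets share any element; O(n + m) by merging
        static boolean anymatch(SortedSet<String> a, SortedSet<String> b) {
            if (a == null || b == null || a.isEmpty() || b.isEmpty()) return false;
            Comparator<? super String> cmp = a.comparator();
            Iterator<String> ia = a.iterator(), ib = b.iterator();
            String x = ia.next(), y = ib.next();
            while (true) {
                int c = (cmp != null) ? cmp.compare(x, y) : x.compareTo(y);
                if (c == 0) return true;
                if (c < 0) { if (!ia.hasNext()) return false; x = ia.next(); }
                else       { if (!ib.hasNext()) return false; y = ib.next(); }
            }
        }

        public static void main(String[] args) {
            SortedSet<String> textWords = new TreeSet<String>(Arrays.asList("open", "search", "web"));
            SortedSet<String> excludeWords = new TreeSet<String>(Arrays.asList("spam"));
            System.out.println(anymatch(textWords, excludeWords)); // false: no overlap
        }
    }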

source/net/yacy/search/query/SearchEvent.java

@@ -71,7 +71,6 @@ import net.yacy.document.LibraryProvider;
import net.yacy.document.TextParser;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.data.meta.URIMetadataRow;
- import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.data.word.WordReferenceFactory;
import net.yacy.kelondro.data.word.WordReferenceVars;
@@ -1080,10 +1079,10 @@
final String pagetitle = page.dc_title().toLowerCase();
// check exclusion
- if (!this.query.getQueryGoal().getExcludeHashes().isEmpty() &&
- ((QueryParams.anymatch(pagetitle, this.query.getQueryGoal().getExcludeHashes()))
- || (QueryParams.anymatch(pageurl.toLowerCase(), this.query.getQueryGoal().getExcludeHashes()))
- || (QueryParams.anymatch(pageauthor.toLowerCase(), this.query.getQueryGoal().getExcludeHashes())))) {
+ if (!this.query.getQueryGoal().getExcludeWords().isEmpty() &&
+ ((QueryParams.anymatch(pagetitle, this.query.getQueryGoal().getExcludeWords()))
+ || (QueryParams.anymatch(pageurl.toLowerCase(), this.query.getQueryGoal().getExcludeWords()))
+ || (QueryParams.anymatch(pageauthor.toLowerCase(), this.query.getQueryGoal().getExcludeWords())))) {
if (log.isFine()) log.fine("dropped RWI: no match with query goal exclusion");
if (page.word().local()) this.local_rwi_available.decrementAndGet(); else this.remote_rwi_available.decrementAndGet();
continue;
@@ -1294,14 +1293,14 @@
}
// apply query-in-result matching
- final HandleSet urlcomph = Word.words2hashesHandles(urlcomps);
- final HandleSet descrcomph = Word.words2hashesHandles(descrcomps);
- final Iterator<byte[]> shi = this.query.getQueryGoal().getIncludeHashes().iterator();
- byte[] queryhash;
+ final QueryGoal.NormalizedWords urlcomph = new QueryGoal.NormalizedWords(urlcomps);
+ final QueryGoal.NormalizedWords descrcomph = new QueryGoal.NormalizedWords(descrcomps);
+ final Iterator<String> shi = this.query.getQueryGoal().getIncludeWords().iterator();
+ String queryword;
while (shi.hasNext()) {
- queryhash = shi.next();
- if (urlcomph.has(queryhash)) r += 256 << this.query.ranking.coeff_appurl;
- if (descrcomph.has(queryhash)) r += 256 << this.query.ranking.coeff_app_dc_title;
+ queryword = shi.next();
+ if (urlcomph.contains(queryword)) r += 256 << this.query.ranking.coeff_appurl;
+ if (descrcomph.contains(queryword)) r += 256 << this.query.ranking.coeff_app_dc_title;
}
return r;
}
@@ -1642,7 +1641,7 @@
if ( word.length() > 2
&& "http_html_php_ftp_www_com_org_net_gov_edu_index_home_page_for_usage_the_and_zum_der_die_das_und_the_zur_bzw_mit_blog_wiki_aus_bei_off"
.indexOf(word) < 0
- && !this.query.getQueryGoal().getIncludeHashes().has(Word.word2hash(word))
+ && !this.query.getQueryGoal().getIncludeWords().contains(word)
&& lettermatch.matcher(word).matches()
&& !Switchboard.badwords.contains(word)
&& !Switchboard.stopwords.contains(word) ) {