mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
Fewer word hash computations (removes some overhead caused by MD5
calculations) by using the clear-text word in a normalized form.
This commit is contained in:
parent
f23471c471
commit
61409788eb
|
@ -524,9 +524,9 @@ public class yacysearch {
|
|||
final int maxDistance = (querystring.indexOf('"', 0) >= 0) ? qg.getIncludeHashes().size() - 1 : Integer.MAX_VALUE;
|
||||
|
||||
// filter out stopwords
|
||||
final SortedSet<String> filtered = SetTools.joinConstructiveByTest(qg.getIncludeStrings(), Switchboard.stopwords); //find matching stopwords
|
||||
final SortedSet<String> filtered = SetTools.joinConstructiveByTest(qg.getIncludeWords(), Switchboard.stopwords); //find matching stopwords
|
||||
if ( !filtered.isEmpty() ) {
|
||||
SetTools.excludeDestructiveByTestSmallInLarge(qg.getIncludeStrings(), filtered); //remove stopwords
|
||||
SetTools.excludeDestructiveByTestSmallInLarge(qg.getIncludeWords(), filtered); //remove stopwords
|
||||
}
|
||||
|
||||
// if a minus-button was hit, remove a special reference first
|
||||
|
|
|
@ -34,6 +34,7 @@ import java.util.Locale;
|
|||
import java.util.Map;
|
||||
import java.util.Properties;
|
||||
import java.util.Set;
|
||||
import java.util.SortedMap;
|
||||
import java.util.SortedSet;
|
||||
import java.util.TreeMap;
|
||||
|
||||
|
@ -48,6 +49,7 @@ import net.yacy.cora.document.id.MultiProtocolURL;
|
|||
import net.yacy.cora.federate.solr.Ranking;
|
||||
import net.yacy.cora.language.synonyms.SynonymLibrary;
|
||||
import net.yacy.cora.lod.vocabulary.Tagging;
|
||||
import net.yacy.cora.order.NaturalOrder;
|
||||
import net.yacy.cora.util.ConcurrentLog;
|
||||
import net.yacy.document.language.Identificator;
|
||||
import net.yacy.document.parser.html.ImageEntry;
|
||||
|
@ -73,7 +75,7 @@ public final class Condenser {
|
|||
public static final int flag_cat_hasapp = 23; // the page refers to (at least one) application file
|
||||
|
||||
//private Properties analysis;
|
||||
private final Map<String, Word> words; // a string (the words) to (indexWord) - relation
|
||||
private final SortedMap<String, Word> words; // a string (the words) to (indexWord) - relation
|
||||
private final Map<String, Set<Tagging.Metatag>> tags = new HashMap<String, Set<Tagging.Metatag>>(); // a set of tags, discovered from Autotagging
|
||||
private final Set<String> synonyms; // a set of synonyms to the words
|
||||
private long fuzzy_signature = 0, exact_signature = 0; // signatures for double-check detection
|
||||
|
@ -97,7 +99,7 @@ public final class Condenser {
|
|||
Thread.currentThread().setName("condenser-" + document.dc_identifier()); // for debugging
|
||||
// if addMedia == true, then all the media links are also parsed and added to the words
|
||||
// added media words are flagged with the appropriate media flag
|
||||
this.words = new HashMap<String, Word>();
|
||||
this.words = new TreeMap<String, Word>(NaturalOrder.naturalComparator);
|
||||
this.synonyms = new LinkedHashSet<String>();
|
||||
this.RESULT_FLAGS = new Bitfield(4);
|
||||
|
||||
|
@ -297,7 +299,7 @@ public final class Condenser {
|
|||
return oldsize - this.words.size();
|
||||
}
|
||||
|
||||
public Map<String, Word> words() {
|
||||
public SortedMap<String, Word> words() {
|
||||
// returns the words as word/indexWord relation map
|
||||
return this.words;
|
||||
}
|
||||
|
@ -458,7 +460,7 @@ public final class Condenser {
|
|||
this.RESULT_DIFF_SENTENCES = sentenceHandleCount;
|
||||
}
|
||||
|
||||
public static Map<String, Word> getWords(final String text, final WordCache meaningLib) {
|
||||
public static SortedMap<String, Word> getWords(final String text, final WordCache meaningLib) {
|
||||
// returns a word/indexWord relation map
|
||||
if (text == null) return null;
|
||||
return new Condenser(text, meaningLib, false).words();
|
||||
|
|
|
@ -24,13 +24,17 @@ package net.yacy.search.query;
|
|||
import java.io.UnsupportedEncodingException;
|
||||
import java.net.URLEncoder;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.SortedSet;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import net.yacy.cora.document.WordCache;
|
||||
import net.yacy.cora.federate.solr.Ranking;
|
||||
import net.yacy.cora.federate.solr.SchemaDeclaration;
|
||||
import net.yacy.cora.federate.solr.SolrType;
|
||||
import net.yacy.cora.order.NaturalOrder;
|
||||
import net.yacy.cora.storage.HandleSet;
|
||||
import net.yacy.document.parser.html.AbstractScraper;
|
||||
import net.yacy.document.parser.html.CharacterCoding;
|
||||
|
@ -49,14 +53,43 @@ public class QueryGoal {
|
|||
|
||||
private String query_original;
|
||||
private HandleSet include_hashes, exclude_hashes;
|
||||
private final ArrayList<String> include_words, exclude_words;
|
||||
private final NormalizedWords include_words, exclude_words;
|
||||
private final ArrayList<String> include_strings, exclude_strings;
|
||||
|
||||
public static class NormalizedWords extends TreeSet<String> {
|
||||
|
||||
private static final long serialVersionUID = -3050851079671868007L;
|
||||
|
||||
public NormalizedWords() {
|
||||
super(NaturalOrder.naturalComparator);
|
||||
}
|
||||
|
||||
public NormalizedWords(String[] rawWords) {
|
||||
super(NaturalOrder.naturalComparator);
|
||||
for (String word: rawWords) super.add(word.toLowerCase(Locale.ENGLISH));
|
||||
}
|
||||
|
||||
public NormalizedWords(Collection<String> rawWords) {
|
||||
super(NaturalOrder.naturalComparator);
|
||||
for (String word: rawWords) super.add(word.toLowerCase(Locale.ENGLISH));
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean add(String word) {
|
||||
return super.add(word.toLowerCase(Locale.ENGLISH));
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean contains(Object word) {
|
||||
if (!(word instanceof String)) return false;
|
||||
return super.contains(((String) word).toLowerCase(Locale.ENGLISH));
|
||||
}
|
||||
}
|
||||
|
||||
public QueryGoal(HandleSet include_hashes, HandleSet exclude_hashes) {
|
||||
this.query_original = null;
|
||||
this.include_words = new ArrayList<String>();
|
||||
this.exclude_words = new ArrayList<String>();
|
||||
this.include_words = new NormalizedWords();
|
||||
this.exclude_words = new NormalizedWords();
|
||||
this.include_strings = new ArrayList<String>();
|
||||
this.exclude_strings = new ArrayList<String>();
|
||||
this.include_hashes = include_hashes;
|
||||
|
@ -67,8 +100,8 @@ public class QueryGoal {
|
|||
assert query_original != null;
|
||||
assert query_words != null;
|
||||
this.query_original = query_original;
|
||||
this.include_words = new ArrayList<String>();
|
||||
this.exclude_words = new ArrayList<String>();
|
||||
this.include_words = new NormalizedWords();
|
||||
this.exclude_words = new NormalizedWords();
|
||||
this.include_strings = new ArrayList<String>();
|
||||
this.exclude_strings = new ArrayList<String>();
|
||||
|
||||
|
@ -88,8 +121,8 @@ public class QueryGoal {
|
|||
for (String s: this.include_strings) parseQuery(s, this.include_words, this.include_words);
|
||||
for (String s: this.exclude_strings) parseQuery(s, this.exclude_words, this.exclude_words);
|
||||
|
||||
WordCache.learn(this.include_strings);
|
||||
WordCache.learn(this.exclude_strings);
|
||||
WordCache.learn(this.include_words);
|
||||
WordCache.learn(this.exclude_words);
|
||||
|
||||
this.include_hashes = null;
|
||||
this.exclude_hashes = null;
|
||||
|
@ -107,7 +140,7 @@ public class QueryGoal {
|
|||
* sq = '\''
|
||||
* dq = '"'
|
||||
*/
|
||||
private static void parseQuery(String s, ArrayList<String> include_string, ArrayList<String> exclude_string) {
|
||||
private static void parseQuery(String s, Collection<String> include_string, Collection<String> exclude_string) {
|
||||
while (s.length() > 0) {
|
||||
// parse query
|
||||
int p = 0;
|
||||
|
@ -155,16 +188,52 @@ public class QueryGoal {
|
|||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return a set of hashes of words to be included in the search result.
|
||||
* if possible, use getIncludeWords instead
|
||||
*/
|
||||
public HandleSet getIncludeHashes() {
|
||||
if (include_hashes == null) include_hashes = Word.words2hashesHandles(include_words);
|
||||
return include_hashes;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return a set of hashes of words to be excluded in the search result
|
||||
* if possible, use getExcludeWords instead
|
||||
*/
|
||||
public HandleSet getExcludeHashes() {
|
||||
if (exclude_hashes == null) exclude_hashes = Word.words2hashesHandles(exclude_words);
|
||||
return exclude_hashes;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return a set of words to be included in the search result
|
||||
*/
|
||||
public NormalizedWords getIncludeWords() {
|
||||
return include_words;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return a set of words to be excluded in the search result
|
||||
*/
|
||||
public NormalizedWords getExcludeWords() {
|
||||
return exclude_words;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return a list of include strings which reproduces the original order of the search words and quotation
|
||||
*/
|
||||
public ArrayList<String> getIncludeStrings() {
|
||||
return include_strings;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return a list of exclude strings which reproduces the original order of the search words and quotation
|
||||
*/
|
||||
public ArrayList<String> getExcludeStrings() {
|
||||
return exclude_strings;
|
||||
}
|
||||
|
||||
/**
|
||||
* the include string may be useful (and better) for highlight/snippet computation
|
||||
* @return the query string containing only the positive literals (includes) and without whitespace characters
|
||||
|
@ -176,14 +245,6 @@ public class QueryGoal {
|
|||
return sb.toString().substring(0, sb.length() - 1);
|
||||
}
|
||||
|
||||
public ArrayList<String> getIncludeStrings() {
|
||||
return include_strings;
|
||||
}
|
||||
|
||||
public ArrayList<String> getExcludeStrings() {
|
||||
return exclude_strings;
|
||||
}
|
||||
|
||||
public boolean isCatchall() {
|
||||
if (include_strings.size() != 1 || exclude_strings.size() != 0) return false;
|
||||
String w = include_strings.get(0);
|
||||
|
@ -205,6 +266,10 @@ public class QueryGoal {
|
|||
public void filterOut(final SortedSet<String> blueList) {
|
||||
// filter out words that appear in this set
|
||||
// this is applied to the queryHashes
|
||||
for (String word: blueList) {
|
||||
this.include_words.remove(word);
|
||||
this.include_strings.remove(word);
|
||||
}
|
||||
final HandleSet blues = Word.words2hashesHandles(blueList);
|
||||
for (final byte[] b: blues) this.include_hashes.remove(b);
|
||||
}
|
||||
|
|
|
@ -32,6 +32,7 @@ import java.util.Iterator;
|
|||
import java.util.LinkedHashSet;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.SortedSet;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.regex.PatternSyntaxException;
|
||||
|
||||
|
@ -318,19 +319,17 @@ public final class QueryParams {
|
|||
*/
|
||||
private final boolean matchesText(final String text) {
|
||||
boolean ret = false;
|
||||
final HandleSet wordhashes = Word.words2hashesHandles(Condenser.getWords(text, null).keySet());
|
||||
if (!SetTools.anymatch(wordhashes, this.queryGoal.getExcludeHashes())) {
|
||||
ret = SetTools.totalInclusion(this.queryGoal.getIncludeHashes(), wordhashes);
|
||||
QueryGoal.NormalizedWords words = new QueryGoal.NormalizedWords(Condenser.getWords(text, null).keySet());
|
||||
if (!SetTools.anymatch(words, this.queryGoal.getExcludeWords())) {
|
||||
ret = SetTools.totalInclusion(this.queryGoal.getIncludeWords(), words);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
protected static final boolean anymatch(final String text, final HandleSet keyhashes) {
|
||||
// returns true if any of the word hashes in keyhashes appear in the String text
|
||||
// to do this, all words in the string must be recognized and transcoded to word hashes
|
||||
if (keyhashes == null || keyhashes.isEmpty()) return false;
|
||||
final HandleSet wordhashes = Word.words2hashesHandles(Condenser.getWords(text, null).keySet());
|
||||
return SetTools.anymatch(wordhashes, keyhashes);
|
||||
protected static final boolean anymatch(final String text, final QueryGoal.NormalizedWords keywords) {
|
||||
if (keywords == null || keywords.isEmpty()) return false;
|
||||
final SortedSet<String> textwords = (SortedSet<String>) Condenser.getWords(text, null).keySet();
|
||||
return SetTools.anymatch(textwords, keywords);
|
||||
}
|
||||
|
||||
public SolrQuery solrQuery(final ContentDomain cd, final boolean getFacets, final boolean excludeintext_image) {
|
||||
|
@ -344,7 +343,7 @@ public final class QueryParams {
|
|||
if (!getFacets) this.cachedQuery.setFacet(false);
|
||||
return this.cachedQuery;
|
||||
}
|
||||
if (this.queryGoal.getIncludeStrings().size() == 0) return null;
|
||||
if (this.queryGoal.getIncludeWords().size() == 0) return null;
|
||||
|
||||
// construct query
|
||||
final SolrQuery params = getBasicParams(getFacets);
|
||||
|
@ -369,7 +368,7 @@ public final class QueryParams {
|
|||
if (!getFacets) this.cachedQuery.setFacet(false);
|
||||
return this.cachedQuery;
|
||||
}
|
||||
if (this.queryGoal.getIncludeStrings().size() == 0) return null;
|
||||
if (this.queryGoal.getIncludeWords().size() == 0) return null;
|
||||
|
||||
// construct query
|
||||
final SolrQuery params = getBasicParams(getFacets);
|
||||
|
|
|
@ -71,7 +71,6 @@ import net.yacy.document.LibraryProvider;
|
|||
import net.yacy.document.TextParser;
|
||||
import net.yacy.kelondro.data.meta.URIMetadataNode;
|
||||
import net.yacy.kelondro.data.meta.URIMetadataRow;
|
||||
import net.yacy.kelondro.data.word.Word;
|
||||
import net.yacy.kelondro.data.word.WordReference;
|
||||
import net.yacy.kelondro.data.word.WordReferenceFactory;
|
||||
import net.yacy.kelondro.data.word.WordReferenceVars;
|
||||
|
@ -1080,10 +1079,10 @@ public final class SearchEvent {
|
|||
final String pagetitle = page.dc_title().toLowerCase();
|
||||
|
||||
// check exclusion
|
||||
if (!this.query.getQueryGoal().getExcludeHashes().isEmpty() &&
|
||||
((QueryParams.anymatch(pagetitle, this.query.getQueryGoal().getExcludeHashes()))
|
||||
|| (QueryParams.anymatch(pageurl.toLowerCase(), this.query.getQueryGoal().getExcludeHashes()))
|
||||
|| (QueryParams.anymatch(pageauthor.toLowerCase(), this.query.getQueryGoal().getExcludeHashes())))) {
|
||||
if (!this.query.getQueryGoal().getExcludeWords().isEmpty() &&
|
||||
((QueryParams.anymatch(pagetitle, this.query.getQueryGoal().getExcludeWords()))
|
||||
|| (QueryParams.anymatch(pageurl.toLowerCase(), this.query.getQueryGoal().getExcludeWords()))
|
||||
|| (QueryParams.anymatch(pageauthor.toLowerCase(), this.query.getQueryGoal().getExcludeWords())))) {
|
||||
if (log.isFine()) log.fine("dropped RWI: no match with query goal exclusion");
|
||||
if (page.word().local()) this.local_rwi_available.decrementAndGet(); else this.remote_rwi_available.decrementAndGet();
|
||||
continue;
|
||||
|
@ -1294,14 +1293,14 @@ public final class SearchEvent {
|
|||
}
|
||||
|
||||
// apply query-in-result matching
|
||||
final HandleSet urlcomph = Word.words2hashesHandles(urlcomps);
|
||||
final HandleSet descrcomph = Word.words2hashesHandles(descrcomps);
|
||||
final Iterator<byte[]> shi = this.query.getQueryGoal().getIncludeHashes().iterator();
|
||||
byte[] queryhash;
|
||||
final QueryGoal.NormalizedWords urlcomph = new QueryGoal.NormalizedWords(urlcomps);
|
||||
final QueryGoal.NormalizedWords descrcomph = new QueryGoal.NormalizedWords(descrcomps);
|
||||
final Iterator<String> shi = this.query.getQueryGoal().getIncludeWords().iterator();
|
||||
String queryword;
|
||||
while (shi.hasNext()) {
|
||||
queryhash = shi.next();
|
||||
if (urlcomph.has(queryhash)) r += 256 << this.query.ranking.coeff_appurl;
|
||||
if (descrcomph.has(queryhash)) r += 256 << this.query.ranking.coeff_app_dc_title;
|
||||
queryword = shi.next();
|
||||
if (urlcomph.contains(queryword)) r += 256 << this.query.ranking.coeff_appurl;
|
||||
if (descrcomph.contains(queryword)) r += 256 << this.query.ranking.coeff_app_dc_title;
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
@ -1642,7 +1641,7 @@ public final class SearchEvent {
|
|||
if ( word.length() > 2
|
||||
&& "http_html_php_ftp_www_com_org_net_gov_edu_index_home_page_for_usage_the_and_zum_der_die_das_und_the_zur_bzw_mit_blog_wiki_aus_bei_off"
|
||||
.indexOf(word) < 0
|
||||
&& !this.query.getQueryGoal().getIncludeHashes().has(Word.word2hash(word))
|
||||
&& !this.query.getQueryGoal().getIncludeWords().contains(word)
|
||||
&& lettermatch.matcher(word).matches()
|
||||
&& !Switchboard.badwords.contains(word)
|
||||
&& !Switchboard.stopwords.contains(word) ) {
|
||||
|
|
Loading…
Reference in New Issue
Block a user