Michael Peter Christen d6b82840f8 added a feature to find similarities in documents.
This uses an enhanced version of the Nutch/Solr TextProfileSignature.
As a result, a signature of the document is written to the solr search
index. Additionally, each time a signature is written, it is
checked whether the signature already exists in the index. If the signature
does not exist, the document is marked as unique. The unique attribute
can now be used to sort document lists and bring duplicates to the end
of a result list.
To enable this, a large portion of the search api to Solr had to be
changed. This affected mainly caching of 'exists' searches to enhance
the check for existing signatures and do this without actually doing a
solr query.
Because this is the first time a long number is used as a value in the Solr
store, the value naming in the YaCySchema also had to be adapted and
normalized. As a result, many files had to be changed.
2012-11-21 18:46:49 +01:00

284 lines
11 KiB

* QueryGoal
* Copyright 2012 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
* First published 16.11.2005 on http://yacy.net
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
package net.yacy.search.query;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.SortedSet;
import net.yacy.cora.federate.solr.YaCySchema;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.document.parser.html.AbstractScraper;
import net.yacy.document.parser.html.CharacterCoding;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReferenceRow;
import net.yacy.kelondro.index.RowHandleSet;
import net.yacy.kelondro.logging.Log;
import net.yacy.search.index.Segment;
import net.yacy.search.index.SolrConfiguration;
/**
 * The goal of a search query: the terms that were asked for and the terms
 * that were excluded.
 * <p>
 * A goal is kept in three parallel representations -- phrases ("strings"),
 * single words and word hashes. Each representation has an include list, an
 * exclude list and an "all" list that collects the entries of both.
 */
public class QueryGoal {
// characters with a syntactic meaning inside a query string; these are
// constants and are therefore declared final
private static final char space = ' ';
private static final char sq = '\'';
private static final char dq = '"';
// each of these characters is replaced by a blank before a query string is parsed
private static final String seps = ".,/&_";
// the query string as passed to the String constructor; null for goals built from hashes
private String querystring;
// word hashes; may be null, in which case the getters derive them from the word lists
private HandleSet include_hashes, exclude_hashes, all_hashes;
// single words taken from the phrases below
private final ArrayList<String> include_words, exclude_words, all_words;
// phrases as parsed from the query string (quoted phrases stay in one piece)
private final ArrayList<String> include_strings, exclude_strings, all_strings;
/**
 * Creates a goal directly from word hash sets.
 * <p>
 * Such a goal has no textual form: the query string and all word and phrase
 * lists are set to {@code null}.
 *
 * @param include_hashes the hash set stored as include set
 * @param exclude_hashes the hash set stored as exclude set
 * @param all_hashes     the hash set stored as "all" set
 */
public QueryGoal(HandleSet include_hashes, HandleSet exclude_hashes, HandleSet all_hashes) {
    // the hash sets are the only content of this goal
    this.include_hashes = include_hashes;
    this.exclude_hashes = exclude_hashes;
    this.all_hashes = all_hashes;

    // no query text is known
    this.querystring = null;

    // no phrases ...
    this.include_strings = null;
    this.exclude_strings = null;
    this.all_strings = null;

    // ... and no words either
    this.include_words = null;
    this.exclude_words = null;
    this.all_words = null;
}
/**
 * Creates a goal for a single word hash.
 * <p>
 * The query string is set to null, all word and phrase lists start out empty
 * and the three hash sets are created as empty RowHandleSets that use the key
 * length and ordering of WordReferenceRow.urlEntryRow.
 * <p>
 * NOTE(review): in this view of the file the body of the try statement and the
 * closing braces of this constructor are not visible. They are left exactly as
 * found; confirm against the repository which statements belong inside the
 * try/catch.
 *
 * @param queryHash the word hash; the assertions require a non-null array of
 *        12 bytes that is well-formed for Base64Order.enhancedCoder
 */
public QueryGoal(byte[] queryHash) {
// The original asserted on the field 'querystring' here. The field is not yet
// assigned at this point (and is set to null below), so that assertion failed
// on every call when assertions were enabled. The parameter is what has to be checked.
assert queryHash != null;
assert queryHash.length == 12;
assert Base64Order.enhancedCoder.wellformed(queryHash);
this.querystring = null;
this.include_words = new ArrayList<String>();
this.exclude_words = new ArrayList<String>();
this.all_words = new ArrayList<String>();
this.include_strings = new ArrayList<String>();
this.exclude_strings = new ArrayList<String>();
this.all_strings = new ArrayList<String>();
this.include_hashes = new RowHandleSet(WordReferenceRow.urlEntryRow.primaryKeyLength, WordReferenceRow.urlEntryRow.objectOrder, 0);
this.exclude_hashes = new RowHandleSet(WordReferenceRow.urlEntryRow.primaryKeyLength, WordReferenceRow.urlEntryRow.objectOrder, 0);
this.all_hashes = new RowHandleSet(WordReferenceRow.urlEntryRow.primaryKeyLength, WordReferenceRow.urlEntryRow.objectOrder, 0);
try {
} catch (final SpaceExceededException e) {
this.include_hashes = null;
this.exclude_hashes = null;
this.all_hashes = null;
/**
 * Creates a goal by parsing a query string.
 * <p>
 * The given string is stored unchanged. For parsing, a working copy is
 * stripped of tags, converted from html entities to unicode, lower-cased and
 * trimmed; afterwards every character listed in {@code seps} is replaced by a
 * blank (or simply removed when it is the last character). The result is
 * split into phrases first and each phrase is then split into words.
 *
 * @param querystring the query string; must not be null
 */
public QueryGoal(String querystring) {
    assert querystring != null;
    this.querystring = querystring;

    this.include_strings = new ArrayList<String>();
    this.exclude_strings = new ArrayList<String>();
    this.all_strings = new ArrayList<String>();
    this.include_words = new ArrayList<String>();
    this.exclude_words = new ArrayList<String>();
    this.all_words = new ArrayList<String>();

    // remove funny symbols
    String cleaned = CharacterCoding.html2unicode(AbstractScraper.stripAllTags(querystring.toCharArray())).toLowerCase().trim();
    for (int k = 0; k < seps.length(); k++) {
        final char separator = seps.charAt(k);
        int pos = cleaned.indexOf(separator);
        while (pos >= 0) {
            final String head = cleaned.substring(0, pos);
            final boolean atEnd = pos + 1 >= cleaned.length();
            // a separator at the very end is dropped, any other one becomes a blank
            cleaned = atEnd ? head : head + ' ' + cleaned.substring(pos + 1);
            pos = cleaned.indexOf(separator);
        }
    }

    // parse quoted strings first ...
    parseQuery(cleaned, this.include_strings, this.exclude_strings, this.all_strings);

    // ... and then take these strings apart to generate the word lists; every word
    // of a phrase ends up in the list of its phrase, even one that starts with '-'
    for (final String phrase : this.include_strings) {
        parseQuery(phrase, this.include_words, this.include_words, this.all_words);
    }
    for (final String phrase : this.exclude_strings) {
        parseQuery(phrase, this.exclude_words, this.exclude_words, this.all_words);
    }

    // no hashes are computed here
    this.include_hashes = null;
    this.exclude_hashes = null;
    this.all_hashes = null;
}
* EBNF of a query
* query = {whitespace, phrase}, [whitespace]
* whitespace = space, {space}
* space = ' '
* phrase = ['-'], string
* string = {any character without sq, dq and whitespace} | sq, {any character without sq}, sq | dq, {any character without dq}, dq
* sq = '\''
* dq = '"'
private static void parseQuery(String s, ArrayList<String> include_string, ArrayList<String> exclude_string, ArrayList<String> all_string) {
while (s.length() > 0) {
// parse query
int p = 0;
while (p < s.length() && s.charAt(p) == space) p++;
s = s.substring(p);
if (s.length() == 0) return;
// parse phrase
boolean inc = true;
if (s.charAt(0) == '-') {inc = false; s = s.substring(1);}
if (s.length() == 0) return;
// parse string
char stop = space;
if (s.charAt(0) == dq) {stop = s.charAt(0); s = s.substring(1);}
if (s.charAt(0) == sq) {stop = s.charAt(0); s = s.substring(1);}
p = 0;
while (p < s.length() && s.charAt(p) != stop) p++;
String string = s.substring(0, p);
p++; // go behind the stop character (eats up space, sq and dq)
s = p < s.length() ? s.substring(p) : "";
if (string.length() > 0) {
if (!all_string.contains(string)) all_string.add(string);
if (inc) {
if (!include_string.contains(string)) include_string.add(string);
} else {
if (!exclude_string.contains(string)) exclude_string.add(string);
public String getQueryString() {
return this.querystring;
public String queryStringForUrl() {
try {
return URLEncoder.encode(this.querystring, "UTF-8");
} catch (final UnsupportedEncodingException e) {
return this.querystring;
public HandleSet getIncludeHashes() {
if (include_hashes == null) include_hashes = Word.words2hashesHandles(include_words);
return include_hashes;
public HandleSet getExcludeHashes() {
if (exclude_hashes == null) exclude_hashes = Word.words2hashesHandles(exclude_words);
return exclude_hashes;
public HandleSet getAllHashes() {
if (all_hashes == null) all_hashes = Word.words2hashesHandles(all_words);
return all_hashes;
public ArrayList<String> getIncludeStrings() {
return include_strings;
public ArrayList<String> getExcludeStrings() {
return exclude_strings;
public ArrayList<String> getAllStrings() {
return all_strings;
public void filterOut(final SortedSet<String> blueList) {
// filter out words that appear in this set
// this is applied to the queryHashes
final HandleSet blues = Word.words2hashesHandles(blueList);
for (final byte[] b: blues) this.include_hashes.remove(b);
// the index fields that solrQueryString() loops over when it combines the text query
// NOTE(review): the elements of this array and its closing "};" are not visible
// in this view of the file -- confirm the field list against the repository
private final static YaCySchema[] fields = new YaCySchema[]{
// Boost factor per index field. solrQueryString() looks a field up in this map
// and, if an entry exists, appends it to the query as "^<boost>".
private static final Map<YaCySchema, Float> boosts = new LinkedHashMap<YaCySchema, Float>();
static {
    // url and url path
    boosts.put(YaCySchema.sku, Float.valueOf(20.0f));
    boosts.put(YaCySchema.url_paths_sxt, Float.valueOf(20.0f));
    // title and headlines
    boosts.put(YaCySchema.title, Float.valueOf(15.0f));
    boosts.put(YaCySchema.h1_txt, Float.valueOf(11.0f));
    boosts.put(YaCySchema.h2_txt, Float.valueOf(10.0f));
    // metadata
    boosts.put(YaCySchema.author, Float.valueOf(8.0f));
    boosts.put(YaCySchema.description, Float.valueOf(5.0f));
    boosts.put(YaCySchema.keywords, Float.valueOf(2.0f));
    // full text
    boosts.put(YaCySchema.text_t, Float.valueOf(1.0f));
}
/**
 * Builds the Solr query for this goal from the include and exclude phrases.
 * <p>
 * NOTE(review): in this view of the file several statements of this method
 * (the bodies of the phrase loops, the statement that writes the per-field
 * term, and all closing braces) are not visible. The comments below describe
 * only what the visible lines do.
 *
 * @param configuration if not null, fields whose Solr field name is not
 *        contained in it are skipped
 * @return the query; "*:*" if the only phrase is Segment.catchallString
 */
public StringBuilder solrQueryString(SolrConfiguration configuration) {
final StringBuilder q = new StringBuilder(80);
// parse special requests
if (include_strings.size() == 1 && exclude_strings.size() == 0) {
String w = include_strings.get(0);
// a single catch-all phrase is answered with the Solr match-all query
if (Segment.catchallString.equals(w)) return new StringBuilder("*:*");
// add text query
// wc counts the terms written so far; w collects the combined term expression
int wc = 0;
StringBuilder w = new StringBuilder(80);
// include phrases are joined with " AND "
for (String s: include_strings) {
if (wc > 0) w.append(" AND ");
// exclude phrases are joined with " AND -", i.e. they are negated
for (String s: exclude_strings){
if (wc > 0) w.append(" AND -");
// more than one term: put the whole expression in parentheses
if (wc > 1) {w.insert(0, '('); w.append(')');}
// combine these queries for all relevant fields
wc = 0;
Float boost;
for (YaCySchema field: fields) {
// skip fields that the given configuration does not contain
if (configuration != null && !configuration.contains(field.getSolrFieldName())) continue;
// the per-field parts are alternatives
if (wc > 0) q.append(" OR ");
// a field with an entry in the boosts map gets "^<boost>" appended
boost = boosts.get(field);
if (boost != null) q.append('^').append(boost.toString());
// open a parenthesis in front of everything collected so far
q.insert(0, '(');
// add filter to prevent that results come from failed urls
q.append(" AND -").append(YaCySchema.failreason_t.getSolrFieldName()).append(":[* TO *]");
return q;