distinguishing modified query string and original query string

This commit is contained in:
Michael Peter Christen 2012-12-15 00:05:46 +01:00
parent fb0fa9a102
commit cb5cbec14d
13 changed files with 79 additions and 97 deletions

View File

@ -165,7 +165,7 @@ public class AccessTracker_p {
if (page == 2) {
// local search
prop.putNum("page_list_" + m + "_offset", query.offset);
prop.putHTML("page_list_" + m + "_querystring", query.getQueryGoal().getQueryString());
prop.putHTML("page_list_" + m + "_querystring", query.getQueryGoal().getOriginalQueryString(false));
} else {
// remote search
prop.putHTML("page_list_" + m + "_peername", (query.remotepeer == null) ? "<unknown>" : query.remotepeer.getName());

View File

@ -649,7 +649,8 @@ public class IndexControlRWIs_p {
final byte[] keyhash,
final Bitfield filter) {
final QueryParams query = new QueryParams(ASCII.String(keyhash), -1, filter, segment, sb.getRanking(), "IndexControlRWIs_p");
String khw = ASCII.String(keyhash);
final QueryParams query = new QueryParams(khw, khw, -1, filter, segment, sb.getRanking(), "IndexControlRWIs_p");
final SearchEvent theSearch = SearchEventCache.getEvent(query, sb.peers, sb.tables, null, false, sb.loader, Integer.MAX_VALUE, Long.MAX_VALUE, (int) sb.getConfigLong(SwitchboardConstants.DHT_BURST_ROBINSON, 0), (int) sb.getConfigLong(SwitchboardConstants.DHT_BURST_MULTIWORD, 0));
//theSearch.rankingProcess.run();
RankingProcess ranked = theSearch.rankingProcess;

View File

@ -67,7 +67,7 @@ public final class timeline {
language = (agent == null) ? "en" : ISO639.userAgentLanguageDetection(agent);
if (language == null) language = "en";
}
final QueryGoal qg = new QueryGoal(querystring);
final QueryGoal qg = new QueryGoal(querystring, querystring);
HandleSet q = qg.getIncludeHashes();
// tell all threads to do nothing for a specific time

View File

@ -113,7 +113,7 @@ public class searchresult {
post.put("originalQuery", originalQuery);
// get a solr query string
QueryGoal qg = new QueryGoal(originalQuery);
QueryGoal qg = new QueryGoal(originalQuery, originalQuery);
StringBuilder solrQ = qg.solrQueryString(sb.index.fulltext().getSolrScheme());
post.put("defType", "edismax");
post.put(CommonParams.Q, solrQ.toString());

View File

@ -356,8 +356,9 @@ public class yacysearch {
final RankingProfile ranking = sb.getRanking();
final StringBuilder modifier = new StringBuilder(20);
if ("*".equals(querystring)) {
querystring = Segment.catchallString;
int stp = querystring.indexOf('*');
if (stp >= 0) {
querystring = querystring.substring(0, stp) + Segment.catchallString + querystring.substring(stp + 1);
}
if ( querystring.indexOf("/near", 0) >= 0 ) {
querystring = querystring.replace("/near", "");
@ -597,7 +598,7 @@ public class yacysearch {
}
// the query
final QueryGoal qg = new QueryGoal(querystring.trim());
final QueryGoal qg = new QueryGoal(originalquerystring, querystring.trim());
final int maxDistance = (querystring.indexOf('"', 0) >= 0) ? qg.getAllHashes().size() - 1 : Integer.MAX_VALUE;
// filter out stopwords
@ -766,7 +767,7 @@ public class yacysearch {
Log.logInfo(
"LOCAL_SEARCH",
"INIT WORD SEARCH: "
+ theQuery.getQueryGoal().getQueryString()
+ theQuery.getQueryGoal().getOriginalQueryString(false)
+ ":"
+ QueryParams.hashSet2hashString(theQuery.getQueryGoal().getIncludeHashes())
+ " - "
@ -775,7 +776,7 @@ public class yacysearch {
+ theQuery.itemsPerPage()
+ " lines to be displayed");
EventChannel.channels(EventChannel.LOCALSEARCH).addMessage(
new RSSMessage("Local Search Request", theQuery.getQueryGoal().getQueryString(), ""));
new RSSMessage("Local Search Request", theQuery.getQueryGoal().getOriginalQueryString(false), ""));
final long timestamp = System.currentTimeMillis();
// create a new search event
@ -814,7 +815,7 @@ public class yacysearch {
// log
Log.logInfo("LOCAL_SEARCH", "EXIT WORD SEARCH: "
+ theQuery.getQueryGoal().getQueryString()
+ theQuery.getQueryGoal().getOriginalQueryString(false)
+ " - "
+ "local_rwi_available(" + theSearch.query.local_rwi_available.get() + "), "
+ "local_rwi_stored(" + theSearch.query.local_rwi_stored.get() + "), "

View File

@ -122,6 +122,7 @@ public class yacysearchitem {
faviconURL = null;
}
final String resource = theSearch.query.domType.toString();
final String origQ = theSearch.query.getQueryGoal().getOriginalQueryString(true);
prop.put("content", 1); // switch on specific content
prop.put("content_showDate", sb.getConfigBool("search.result.show.date", true) ? 1 : 0);
prop.put("content_showSize", sb.getConfigBool("search.result.show.size", true) ? 1 : 0);
@ -134,10 +135,10 @@ public class yacysearchitem {
prop.put("content_authorized", authenticated ? "1" : "0");
final String urlhash = ASCII.String(result.hash());
prop.put("content_authorized_bookmark", sb.tables.bookmarks.hasBookmark("admin", urlhash) ? "0" : "1");
prop.putHTML("content_authorized_bookmark_bookmarklink", "/yacysearch.html?query=" + theSearch.query.getQueryGoal().getQueryString().replace(' ', '+') + "&Enter=Search&count=" + theSearch.query.itemsPerPage() + "&offset=" + (theSearch.query.neededResults() - theSearch.query.itemsPerPage()) + "&order=" + crypt.simpleEncode(theSearch.query.ranking.toExternalString()) + "&resource=" + resource + "&time=3&bookmarkref=" + urlhash + "&urlmaskfilter=.*");
prop.putHTML("content_authorized_bookmark_bookmarklink", "/yacysearch.html?query=" + origQ.replace(' ', '+') + "&Enter=Search&count=" + theSearch.query.itemsPerPage() + "&offset=" + (theSearch.query.neededResults() - theSearch.query.itemsPerPage()) + "&order=" + crypt.simpleEncode(theSearch.query.ranking.toExternalString()) + "&resource=" + resource + "&time=3&bookmarkref=" + urlhash + "&urlmaskfilter=.*");
prop.put("content_authorized_recommend", (sb.peers.newsPool.getSpecific(NewsPool.OUTGOING_DB, NewsPool.CATEGORY_SURFTIPP_ADD, "url", resultUrlstring) == null) ? "1" : "0");
prop.putHTML("content_authorized_recommend_deletelink", "/yacysearch.html?query=" + theSearch.query.getQueryGoal().getQueryString().replace(' ', '+') + "&Enter=Search&count=" + theSearch.query.itemsPerPage() + "&offset=" + (theSearch.query.neededResults() - theSearch.query.itemsPerPage()) + "&order=" + crypt.simpleEncode(theSearch.query.ranking.toExternalString()) + "&resource=" + resource + "&time=3&deleteref=" + urlhash + "&urlmaskfilter=.*");
prop.putHTML("content_authorized_recommend_recommendlink", "/yacysearch.html?query=" + theSearch.query.getQueryGoal().getQueryString().replace(' ', '+') + "&Enter=Search&count=" + theSearch.query.itemsPerPage() + "&offset=" + (theSearch.query.neededResults() - theSearch.query.itemsPerPage()) + "&order=" + crypt.simpleEncode(theSearch.query.ranking.toExternalString()) + "&resource=" + resource + "&time=3&recommendref=" + urlhash + "&urlmaskfilter=.*");
prop.putHTML("content_authorized_recommend_deletelink", "/yacysearch.html?query=" + origQ.replace(' ', '+') + "&Enter=Search&count=" + theSearch.query.itemsPerPage() + "&offset=" + (theSearch.query.neededResults() - theSearch.query.itemsPerPage()) + "&order=" + crypt.simpleEncode(theSearch.query.ranking.toExternalString()) + "&resource=" + resource + "&time=3&deleteref=" + urlhash + "&urlmaskfilter=.*");
prop.putHTML("content_authorized_recommend_recommendlink", "/yacysearch.html?query=" + origQ.replace(' ', '+') + "&Enter=Search&count=" + theSearch.query.itemsPerPage() + "&offset=" + (theSearch.query.neededResults() - theSearch.query.itemsPerPage()) + "&order=" + crypt.simpleEncode(theSearch.query.ranking.toExternalString()) + "&resource=" + resource + "&time=3&recommendref=" + urlhash + "&urlmaskfilter=.*");
prop.put("content_authorized_urlhash", urlhash);
final String resulthashString = urlhash;
prop.putHTML("content_title", result.title());
@ -214,8 +215,8 @@ public class yacysearchitem {
final String words = (s.length() > 0) ? s.substring(1) : "";
prop.putHTML("content_words", words);
prop.putHTML("content_showParser_words", words);
prop.putHTML("content_former", theSearch.query.getQueryGoal().getQueryString());
prop.putHTML("content_showPictures_former", theSearch.query.getQueryGoal().getQueryString());
prop.putHTML("content_former", origQ);
prop.putHTML("content_showPictures_former", origQ);
final TextSnippet snippet = result.textSnippet();
final String desc = (snippet == null) ? "" : snippet.isMarked() ? snippet.getLineRaw() : snippet.getLineMarked(theSearch.query.getQueryGoal());
prop.put("content_description", desc);

View File

@ -79,7 +79,7 @@ public class yacysearchtrailer {
count = theSearch.namespaceNavigator.get(name);
if (count == 0) break;
nav = "inurl%3A" + name;
queryStringForUrl = theSearch.query.getQueryGoal().queryStringForUrl();
queryStringForUrl = theSearch.query.getQueryGoal().getOriginalQueryString(true);
p = queryStringForUrl.indexOf(nav);
if (p < 0) {
pos++;
@ -119,7 +119,7 @@ public class yacysearchtrailer {
count = hostNavigator.get(name);
if (count == 0) break;
nav = "site%3A" + name;
queryStringForUrl = theSearch.query.getQueryGoal().queryStringForUrl();
queryStringForUrl = theSearch.query.getQueryGoal().getOriginalQueryString(true);
p = queryStringForUrl.indexOf(nav);
if (p < 0) {
pos++;
@ -158,7 +158,7 @@ public class yacysearchtrailer {
count = theSearch.authorNavigator.get(name);
if (count == 0) break;
nav = (name.indexOf(' ', 0) < 0) ? "author%3A" + name : "author%3A%28" + name.replace(" ", "+") + "%29";
queryStringForUrl = theSearch.query.getQueryGoal().queryStringForUrl();
queryStringForUrl = theSearch.query.getQueryGoal().getOriginalQueryString(true);
p = queryStringForUrl.indexOf(nav);
if (p < 0) {
pos++;
@ -197,9 +197,9 @@ public class yacysearchtrailer {
name = navigatorIterator.next();
count = topicNavigator.get(name);
if (count == 0) break;
if (theSearch.query.getQueryGoal().getQueryString() == null) break;
queryStringForUrl = theSearch.query.getQueryGoal().getOriginalQueryString(true);
if (queryStringForUrl == null) break;
if (name != null) {
queryStringForUrl = theSearch.query.getQueryGoal().queryStringForUrl();
prop.put("nav-topics_element_" + i + "_on", 1);
prop.put(fileType, "nav-topics_element_" + i + "_modifier", name);
prop.put(fileType, "nav-topics_element_" + i + "_name", name);
@ -227,7 +227,7 @@ public class yacysearchtrailer {
count = theSearch.protocolNavigator.get(name);
if (count == 0) break;
nav = "%2F" + name;
queryStringForUrl = theSearch.query.getQueryGoal().queryStringForUrl();
queryStringForUrl = theSearch.query.getQueryGoal().getOriginalQueryString(true);
p = queryStringForUrl.indexOf(nav);
if (p < 0) {
pos++;
@ -266,7 +266,7 @@ public class yacysearchtrailer {
count = theSearch.filetypeNavigator.get(name);
if (count == 0) break;
nav = "filetype%3A" + name;
queryStringForUrl = theSearch.query.getQueryGoal().queryStringForUrl();
queryStringForUrl = theSearch.query.getQueryGoal().getOriginalQueryString(true);
p = queryStringForUrl.indexOf(nav);
if (p < 0) {
pos++;
@ -310,7 +310,7 @@ public class yacysearchtrailer {
count = ve.getValue().get(name);
if (count == 0) break;
nav = "%2Fvocabulary%2F" + navname + "%2F" + MultiProtocolURI.escape(Tagging.encodePrintname(name)).toString();
queryStringForUrl = theSearch.query.getQueryGoal().queryStringForUrl();
queryStringForUrl = theSearch.query.getQueryGoal().getOriginalQueryString(true);
p = queryStringForUrl.indexOf(nav);
if (p < 0) {
queryStringForUrl += "+" + nav;
@ -354,8 +354,9 @@ public class yacysearchtrailer {
prop.put("cat-location", 0);
} else {
prop.put("cat-location", 1);
prop.put(fileType, "cat-location_query", theSearch.query.queryString(true));
prop.put(fileType, "cat-location_queryenc", theSearch.query.queryString(true).replace(' ', '+'));
String uriginalQuery = theSearch.query.getQueryGoal().getOriginalQueryString(true);
prop.put(fileType, "cat-location_query", uriginalQuery);
prop.put(fileType, "cat-location_queryenc", uriginalQuery.replace(' ', '+'));
}
prop.put("num-results_totalcount", theSearch.query.getResultCount());
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(theSearch.query.id(true), SearchEventType.FINALIZATION, "bottomline", 0, 0), false);

View File

@ -1034,7 +1034,7 @@ public final class Protocol
final Seed target,
final Blacklist blacklist) {
if (event.query.getQueryGoal().getQueryString() == null || event.query.getQueryGoal().getQueryString().length() == 0) {
if (event.query.getQueryGoal().getOriginalQueryString(false) == null || event.query.getQueryGoal().getOriginalQueryString(false).length() == 0) {
return -1; // we cannot query solr only with word hashes, there is no clear text string
}
event.addExpectedRemoteReferences(count);

View File

@ -3321,7 +3321,7 @@ public final class Switchboard extends serverSwitch {
new Thread() {
@Override
public void run() {
String queryString = searchEvent.query.queryString(true);
String queryString = searchEvent.query.getQueryGoal().getOriginalQueryString(false);
Thread.currentThread().setName("Switchboard.heuristicRSS:" + queryString);
final int meta = queryString.indexOf("heuristic:", 0);
if ( meta >= 0 ) {

View File

@ -67,9 +67,10 @@ public class AccessTracker {
private static void add(final LinkedList<QueryParams> list, final QueryParams query) {
// learn that this word can be a word completion for the DidYouMeanLibrary
if (query.getResultCount() > 10 && query.getQueryGoal().getQueryString() != null && query.getQueryGoal().getQueryString().length() > 0) {
final StringBuilder sb = new StringBuilder(query.getQueryGoal().getQueryString());
sb.append(query.getQueryGoal().getQueryString());
String queryString = query.getQueryGoal().getOriginalQueryString(false);
if (query.getResultCount() > 10 && queryString != null && queryString.length() > 0) {
final StringBuilder sb = new StringBuilder(queryString);
sb.append(queryString);
WordCache.learn(sb);
}
@ -108,8 +109,9 @@ public class AccessTracker {
}
private static void addToDump(final QueryParams query) {
if (query.getQueryGoal().getQueryString() == null || query.getQueryGoal().getQueryString().isEmpty()) return;
addToDump(query.getQueryGoal().getQueryString(), Integer.toString(query.getResultCount()), new Date(query.starttime));
String queryString = query.getQueryGoal().getOriginalQueryString(false);
if (queryString == null || queryString.isEmpty()) return;
addToDump(queryString, Integer.toString(query.getResultCount()), new Date(query.starttime));
}
public static void addToDump(String querystring, String resultcount) {

View File

@ -28,15 +28,10 @@ import java.util.SortedSet;
import net.yacy.cora.federate.solr.Boost;
import net.yacy.cora.federate.solr.YaCySchema;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.document.parser.html.AbstractScraper;
import net.yacy.document.parser.html.CharacterCoding;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReferenceRow;
import net.yacy.kelondro.index.RowHandleSet;
import net.yacy.kelondro.logging.Log;
import net.yacy.search.index.Segment;
import net.yacy.search.index.SolrConfiguration;
@ -48,14 +43,15 @@ public class QueryGoal {
private static char dq = '"';
private static String seps = ".,/&_";
private String querystring;
private String query_original, query_words;
private HandleSet include_hashes, exclude_hashes, all_hashes;
private final ArrayList<String> include_words, exclude_words, all_words;
private final ArrayList<String> include_strings, exclude_strings, all_strings;
public QueryGoal(HandleSet include_hashes, HandleSet exclude_hashes, HandleSet all_hashes) {
this.querystring = null;
this.query_original = null;
this.query_words = null;
this.include_words = null;
this.exclude_words = null;
this.all_words = null;
@ -66,35 +62,12 @@ public class QueryGoal {
this.exclude_hashes = exclude_hashes;
this.all_hashes = all_hashes;
}
public QueryGoal(byte[] queryHash) {
assert querystring != null;
assert queryHash.length == 12;
assert Base64Order.enhancedCoder.wellformed(queryHash);
this.querystring = null;
this.include_words = new ArrayList<String>();
this.exclude_words = new ArrayList<String>();
this.all_words = new ArrayList<String>();
this.include_strings = new ArrayList<String>();
this.exclude_strings = new ArrayList<String>();
this.all_strings = new ArrayList<String>();
this.include_hashes = new RowHandleSet(WordReferenceRow.urlEntryRow.primaryKeyLength, WordReferenceRow.urlEntryRow.objectOrder, 0);
this.exclude_hashes = new RowHandleSet(WordReferenceRow.urlEntryRow.primaryKeyLength, WordReferenceRow.urlEntryRow.objectOrder, 0);
this.all_hashes = new RowHandleSet(WordReferenceRow.urlEntryRow.primaryKeyLength, WordReferenceRow.urlEntryRow.objectOrder, 0);
try {
this.include_hashes.put(queryHash);
this.all_hashes.put(queryHash);
} catch (final SpaceExceededException e) {
Log.logException(e);
}
this.include_hashes = null;
this.exclude_hashes = null;
this.all_hashes = null;
}
public QueryGoal(String querystring) {
assert querystring != null;
this.querystring = querystring;
public QueryGoal(String query_original, String query_words) {
assert query_original != null;
assert query_words != null;
this.query_original = query_original;
this.query_words = query_words;
this.include_words = new ArrayList<String>();
this.exclude_words = new ArrayList<String>();
this.all_words = new ArrayList<String>();
@ -103,16 +76,16 @@ public class QueryGoal {
this.all_strings = new ArrayList<String>();
// remove funny symbols
querystring = CharacterCoding.html2unicode(AbstractScraper.stripAllTags(querystring.toCharArray())).toLowerCase().trim();
this.query_words = CharacterCoding.html2unicode(AbstractScraper.stripAllTags(this.query_words.toCharArray())).toLowerCase().trim();
int c;
for (int i = 0; i < seps.length(); i++) {
while ((c = querystring.indexOf(seps.charAt(i))) >= 0) {
querystring = querystring.substring(0, c) + (((c + 1) < querystring.length()) ? (' ' + querystring.substring(c + 1)) : "");
while ((c = this.query_words.indexOf(seps.charAt(i))) >= 0) {
this.query_words = this.query_words.substring(0, c) + (((c + 1) < this.query_words.length()) ? (' ' + this.query_words.substring(c + 1)) : "");
}
}
// parse first quoted strings
parseQuery(querystring, this.include_strings, this.exclude_strings, this.all_strings);
parseQuery(this.query_words, this.include_strings, this.exclude_strings, this.all_strings);
// .. end then take these strings apart to generate word lists
for (String s: this.include_strings) parseQuery(s, this.include_words, this.include_words, this.all_words);
@ -168,17 +141,31 @@ public class QueryGoal {
}
}
/**
 * Returns the query string stored in this goal.
 *
 * @return the value of the {@code querystring} field; this is {@code null}
 *         for goals created by a constructor that assigns {@code null} to it
 */
public String getQueryString() {
return this.querystring;
}
public String queryStringForUrl() {
try {
return URLEncoder.encode(this.querystring, "UTF-8");
} catch (final UnsupportedEncodingException e) {
Log.logException(e);
return this.querystring;
/**
 * Returns the query string as it was originally handed to this goal,
 * i.e. the unmodified {@code query_original} value.
 *
 * @param encodeHTML if {@code true}, the string is URL-encoded
 *        (application/x-www-form-urlencoded, UTF-8) so that it can be embedded
 *        into a link; despite the parameter name no HTML entity encoding is done
 * @return the original query string, URL-encoded on request, or {@code null}
 *         if this goal carries no original query string
 */
public String getOriginalQueryString(final boolean encodeHTML) {
    // Goals built from word hashes only have no text; URLEncoder.encode would
    // throw a NullPointerException for null, so hand the null back to the caller.
    if (this.query_original == null) {
        return null;
    }
    if (!encodeHTML) {
        return this.query_original;
    }
    try {
        return URLEncoder.encode(this.query_original, "UTF-8");
    } catch (final UnsupportedEncodingException e) {
        // UTF-8 is a charset every Java platform must support, so this branch
        // is not expected to run; fall back to the raw string as before.
        return this.query_original;
    }
}
/**
 * Returns the word-level query string of this goal, i.e. the
 * {@code query_words} value that is used for parsing the query.
 *
 * @param encodeHTML if {@code true}, the string is URL-encoded
 *        (application/x-www-form-urlencoded, UTF-8) so that it can be embedded
 *        into a link; despite the parameter name no HTML entity encoding is done
 * @return the word query string, URL-encoded on request, or {@code null}
 *         if this goal carries no word query string
 */
public String getWordQueryString(final boolean encodeHTML) {
    // Goals built from word hashes only have no text; URLEncoder.encode would
    // throw a NullPointerException for null, so hand the null back to the caller.
    if (this.query_words == null) {
        return null;
    }
    if (!encodeHTML) {
        return this.query_words;
    }
    try {
        return URLEncoder.encode(this.query_words, "UTF-8");
    } catch (final UnsupportedEncodingException e) {
        // UTF-8 is a charset every Java platform must support, so this branch
        // is not expected to run; fall back to the raw string as before.
        return this.query_words;
    }
}
public HandleSet getIncludeHashes() {

View File

@ -153,13 +153,13 @@ public final class QueryParams {
public final SortedSet<byte[]> misses; // url hashes that had been sorted out because of constraints in postranking
public QueryParams(
final String queryString,
final String query_original, final String query_words,
final int itemsPerPage,
final Bitfield constraint,
final Segment indexSegment,
final RankingProfile ranking,
final String userAgent) {
this.queryGoal = new QueryGoal(queryString);
this.queryGoal = new QueryGoal(query_original, query_words);
this.ranking = ranking;
this.modifier = new Modifier("");
this.maxDistance = Integer.MAX_VALUE;
@ -398,17 +398,6 @@ public final class QueryParams {
return SetTools.anymatch(wordhashes, keyhashes);
}
/**
 * Returns the query string of the query goal held by these parameters.
 *
 * @param encodeHTML if {@code true}, the string is passed through
 *        {@code CharacterCoding.unicode2html(..., true)} before it is returned;
 *        if {@code false}, it is returned unchanged
 * @return the result of {@code queryGoal.getQueryString()}, converted on request
 */
public String queryString(final boolean encodeHTML) {
final String ret;
if (encodeHTML){
ret = CharacterCoding.unicode2html(this.queryGoal.getQueryString(), true);
} else {
ret = this.queryGoal.getQueryString();
}
return ret;
}
public SolrQuery solrQuery() {
if (this.queryGoal.getIncludeStrings().size() == 0) return null;
// get text query
@ -588,7 +577,7 @@ public final class QueryParams {
sb.append("/yacysearch.");
sb.append(ext);
sb.append("?query=");
sb.append(newQueryString == null ? theQuery.getQueryGoal().queryStringForUrl() : newQueryString);
sb.append(newQueryString == null ? theQuery.getQueryGoal().getOriginalQueryString(true) : newQueryString);
sb.append(ampersand);
sb.append("maximumRecords=");
@ -619,7 +608,7 @@ public final class QueryParams {
sb.append(ampersand);
sb.append("former=");
sb.append(theQuery.getQueryGoal().queryStringForUrl());
sb.append(theQuery.getQueryGoal().getOriginalQueryString(true));
return sb;
}

View File

@ -235,7 +235,7 @@ public final class RankingProcess extends Thread {
new ProfilingGraph.EventSearch(
this.query.id(true),
SearchEventType.JOIN,
this.query.getQueryGoal().getQueryString(),
this.query.getQueryGoal().getOriginalQueryString(false),
index.size(),
System.currentTimeMillis() - timer),
false);