fix for urlmaskfilter

This commit is contained in:
Michael Peter Christen 2015-01-28 13:40:41 +01:00
parent 2636582435
commit 3d717b749a
5 changed files with 36 additions and 24 deletions

View File

@ -348,7 +348,7 @@ public class yacysearch {
}
if ( !block ) {
String urlmask = null;
String urlmask = (post == null) ? ".*" : post.get("urlmaskfilter", ".*"); // the expression must be a subset of the java Match syntax described in http://lucene.apache.org/core/4_4_0/core/org/apache/lucene/util/automaton/RegExp.html
String tld = null;
String inlink = null;
@ -406,7 +406,7 @@ public class yacysearch {
final String urlstr = querystring.substring(inurlp + 6, ftb);
querystring = querystring.replace("inurl:" + urlstr, "");
if ( !urlstr.isEmpty() ) {
urlmask = urlmask == null ? ".*" + urlstr + ".*" : urlmask + urlstr + ".*";
urlmask = urlmask == null || urlmask.equals(".*") ? ".*" + urlstr + ".*" : urlmask; // we cannot join the conditions; if an urlmask is already given then stay with that
}
modifier.add("inurl:" + urlstr);
}

View File

@ -217,8 +217,11 @@ public class URIMetadataNode extends SolrDocument {
return this.url;
}
public boolean matches(Pattern matcher) {
return matcher.matcher(this.url.toString().toLowerCase()).matches();
/**
 * Tests whether this document's URL matches the given regular expression.
 * <p>
 * The URL is rendered with {@code toNormalform(true)} and lower-cased before matching, so
 * the pattern has to be written against lower-case text. The whole string must match
 * (see {@link java.util.regex.Matcher#matches()}), not merely a part of it.
 *
 * @param pattern the compiled URL mask to test against
 * @return {@code true} if the entire lower-cased URL string matches {@code pattern}
 */
public boolean matches(Pattern pattern) {
    // Lower-case with Locale.ROOT: the default-locale variant maps 'I' to a dotless 'ı'
    // under e.g. a Turkish locale, which would make ASCII URL masks fail to match.
    final String normalizedUrl = this.url.toNormalform(true).toLowerCase(java.util.Locale.ROOT);
    // NOTE: an automaton-based match (Lucene CharacterRunAutomaton) was sketched here but is not in use.
    return pattern.matcher(normalizedUrl).matches();
}
public String dc_title() {

View File

@ -835,7 +835,6 @@ public final class Protocol {
baos = null;
}
String filter = event.query.urlMask.pattern().toString();
parts.put("myseed", UTF8.StringBody((event.peers.mySeed() == null) ? "" : event.peers.mySeed().genSeedStr(key)));
parts.put("count", UTF8.StringBody(Integer.toString(Math.max(10, count))));
parts.put("time", UTF8.StringBody(Long.toString(Math.max(3000, time))));
@ -845,7 +844,7 @@ public final class Protocol {
parts.put("duetime", UTF8.StringBody("1000"));
parts.put("urls", UTF8.StringBody(urlhashes));
parts.put("prefer", UTF8.StringBody(event.query.prefer.pattern()));
parts.put("filter", UTF8.StringBody(filter));
parts.put("filter", UTF8.StringBody(event.query.urlMaskString));
parts.put("modifier", UTF8.StringBody(event.query.modifier.toString()));
parts.put("language", UTF8.StringBody(language));
parts.put("sitehash", UTF8.StringBody(event.query.modifier.sitehash));

View File

@ -33,6 +33,7 @@ import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;
import java.util.SortedSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
@ -62,6 +63,8 @@ import net.yacy.search.ranking.RankingProfile;
import net.yacy.search.schema.CollectionConfiguration;
import net.yacy.search.schema.CollectionSchema;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrQuery.SortClause;
import org.apache.solr.common.params.CommonParams;
@ -103,7 +106,9 @@ public final class QueryParams {
private final QueryGoal queryGoal;
public int itemsPerPage;
public int offset;
public Pattern urlMask;
public Pattern urlMaskPattern;
public Automaton urlMaskAutomaton;
public String urlMaskString;
public final Pattern prefer;
public final String tld, inlink;
@ -175,20 +180,32 @@ public final class QueryParams {
this.itemsPerPage = Math.min((specialRights) ? 10000 : 1000, itemsPerPage);
this.offset = Math.max(0, Math.min((specialRights) ? 10000 - this.itemsPerPage : 1000 - this.itemsPerPage, offset));
try {
this.urlMask = Pattern.compile(urlMask.toLowerCase());
} catch (final PatternSyntaxException ex) {
this.urlMaskString = urlMask;
// solr doesn't like slashes, backslashes or doublepoints; remove them // urlmask = ".*\\." + ft + "(\\?.*)?";
int p;
while ((p = this.urlMaskString.indexOf(':')) >= 0) this.urlMaskString = this.urlMaskString.substring(0, p) + "." + this.urlMaskString.substring(p + 1);
while ((p = this.urlMaskString.indexOf('/')) >= 0) this.urlMaskString = this.urlMaskString.substring(0, p) + "." + this.urlMaskString.substring(p + 1);
while ((p = this.urlMaskString.indexOf('\\')) >= 0) this.urlMaskString = this.urlMaskString.substring(0, p) + "." + this.urlMaskString.substring(p + 2);
this.urlMaskAutomaton = Automata.makeString(this.urlMaskString);
this.urlMaskPattern = Pattern.compile(this.urlMaskString);
} catch (final Throwable ex) {
throw new IllegalArgumentException("Not a valid regular expression: " + urlMask, ex);
}
this.urlMask_isCatchall = this.urlMask.toString().equals(catchall_pattern.toString());
this.urlMask_isCatchall = this.urlMaskString.equals(catchall_pattern.toString());
if (this.urlMask_isCatchall) {
String protocolfilter = modifier.protocol == null ? ".*" : modifier.protocol;
String defaulthostprefix = modifier.protocol == null ? "www" : modifier.protocol;
String hostfilter = modifier.sitehost == null && tld == null ? ".*" : modifier.sitehost == null ? ".*\\." + tld : modifier.sitehost.startsWith(defaulthostprefix + ".") ? "(" + defaulthostprefix + "\\.)?" + modifier.sitehost.substring(4) : "(" + defaulthostprefix + "\\.)?" + modifier.sitehost;
String filefilter = modifier.filetype == null ? ".*" : ".*" + modifier.filetype + ".*";
String filter = protocolfilter + "://" + hostfilter + "/" + filefilter;
if (!filter.equals(".*://.*/.*")) {
this.urlMask = Pattern.compile(filter);
String filter = protocolfilter + "..." + hostfilter + "." + filefilter;
if (!filter.equals(".*....*..*")) {
Pattern r = Pattern.compile("(\\.|(\\.\\*))\\.\\*");
Matcher m;
while ((m = r.matcher(filter)).find()) filter = m.replaceAll(".*");
this.urlMaskString = filter;
this.urlMaskAutomaton = Automata.makeString(filter);
this.urlMask_isCatchall = false;
this.urlMaskPattern = Pattern.compile(filter);
}
}
this.tld = tld;
@ -503,14 +520,7 @@ public final class QueryParams {
if (!this.urlMask_isCatchall) {
// add a filter query on urls
String urlMaskPattern = this.urlMask.pattern();
// solr doesn't like slashes, backslashes or doublepoints; remove them // urlmask = ".*\\." + ft + "(\\?.*)?";
int p;
while ((p = urlMaskPattern.indexOf(':')) >= 0) urlMaskPattern = urlMaskPattern.substring(0, p) + "." + urlMaskPattern.substring(p + 1);
while ((p = urlMaskPattern.indexOf('/')) >= 0) urlMaskPattern = urlMaskPattern.substring(0, p) + "." + urlMaskPattern.substring(p + 1);
while ((p = urlMaskPattern.indexOf('\\')) >= 0) urlMaskPattern = urlMaskPattern.substring(0, p) + "." + urlMaskPattern.substring(p + 2);
fq.append(" AND ").append(CollectionSchema.sku.getSolrFieldName() + ":/" + urlMaskPattern + "/");
fq.append(" AND ").append(CollectionSchema.sku.getSolrFieldName() + ":/" + this.urlMaskString + "/");
}
if (this.radius > 0.0d && this.lat != 0.0d && this.lon != 0.0d) {
@ -583,7 +593,7 @@ public final class QueryParams {
context.append(this.zonecode).append(asterisk);
context.append(ASCII.String(Word.word2hash(this.ranking.toExternalString()))).append(asterisk);
context.append(Base64Order.enhancedCoder.encodeString(this.prefer.toString())).append(asterisk);
context.append(Base64Order.enhancedCoder.encodeString(this.urlMask.toString())).append(asterisk);
context.append(Base64Order.enhancedCoder.encodeString(this.urlMaskString)).append(asterisk);
context.append(this.modifier.sitehash).append(asterisk);
context.append(this.modifier.author).append(asterisk);
context.append(this.modifier.protocol).append(asterisk);

View File

@ -893,7 +893,7 @@ public final class SearchEvent {
if ( !this.query.urlMask_isCatchall ) {
// check url mask
if (!iEntry.matches(this.query.urlMask)) {
if (!iEntry.matches(this.query.urlMaskPattern)) {
if (log.isFine()) log.fine("dropped Node: url mask does not match");
continue pollloop;
}
@ -1114,7 +1114,7 @@ public final class SearchEvent {
URIMetadataNode page;
mainloop: while ((page = pullOneRWI(skipDoubleDom)) != null) {
if (!this.query.urlMask_isCatchall && !page.matches(this.query.urlMask)) {
if (!this.query.urlMask_isCatchall && !page.matches(this.query.urlMaskPattern)) {
if (log.isFine()) log.fine("dropped RWI: no match with urlMask");
if (page.word().local()) this.local_rwi_available.decrementAndGet(); else this.remote_rwi_available.decrementAndGet();
continue;