diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index 4743c59f7..95810815c 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -348,7 +348,7 @@ public class yacysearch { } if ( !block ) { - String urlmask = null; + String urlmask = (post == null) ? ".*" : post.get("urlmaskfilter", ".*"); // the expression must be a subset of the java Match syntax described in http://lucene.apache.org/core/4_4_0/core/org/apache/lucene/util/automaton/RegExp.html String tld = null; String inlink = null; @@ -406,7 +406,7 @@ public class yacysearch { final String urlstr = querystring.substring(inurlp + 6, ftb); querystring = querystring.replace("inurl:" + urlstr, ""); if ( !urlstr.isEmpty() ) { - urlmask = urlmask == null ? ".*" + urlstr + ".*" : urlmask + urlstr + ".*"; + urlmask = urlmask == null || urlmask.equals(".*") ? ".*" + urlstr + ".*" : urlmask; // we cannot join the conditions; if an urlmask is already given then stay with that } modifier.add("inurl:" + urlstr); } diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java index e2f53f647..4fa085932 100644 --- a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java +++ b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java @@ -217,8 +217,11 @@ public class URIMetadataNode extends SolrDocument { return this.url; } - public boolean matches(Pattern matcher) { - return matcher.matcher(this.url.toString().toLowerCase()).matches(); + public boolean matches(Pattern pattern) { + return pattern.matcher(this.url.toNormalform(true).toLowerCase()).matches(); + //CharacterRunAutomaton automaton = new CharacterRunAutomaton(matcher); + //boolean match = automaton.run(this.url.toNormalform(true).toLowerCase()); + //return match; } public String dc_title() { diff --git a/source/net/yacy/peers/Protocol.java b/source/net/yacy/peers/Protocol.java index 15915dbb4..a68191616 100644 --- a/source/net/yacy/peers/Protocol.java +++ b/source/net/yacy/peers/Protocol.java @@ -835,7 +835,6 @@ public final class Protocol { baos = null; } - String filter = event.query.urlMask.pattern().toString(); parts.put("myseed", UTF8.StringBody((event.peers.mySeed() == null) ? "" : event.peers.mySeed().genSeedStr(key))); parts.put("count", UTF8.StringBody(Integer.toString(Math.max(10, count)))); parts.put("time", UTF8.StringBody(Long.toString(Math.max(3000, time)))); @@ -845,7 +844,7 @@ public final class Protocol { parts.put("duetime", UTF8.StringBody("1000")); parts.put("urls", UTF8.StringBody(urlhashes)); parts.put("prefer", UTF8.StringBody(event.query.prefer.pattern())); - parts.put("filter", UTF8.StringBody(filter)); + parts.put("filter", UTF8.StringBody(event.query.urlMaskString)); parts.put("modifier", UTF8.StringBody(event.query.modifier.toString())); parts.put("language", UTF8.StringBody(language)); parts.put("sitehash", UTF8.StringBody(event.query.modifier.sitehash)); diff --git a/source/net/yacy/search/query/QueryParams.java b/source/net/yacy/search/query/QueryParams.java index 8eb371af3..9feeec421 100644 --- a/source/net/yacy/search/query/QueryParams.java +++ b/source/net/yacy/search/query/QueryParams.java @@ -33,6 +33,7 @@ import java.util.LinkedHashSet; import java.util.Map; import java.util.Set; import java.util.SortedSet; +import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; @@ -62,6 +63,8 @@ import net.yacy.search.ranking.RankingProfile; import net.yacy.search.schema.CollectionConfiguration; import net.yacy.search.schema.CollectionSchema; +import org.apache.lucene.util.automaton.Automata; +import org.apache.lucene.util.automaton.Automaton; import org.apache.solr.client.solrj.SolrQuery; import org.apache.solr.client.solrj.SolrQuery.SortClause; import org.apache.solr.common.params.CommonParams; @@ -103,7 +106,9 @@ public final class QueryParams { private final QueryGoal queryGoal; public int itemsPerPage; public int offset; - public Pattern urlMask; + public Pattern urlMaskPattern; + public Automaton urlMaskAutomaton; + public String urlMaskString; public final Pattern prefer; public final String tld, inlink; @@ -175,20 +180,32 @@ public final class QueryParams { this.itemsPerPage = Math.min((specialRights) ? 10000 : 1000, itemsPerPage); this.offset = Math.max(0, Math.min((specialRights) ? 10000 - this.itemsPerPage : 1000 - this.itemsPerPage, offset)); try { - this.urlMask = Pattern.compile(urlMask.toLowerCase()); - } catch (final PatternSyntaxException ex) { + this.urlMaskString = urlMask; + // solr doesn't like slashes, backslashes or doublepoints; remove them // urlmask = ".*\\." + ft + "(\\?.*)?"; + int p; + while ((p = this.urlMaskString.indexOf(':')) >= 0) this.urlMaskString = this.urlMaskString.substring(0, p) + "." + this.urlMaskString.substring(p + 1); + while ((p = this.urlMaskString.indexOf('/')) >= 0) this.urlMaskString = this.urlMaskString.substring(0, p) + "." + this.urlMaskString.substring(p + 1); + while ((p = this.urlMaskString.indexOf('\\')) >= 0) this.urlMaskString = this.urlMaskString.substring(0, p) + "." + this.urlMaskString.substring(p + 2); + this.urlMaskAutomaton = Automata.makeString(this.urlMaskString); + this.urlMaskPattern = Pattern.compile(this.urlMaskString); + } catch (final Throwable ex) { throw new IllegalArgumentException("Not a valid regular expression: " + urlMask, ex); } - this.urlMask_isCatchall = this.urlMask.toString().equals(catchall_pattern.toString()); + this.urlMask_isCatchall = this.urlMaskString.equals(catchall_pattern.toString()); if (this.urlMask_isCatchall) { String protocolfilter = modifier.protocol == null ? ".*" : modifier.protocol; String defaulthostprefix = modifier.protocol == null ? "www" : modifier.protocol; String hostfilter = modifier.sitehost == null && tld == null ? ".*" : modifier.sitehost == null ? ".*\\." + tld : modifier.sitehost.startsWith(defaulthostprefix + ".") ? "(" + defaulthostprefix + "\\.)?" + modifier.sitehost.substring(4) : "(" + defaulthostprefix + "\\.)?" + modifier.sitehost; String filefilter = modifier.filetype == null ? ".*" : ".*" + modifier.filetype + ".*"; - String filter = protocolfilter + "://" + hostfilter + "/" + filefilter; - if (!filter.equals(".*://.*/.*")) { - this.urlMask = Pattern.compile(filter); + String filter = protocolfilter + "..." + hostfilter + "." + filefilter; + if (!filter.equals(".*....*..*")) { + Pattern r = Pattern.compile("(\\.|(\\.\\*))\\.\\*"); + Matcher m; + while ((m = r.matcher(filter)).find()) filter = m.replaceAll(".*"); + this.urlMaskString = filter; + this.urlMaskAutomaton = Automata.makeString(filter); this.urlMask_isCatchall = false; + this.urlMaskPattern = Pattern.compile(filter); } } this.tld = tld; @@ -503,14 +520,7 @@ public final class QueryParams { if (!this.urlMask_isCatchall) { // add a filter query on urls - String urlMaskPattern = this.urlMask.pattern(); - - // solr doesn't like slashes, backslashes or doublepoints; remove them // urlmask = ".*\\." + ft + "(\\?.*)?"; - int p; - while ((p = urlMaskPattern.indexOf(':')) >= 0) urlMaskPattern = urlMaskPattern.substring(0, p) + "." + urlMaskPattern.substring(p + 1); - while ((p = urlMaskPattern.indexOf('/')) >= 0) urlMaskPattern = urlMaskPattern.substring(0, p) + "." + urlMaskPattern.substring(p + 1); - while ((p = urlMaskPattern.indexOf('\\')) >= 0) urlMaskPattern = urlMaskPattern.substring(0, p) + "." + urlMaskPattern.substring(p + 2); - fq.append(" AND ").append(CollectionSchema.sku.getSolrFieldName() + ":/" + urlMaskPattern + "/"); + fq.append(" AND ").append(CollectionSchema.sku.getSolrFieldName() + ":/" + this.urlMaskString + "/"); } if (this.radius > 0.0d && this.lat != 0.0d && this.lon != 0.0d) { @@ -583,7 +593,7 @@ public final class QueryParams { context.append(this.zonecode).append(asterisk); context.append(ASCII.String(Word.word2hash(this.ranking.toExternalString()))).append(asterisk); context.append(Base64Order.enhancedCoder.encodeString(this.prefer.toString())).append(asterisk); - context.append(Base64Order.enhancedCoder.encodeString(this.urlMask.toString())).append(asterisk); + context.append(Base64Order.enhancedCoder.encodeString(this.urlMaskString)).append(asterisk); context.append(this.modifier.sitehash).append(asterisk); context.append(this.modifier.author).append(asterisk); context.append(this.modifier.protocol).append(asterisk); diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java index 4c7a2f2f4..c1107b4b3 100644 --- a/source/net/yacy/search/query/SearchEvent.java +++ b/source/net/yacy/search/query/SearchEvent.java @@ -893,7 +893,7 @@ public final class SearchEvent { if ( !this.query.urlMask_isCatchall ) { // check url mask - if (!iEntry.matches(this.query.urlMask)) { + if (!iEntry.matches(this.query.urlMaskPattern)) { if (log.isFine()) log.fine("dropped Node: url mask does not match"); continue pollloop; } @@ -1114,7 +1114,7 @@ public final class SearchEvent { URIMetadataNode page; mainloop: while ((page = pullOneRWI(skipDoubleDom)) != null) { - if (!this.query.urlMask_isCatchall && !page.matches(this.query.urlMask)) { + if (!this.query.urlMask_isCatchall && !page.matches(this.query.urlMaskPattern)) { if (log.isFine()) log.fine("dropped RWI: no match with urlMask"); if (page.word().local()) this.local_rwi_available.decrementAndGet(); else this.remote_rwi_available.decrementAndGet(); continue;