mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
fix for urlmaskfilter
This commit is contained in:
parent
2636582435
commit
3d717b749a
|
@ -348,7 +348,7 @@ public class yacysearch {
|
|||
}
|
||||
|
||||
if ( !block ) {
|
||||
String urlmask = null;
|
||||
String urlmask = (post == null) ? ".*" : post.get("urlmaskfilter", ".*"); // the expression must be a subset of the java Match syntax described in http://lucene.apache.org/core/4_4_0/core/org/apache/lucene/util/automaton/RegExp.html
|
||||
String tld = null;
|
||||
String inlink = null;
|
||||
|
||||
|
@ -406,7 +406,7 @@ public class yacysearch {
|
|||
final String urlstr = querystring.substring(inurlp + 6, ftb);
|
||||
querystring = querystring.replace("inurl:" + urlstr, "");
|
||||
if ( !urlstr.isEmpty() ) {
|
||||
urlmask = urlmask == null ? ".*" + urlstr + ".*" : urlmask + urlstr + ".*";
|
||||
urlmask = urlmask == null || urlmask.equals(".*") ? ".*" + urlstr + ".*" : urlmask; // we cannot join the conditions; if an urlmask is already given then stay with that
|
||||
}
|
||||
modifier.add("inurl:" + urlstr);
|
||||
}
|
||||
|
|
|
@ -217,8 +217,11 @@ public class URIMetadataNode extends SolrDocument {
|
|||
return this.url;
|
||||
}
|
||||
|
||||
public boolean matches(Pattern matcher) {
|
||||
return matcher.matcher(this.url.toString().toLowerCase()).matches();
|
||||
public boolean matches(Pattern pattern) {
|
||||
return pattern.matcher(this.url.toNormalform(true).toLowerCase()).matches();
|
||||
//CharacterRunAutomaton automaton = new CharacterRunAutomaton(matcher);
|
||||
//boolean match = automaton.run(this.url.toNormalform(true).toLowerCase());
|
||||
//return match;
|
||||
}
|
||||
|
||||
public String dc_title() {
|
||||
|
|
|
@ -835,7 +835,6 @@ public final class Protocol {
|
|||
baos = null;
|
||||
}
|
||||
|
||||
String filter = event.query.urlMask.pattern().toString();
|
||||
parts.put("myseed", UTF8.StringBody((event.peers.mySeed() == null) ? "" : event.peers.mySeed().genSeedStr(key)));
|
||||
parts.put("count", UTF8.StringBody(Integer.toString(Math.max(10, count))));
|
||||
parts.put("time", UTF8.StringBody(Long.toString(Math.max(3000, time))));
|
||||
|
@ -845,7 +844,7 @@ public final class Protocol {
|
|||
parts.put("duetime", UTF8.StringBody("1000"));
|
||||
parts.put("urls", UTF8.StringBody(urlhashes));
|
||||
parts.put("prefer", UTF8.StringBody(event.query.prefer.pattern()));
|
||||
parts.put("filter", UTF8.StringBody(filter));
|
||||
parts.put("filter", UTF8.StringBody(event.query.urlMaskString));
|
||||
parts.put("modifier", UTF8.StringBody(event.query.modifier.toString()));
|
||||
parts.put("language", UTF8.StringBody(language));
|
||||
parts.put("sitehash", UTF8.StringBody(event.query.modifier.sitehash));
|
||||
|
|
|
@ -33,6 +33,7 @@ import java.util.LinkedHashSet;
|
|||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.SortedSet;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.regex.PatternSyntaxException;
|
||||
|
||||
|
@ -62,6 +63,8 @@ import net.yacy.search.ranking.RankingProfile;
|
|||
import net.yacy.search.schema.CollectionConfiguration;
|
||||
import net.yacy.search.schema.CollectionSchema;
|
||||
|
||||
import org.apache.lucene.util.automaton.Automata;
|
||||
import org.apache.lucene.util.automaton.Automaton;
|
||||
import org.apache.solr.client.solrj.SolrQuery;
|
||||
import org.apache.solr.client.solrj.SolrQuery.SortClause;
|
||||
import org.apache.solr.common.params.CommonParams;
|
||||
|
@ -103,7 +106,9 @@ public final class QueryParams {
|
|||
private final QueryGoal queryGoal;
|
||||
public int itemsPerPage;
|
||||
public int offset;
|
||||
public Pattern urlMask;
|
||||
public Pattern urlMaskPattern;
|
||||
public Automaton urlMaskAutomaton;
|
||||
public String urlMaskString;
|
||||
|
||||
public final Pattern prefer;
|
||||
public final String tld, inlink;
|
||||
|
@ -175,20 +180,32 @@ public final class QueryParams {
|
|||
this.itemsPerPage = Math.min((specialRights) ? 10000 : 1000, itemsPerPage);
|
||||
this.offset = Math.max(0, Math.min((specialRights) ? 10000 - this.itemsPerPage : 1000 - this.itemsPerPage, offset));
|
||||
try {
|
||||
this.urlMask = Pattern.compile(urlMask.toLowerCase());
|
||||
} catch (final PatternSyntaxException ex) {
|
||||
this.urlMaskString = urlMask;
|
||||
// solr doesn't like slashes, backslashes or colons; replace each of them with the '.' wildcard (a backslash is replaced together with the character following it) // urlmask = ".*\\." + ft + "(\\?.*)?";
|
||||
int p;
|
||||
while ((p = this.urlMaskString.indexOf(':')) >= 0) this.urlMaskString = this.urlMaskString.substring(0, p) + "." + this.urlMaskString.substring(p + 1);
|
||||
while ((p = this.urlMaskString.indexOf('/')) >= 0) this.urlMaskString = this.urlMaskString.substring(0, p) + "." + this.urlMaskString.substring(p + 1);
|
||||
while ((p = this.urlMaskString.indexOf('\\')) >= 0) this.urlMaskString = this.urlMaskString.substring(0, p) + "." + this.urlMaskString.substring(p + 2);
|
||||
this.urlMaskAutomaton = Automata.makeString(this.urlMaskString);
|
||||
this.urlMaskPattern = Pattern.compile(this.urlMaskString);
|
||||
} catch (final Throwable ex) {
|
||||
throw new IllegalArgumentException("Not a valid regular expression: " + urlMask, ex);
|
||||
}
|
||||
this.urlMask_isCatchall = this.urlMask.toString().equals(catchall_pattern.toString());
|
||||
this.urlMask_isCatchall = this.urlMaskString.equals(catchall_pattern.toString());
|
||||
if (this.urlMask_isCatchall) {
|
||||
String protocolfilter = modifier.protocol == null ? ".*" : modifier.protocol;
|
||||
String defaulthostprefix = modifier.protocol == null ? "www" : modifier.protocol;
|
||||
String hostfilter = modifier.sitehost == null && tld == null ? ".*" : modifier.sitehost == null ? ".*\\." + tld : modifier.sitehost.startsWith(defaulthostprefix + ".") ? "(" + defaulthostprefix + "\\.)?" + modifier.sitehost.substring(4) : "(" + defaulthostprefix + "\\.)?" + modifier.sitehost;
|
||||
String filefilter = modifier.filetype == null ? ".*" : ".*" + modifier.filetype + ".*";
|
||||
String filter = protocolfilter + "://" + hostfilter + "/" + filefilter;
|
||||
if (!filter.equals(".*://.*/.*")) {
|
||||
this.urlMask = Pattern.compile(filter);
|
||||
String filter = protocolfilter + "..." + hostfilter + "." + filefilter;
|
||||
if (!filter.equals(".*....*..*")) {
|
||||
Pattern r = Pattern.compile("(\\.|(\\.\\*))\\.\\*");
|
||||
Matcher m;
|
||||
while ((m = r.matcher(filter)).find()) filter = m.replaceAll(".*");
|
||||
this.urlMaskString = filter;
|
||||
this.urlMaskAutomaton = Automata.makeString(filter);
|
||||
this.urlMask_isCatchall = false;
|
||||
this.urlMaskPattern = Pattern.compile(filter);
|
||||
}
|
||||
}
|
||||
this.tld = tld;
|
||||
|
@ -503,14 +520,7 @@ public final class QueryParams {
|
|||
|
||||
if (!this.urlMask_isCatchall) {
|
||||
// add a filter query on urls
|
||||
String urlMaskPattern = this.urlMask.pattern();
|
||||
|
||||
// solr doesn't like slashes, backslashes or colons; replace each of them with the '.' wildcard (a backslash is replaced together with the character following it) // urlmask = ".*\\." + ft + "(\\?.*)?";
|
||||
int p;
|
||||
while ((p = urlMaskPattern.indexOf(':')) >= 0) urlMaskPattern = urlMaskPattern.substring(0, p) + "." + urlMaskPattern.substring(p + 1);
|
||||
while ((p = urlMaskPattern.indexOf('/')) >= 0) urlMaskPattern = urlMaskPattern.substring(0, p) + "." + urlMaskPattern.substring(p + 1);
|
||||
while ((p = urlMaskPattern.indexOf('\\')) >= 0) urlMaskPattern = urlMaskPattern.substring(0, p) + "." + urlMaskPattern.substring(p + 2);
|
||||
fq.append(" AND ").append(CollectionSchema.sku.getSolrFieldName() + ":/" + urlMaskPattern + "/");
|
||||
fq.append(" AND ").append(CollectionSchema.sku.getSolrFieldName() + ":/" + this.urlMaskString + "/");
|
||||
}
|
||||
|
||||
if (this.radius > 0.0d && this.lat != 0.0d && this.lon != 0.0d) {
|
||||
|
@ -583,7 +593,7 @@ public final class QueryParams {
|
|||
context.append(this.zonecode).append(asterisk);
|
||||
context.append(ASCII.String(Word.word2hash(this.ranking.toExternalString()))).append(asterisk);
|
||||
context.append(Base64Order.enhancedCoder.encodeString(this.prefer.toString())).append(asterisk);
|
||||
context.append(Base64Order.enhancedCoder.encodeString(this.urlMask.toString())).append(asterisk);
|
||||
context.append(Base64Order.enhancedCoder.encodeString(this.urlMaskString)).append(asterisk);
|
||||
context.append(this.modifier.sitehash).append(asterisk);
|
||||
context.append(this.modifier.author).append(asterisk);
|
||||
context.append(this.modifier.protocol).append(asterisk);
|
||||
|
|
|
@ -893,7 +893,7 @@ public final class SearchEvent {
|
|||
|
||||
if ( !this.query.urlMask_isCatchall ) {
|
||||
// check url mask
|
||||
if (!iEntry.matches(this.query.urlMask)) {
|
||||
if (!iEntry.matches(this.query.urlMaskPattern)) {
|
||||
if (log.isFine()) log.fine("dropped Node: url mask does not match");
|
||||
continue pollloop;
|
||||
}
|
||||
|
@ -1114,7 +1114,7 @@ public final class SearchEvent {
|
|||
URIMetadataNode page;
|
||||
mainloop: while ((page = pullOneRWI(skipDoubleDom)) != null) {
|
||||
|
||||
if (!this.query.urlMask_isCatchall && !page.matches(this.query.urlMask)) {
|
||||
if (!this.query.urlMask_isCatchall && !page.matches(this.query.urlMaskPattern)) {
|
||||
if (log.isFine()) log.fine("dropped RWI: no match with urlMask");
|
||||
if (page.word().local()) this.local_rwi_available.decrementAndGet(); else this.remote_rwi_available.decrementAndGet();
|
||||
continue;
|
||||
|
|
Loading…
Reference in New Issue
Block a user