mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
Improved accuracy of URLs search filters : protocol, tld, host, file ext
This commit is contained in:
parent
d1c7dfd852
commit
0a120787e3
|
@ -912,7 +912,7 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
|
|||
}
|
||||
|
||||
/**
|
||||
* @return the host part of this URL, Punycode encoded for Internationalized Domain Names
|
||||
* @return the host part of this URL, Punycode encoded for Internationalized Domain Names. Can be null, for example for file URLs such as "file:///path/file.ext"
|
||||
*/
|
||||
public String getHost() {
|
||||
return this.host;
|
||||
|
@ -926,6 +926,9 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
|
|||
return orga;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the top-level domain name part of this url host name, or the empty string.
|
||||
*/
|
||||
public String getTLD() {
|
||||
if (this.host == null) return "";
|
||||
int p = this.host.lastIndexOf('.');
|
||||
|
|
|
@ -46,6 +46,7 @@ import java.util.regex.PatternSyntaxException;
|
|||
|
||||
import org.apache.lucene.util.automaton.Automata;
|
||||
import org.apache.lucene.util.automaton.Automaton;
|
||||
import org.apache.lucene.util.automaton.RegExp;
|
||||
import org.apache.solr.client.solrj.SolrQuery;
|
||||
import org.apache.solr.client.solrj.SolrQuery.SortClause;
|
||||
import org.apache.solr.common.params.DisMaxParams;
|
||||
|
@ -55,6 +56,7 @@ import net.yacy.cora.document.analysis.Classification;
|
|||
import net.yacy.cora.document.analysis.Classification.ContentDomain;
|
||||
import net.yacy.cora.document.encoding.ASCII;
|
||||
import net.yacy.cora.document.id.AnchorURL;
|
||||
import net.yacy.cora.document.id.MultiProtocolURL;
|
||||
import net.yacy.cora.federate.solr.Ranking;
|
||||
import net.yacy.cora.federate.yacy.CacheStrategy;
|
||||
import net.yacy.cora.geo.GeoLocation;
|
||||
|
@ -217,7 +219,7 @@ public final class QueryParams {
|
|||
}
|
||||
this.urlMask_isCatchall = this.urlMaskString.equals(catchall_pattern.toString());
|
||||
if (this.urlMask_isCatchall) {
|
||||
final String filter = QueryParams.buildURLFilter(modifier, tld);
|
||||
final String filter = QueryParams.buildApproximateURLFilter(modifier, tld);
|
||||
if (!QueryParams.catchall_pattern.toString().equals(filter)) {
|
||||
this.urlMaskString = filter;
|
||||
this.urlMaskAutomaton = Automata.makeString(filter);
|
||||
|
@ -277,6 +279,13 @@ public final class QueryParams {
|
|||
}
|
||||
|
||||
/**
|
||||
* Generate an URL filter from the query modifier and eventual tld, usable as a
|
||||
* first approximation for filtering, and compatible with the yacy/search
|
||||
* API.<br/>
|
||||
* For truly accurate filtering, checking constraints against parsed URLs in
|
||||
* MultiprotocolURL instances is easier and more reliable than building a complex regular
|
||||
* expression that must be both compatible with the JDK {@link Pattern} and with Lucene {@link RegExp}.
|
||||
*
|
||||
* @param modifier
|
||||
* query modifier with eventual protocol, sitehost and filetype
|
||||
* constraints. The modifier parameter itself must not be null.
|
||||
|
@ -285,7 +294,7 @@ public final class QueryParams {
|
|||
* @return an URL filter regular expression from the provided modifier and tld
|
||||
* constraints, matching anything when there are no constraints at all.
|
||||
*/
|
||||
protected static String buildURLFilter(final QueryModifier modifier, final String tld) {
|
||||
protected static String buildApproximateURLFilter(final QueryModifier modifier, final String tld) {
|
||||
final String protocolfilter = modifier.protocol == null ? ".*" : modifier.protocol;
|
||||
final String defaulthostprefix = "www";
|
||||
final String hostfilter;
|
||||
|
@ -416,6 +425,61 @@ public final class QueryParams {
|
|||
sb.append("]");
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* @param modifier
|
||||
* the query modifier with eventual constraints on protocoln, host
|
||||
* name or file extension
|
||||
* @param tld
|
||||
* an eventual top-level domain name to filter on
|
||||
* @param url
|
||||
* the url to check
|
||||
* @return the constraint that did not match ("url" when url is null,
|
||||
* "protocol", "sitehost", "tld", or "filetype"), or the empty string
|
||||
* when the url matches
|
||||
*/
|
||||
public static String matchesURL(final QueryModifier modifier, final String tld, final MultiProtocolURL url) {
|
||||
if (url == null) {
|
||||
return "url";
|
||||
}
|
||||
if (modifier != null) {
|
||||
if (modifier.protocol != null) {
|
||||
if (!modifier.protocol.equalsIgnoreCase(url.getProtocol())) {
|
||||
return "protocol";
|
||||
}
|
||||
}
|
||||
if (modifier.sitehost != null) {
|
||||
/*
|
||||
* consider to search for hosts with 'www'-prefix, if not already part of the
|
||||
* host name
|
||||
*/
|
||||
final String wwwPrefix = "www.";
|
||||
final String host;
|
||||
final String hostWithWwwPrefix;
|
||||
if (modifier.sitehost.startsWith(wwwPrefix)) {
|
||||
hostWithWwwPrefix = modifier.sitehost;
|
||||
host = modifier.sitehost.substring(wwwPrefix.length());
|
||||
} else {
|
||||
hostWithWwwPrefix = wwwPrefix + modifier.sitehost;
|
||||
host = modifier.sitehost;
|
||||
}
|
||||
if (!host.equalsIgnoreCase(url.getHost()) && !hostWithWwwPrefix.equals(url.getHost())) {
|
||||
return "sitehost";
|
||||
}
|
||||
}
|
||||
if (tld != null) {
|
||||
if (!tld.equalsIgnoreCase(url.getTLD())) {
|
||||
return "tld";
|
||||
}
|
||||
}
|
||||
if (modifier.filetype != null) {
|
||||
if (!modifier.filetype.equalsIgnoreCase(MultiProtocolURL.getFileExtension(url.getFileName()))) {
|
||||
return "filetype";
|
||||
}
|
||||
}
|
||||
}
|
||||
return "";
|
||||
}
|
||||
|
||||
/**
|
||||
* check if the given text matches with the query
|
||||
|
|
|
@ -963,6 +963,16 @@ public final class SearchEvent implements ScoreMapUpdatesListener {
|
|||
try {
|
||||
pollloop: for (URIMetadataNode iEntry: nodeList) {
|
||||
|
||||
// check url related eventual constraints (protocol, tld, sitehost, and filetype)
|
||||
final String matchingResult = QueryParams.matchesURL(this.query.modifier, this.query.tld, iEntry.url());
|
||||
if (!matchingResult.isEmpty()) {
|
||||
if (log.isFine()) {
|
||||
log.fine("dropped Node: " + matchingResult);
|
||||
}
|
||||
updateCountsOnSolrEntryToEvict(iEntry, facets, local, !incrementNavigators);
|
||||
continue pollloop;
|
||||
}
|
||||
|
||||
if ( !this.query.urlMask_isCatchall ) {
|
||||
// check url mask
|
||||
if (!iEntry.matches(this.query.urlMaskPattern)) {
|
||||
|
@ -1019,13 +1029,6 @@ public final class SearchEvent implements ScoreMapUpdatesListener {
|
|||
updateCountsOnSolrEntryToEvict(iEntry, facets, local, !incrementNavigators);
|
||||
continue pollloop;
|
||||
}
|
||||
} else {
|
||||
// filter out all domains that do not match with the site constraint
|
||||
if (iEntry.url().getHost().indexOf(this.query.modifier.sitehost) < 0) {
|
||||
if (log.isFine()) log.fine("dropped Node: sitehost");
|
||||
updateCountsOnSolrEntryToEvict(iEntry, facets, local, !incrementNavigators);
|
||||
continue pollloop;
|
||||
}
|
||||
}
|
||||
|
||||
if (this.query.modifier.language != null) {
|
||||
|
@ -1393,6 +1396,16 @@ public final class SearchEvent implements ScoreMapUpdatesListener {
|
|||
// returns from the current RWI list the best URL entry and removes this entry from the list
|
||||
URIMetadataNode page;
|
||||
mainloop: while ((page = pullOneRWI(skipDoubleDom)) != null) {
|
||||
|
||||
// check url related eventual constraints (protocol, tld, sitehost, and filetype)
|
||||
final String matchingResult = QueryParams.matchesURL(this.query.modifier, this.query.tld, page.url());
|
||||
if (!matchingResult.isEmpty()) {
|
||||
if (log.isFine()) {
|
||||
log.fine("dropped RWI: no match on " + matchingResult);
|
||||
}
|
||||
decrementCounts(page.word());
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!this.query.urlMask_isCatchall && !page.matches(this.query.urlMaskPattern)) {
|
||||
if (log.isFine()) log.fine("dropped RWI: no match with urlMask");
|
||||
|
@ -1427,14 +1440,6 @@ public final class SearchEvent implements ScoreMapUpdatesListener {
|
|||
}
|
||||
|
||||
// filter query modifiers variables (these are host, filetype, protocol, language, author, collection, dates_in_content(on,from,to,timezone) )
|
||||
// while ( protocol, host, filetype ) currently maybe incorporated in (this.query.urlMaskPattern) queryparam
|
||||
|
||||
// check modifier constraint filetype (using fileextension)
|
||||
if (this.query.modifier.filetype != null && !this.query.modifier.filetype.equals(ext)) {
|
||||
if (log.isFine()) log.fine("dropped RWI: file type constraint = " + this.query.modifier.filetype);
|
||||
decrementCounts(page.word());
|
||||
continue;
|
||||
}
|
||||
|
||||
/* check again modifier constraint (language) with the language in the full metadata,
|
||||
* that may differ from the one in the reverse word reference which is already checked in addRWIs()*/
|
||||
|
@ -1480,12 +1485,12 @@ public final class SearchEvent implements ScoreMapUpdatesListener {
|
|||
|
||||
// content control
|
||||
if (Switchboard.getSwitchboard().getConfigBool("contentcontrol.enabled", false)) {
|
||||
FilterEngine f = ContentControlFilterUpdateThread.getNetworkFilter();
|
||||
if (f != null && !f.isListed(page.url(), null)) {
|
||||
FilterEngine f = ContentControlFilterUpdateThread.getNetworkFilter();
|
||||
if (f != null && !f.isListed(page.url(), null)) {
|
||||
if (log.isFine()) log.fine("dropped RWI: url is blacklisted in contentcontrol");
|
||||
decrementCounts(page.word());
|
||||
decrementCounts(page.word());
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
final String pageurl = page.url().toNormalform(true);
|
||||
|
|
236
test/java/net/yacy/search/query/QueryParamsTest.java
Normal file
236
test/java/net/yacy/search/query/QueryParamsTest.java
Normal file
|
@ -0,0 +1,236 @@
|
|||
// QueryParamsTest.java
|
||||
// ---------------------------
|
||||
// Copyright 2017 by luccioman; https://github.com/luccioman
|
||||
//
|
||||
// This is a part of YaCy, a peer-to-peer based web search engine
|
||||
//
|
||||
// LICENSE
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
package net.yacy.search.query;
|
||||
|
||||
import java.net.MalformedURLException;
|
||||
|
||||
import org.junit.Assert;
|
||||
import org.junit.Test;
|
||||
|
||||
import net.yacy.cora.document.id.MultiProtocolURL;
|
||||
|
||||
/**
|
||||
* Unit tests for the {@link QueryParams} class.
|
||||
*/
|
||||
public class QueryParamsTest {
|
||||
|
||||
/**
|
||||
* Test URL matching with a single query constraint on top-level domain.
|
||||
* @throws MalformedURLException when a test URL is malformed. Should not happen.
|
||||
*/
|
||||
@Test
|
||||
public void testMatchesURLTLD() throws MalformedURLException {
|
||||
final String[] matchingURLs = { "http://example.org", // most basic matching example
|
||||
"http://example.org/", // normalized basic example
|
||||
"http://www.example.org/", // with www domain prefix
|
||||
"http://example.org:8080", // non default port
|
||||
"http://example.org?key=value", // empty path and query string
|
||||
"http://example.org?key=value#fragment", // empty path, query string and fragment
|
||||
"http://example.org:8080?key=value#fragment", // non default port, empty path, query string and fragment
|
||||
"http://example.org:8080/?key=value#fragment", // normalized non default port, empty path, query string and fragment
|
||||
"http://example.org#fragment", // empty path and fragment
|
||||
"ftp://example.org", // another protocol than http
|
||||
"http://example.org/index.html", // with file
|
||||
"http://example.org/path/index.html", // with path and file
|
||||
"http://example.org:8090/path/index.html", // with non default port, path and file
|
||||
"http://example.org/index.html?key=value", // with file and query string
|
||||
"http://example.org/index.html?key=value#fragment", // with file, query string and url fragment
|
||||
};
|
||||
|
||||
final String[] nonMatchingURLs = { "http://example.test", // basic non matching example
|
||||
"http://example.test/", // normalized basic example
|
||||
"http://org.example.net", // only subdomain matching
|
||||
"http://example.org.net", // only secondary-level domain matching
|
||||
"http://organization.test", // secondary-level starting like the filter
|
||||
"http://test.organic", // top-level domain starting like the filter
|
||||
"http://en.organization.test", // subdomain then secondary-level starting like the filter
|
||||
"http://example.test/path/file.org", // with file ending like the tld filter
|
||||
"http://example.test/?query=example.org", // with query parameter including the tld
|
||||
"http://example.test/#fragment.org", // with query parameter including the tld
|
||||
"file:///path/file.txt", // empty host name in file URL
|
||||
"http://127.0.0.1/index.html", // IPv4 address
|
||||
"http://[2001:db8::ff00:42:8329]/index.html" // IPv6 address
|
||||
};
|
||||
|
||||
final QueryModifier modifier = new QueryModifier(0);
|
||||
checkURLs(matchingURLs, nonMatchingURLs, modifier, "org");
|
||||
}
|
||||
|
||||
/**
|
||||
* Check matching and non matching URLs against the given query modifier and
|
||||
* eventual top-level domain name.
|
||||
*
|
||||
* @param matchingURLs
|
||||
* array of URLs expected to be accepted
|
||||
* @param nonMatchingURLs
|
||||
* array of URLs expected to be rejected
|
||||
* @param modifier
|
||||
* the query modifier
|
||||
* @param tld
|
||||
* the eventual top-level domain to filter on.
|
||||
* @throws MalformedURLException when a test URL string is malformed
|
||||
*/
|
||||
private void checkURLs(final String[] matchingURLs, final String[] nonMatchingURLs, final QueryModifier modifier, final String tld) throws MalformedURLException {
|
||||
for (final String matchingURL : matchingURLs) {
|
||||
Assert.assertEquals(matchingURL + " should match", "", QueryParams.matchesURL(modifier, tld, new MultiProtocolURL(matchingURL)));
|
||||
}
|
||||
for (final String nonMatchingURL : nonMatchingURLs) {
|
||||
Assert.assertNotEquals(nonMatchingURL + " should not match", "",
|
||||
QueryParams.matchesURL(modifier, tld, new MultiProtocolURL(nonMatchingURL)));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Test URL matching build with a single query constraint on URL scheme.
|
||||
* @throws MalformedURLException when a test URL is malformed. Should not happen.
|
||||
*/
|
||||
@Test
|
||||
public void testMatchesURLProtocol() throws MalformedURLException {
|
||||
final String[] matchingURLs = { "http://example.org/" };
|
||||
|
||||
final String[] nonMatchingURLs = { "https://example.org/",
|
||||
"ftp://www.example.test/", "smb://localhost",
|
||||
"mailto:user@example.com", "file:///tmp/path/",
|
||||
"https://example.org/index.html?query=http", // with query parameter including the protocol
|
||||
"https://example.org/index.html#http" // with fragment string including the protocol
|
||||
};
|
||||
final QueryModifier modifier = new QueryModifier(0);
|
||||
modifier.protocol = "http";
|
||||
checkURLs(matchingURLs, nonMatchingURLs, modifier, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test URL matching with a single query constraint on host name.
|
||||
* @throws MalformedURLException when a test URL is malformed. Should not happen.
|
||||
*/
|
||||
@Test
|
||||
public void testMatchesURLHostName() throws MalformedURLException {
|
||||
final String[] matchingURLs = { "http://example.org", // most basic matching example
|
||||
"http://example.org/", // normalized basic example
|
||||
"http://www.example.org/", // with www domain prefix
|
||||
"http://example.org:8080", // non default port
|
||||
"http://example.org?key=value", // empty path and query string
|
||||
"http://example.org?key=value#fragment", // empty path, query string and fragment
|
||||
"http://example.org:8080?key=value#fragment", // non default port, empty path, query string and fragment
|
||||
"http://example.org:8080/?key=value#fragment", // normalized non default port, empty path, query string and fragment
|
||||
"http://example.org#fragment", // empty path and fragment
|
||||
"ftp://example.org", // another protocol than http
|
||||
"http://example.org/index.html", // with file
|
||||
"http://example.org/path/index.html", // with path and file
|
||||
"http://example.org:8090/path/index.html", // with non default port, path and file
|
||||
"http://example.org/index.html?key=value", // with file and query string
|
||||
"http://example.org/index.html?key=value#fragment", // with file, query string and url fragment
|
||||
};
|
||||
|
||||
final String[] nonMatchingURLs = { "http://domain.test", // basic non matching example
|
||||
"http://domain.test/", // normalized basic example
|
||||
"http://fr.example.org", // domain prefix different from www
|
||||
"http://example.net", // only secondary-level domain matching
|
||||
"http://test.org", // only top-level domain matching
|
||||
"http://example.organic", // domain starting like the one of the filter
|
||||
"http://unexample.org", // domain ending like the one of the filter
|
||||
"http://example.net/index.html?query=example.org", // with query including the filtered domain
|
||||
"http://example.net/index.html#example.org", // with fragment string including the filtered domain
|
||||
"file:///path/file.txt", // empty host name in file URL
|
||||
"http://127.0.0.1/index.html", // IPv4 address
|
||||
"http://[2001:db8::ff00:42:8329]/index.html" // IPv6 address
|
||||
};
|
||||
final QueryModifier modifier = new QueryModifier(0);
|
||||
modifier.sitehost = "example.org";
|
||||
checkURLs(matchingURLs, nonMatchingURLs, modifier, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test URL matching with a single query constraint on file extension.
|
||||
* @throws MalformedURLException when a test URL is malformed. Should not happen.
|
||||
*/
|
||||
@Test
|
||||
public void testMatchesURLFileExt() throws MalformedURLException {
|
||||
final String[] matchingURLs = { "http://example.org/image.html", // most basic matching example
|
||||
"http://example.org/image.html#anchor", // with url fragment
|
||||
"http://example.org/image.html?key=value#anchor", // with query string and url fragment
|
||||
};
|
||||
|
||||
final String[] nonMatchingURLs = { "http://example.org/file.txt", // basic non matching example
|
||||
"http://example.org/file.xhtml", // extension ending like the expected one
|
||||
"http://example.org/html/example.txt", // extension found in path
|
||||
"http://example.org/resource?key=html", // extension found as query parameter value
|
||||
"http://example.org/resource#html", // extension found as url fragment
|
||||
};
|
||||
final QueryModifier modifier = new QueryModifier(0);
|
||||
modifier.filetype = "html";
|
||||
checkURLs(matchingURLs, nonMatchingURLs, modifier, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test URL matching with combined protocol and host name query modifiers.
|
||||
* @throws MalformedURLException when a test URL is malformed. Should not happen.
|
||||
*/
|
||||
@Test
|
||||
public void testBuildURLFilterProtocolAndHostName() throws MalformedURLException {
|
||||
final String[] matchingURLs = { "http://example.org", // most basic matching example
|
||||
"http://example.org/", // normalized basic example
|
||||
"http://www.example.org/", // with www domain prefix
|
||||
"http://example.org:8080", // non default port
|
||||
"http://example.org?key=value", // empty path and query string
|
||||
"http://example.org?key=value#fragment", // empty path, query string and fragment
|
||||
"http://example.org:8080?key=value#fragment", // non default port, empty path, query string and fragment
|
||||
"http://example.org:8080/?key=value#fragment", // normalized non default port, empty path, query string and fragment
|
||||
"http://example.org#fragment", // empty path and fragment
|
||||
"http://example.org/index.html", // with file
|
||||
"http://example.org/path/index.html", // with path and file
|
||||
"http://example.org:8090/path/index.html", // with non default port, path and file
|
||||
"http://example.org/index.html?key=value", // with file and query string
|
||||
"http://example.org/index.html?key=value#fragment", // with file, query string and url fragment
|
||||
};
|
||||
|
||||
final String[] nonMatchingURLs = { "ftp://domain.test", // basic non matching example
|
||||
"ftp://domain.test/", // normalized basic example
|
||||
"ftp://example.org/", // only domain matching
|
||||
"http://fr.example.org", // domain prefix different from www
|
||||
"http://example.net", // only secondary-level domain matching
|
||||
"http://test.org", // only top-level domain matching
|
||||
"http://example.organic", // domain starting like the one of the filter
|
||||
"http://unexample.org", // domain ending like the one of the filter
|
||||
"http://example.net/index.html?query=example.org", // with query including the filtered domain
|
||||
"http://example.net/index.html#example.org", // with fragment string including the filtered domain
|
||||
"http://127.0.0.1/index.html", // IPv4 address
|
||||
"http://[2001:db8::ff00:42:8329]/index.html" // IPv6 address
|
||||
};
|
||||
final QueryModifier modifier = new QueryModifier(0);
|
||||
modifier.sitehost = "example.org";
|
||||
modifier.protocol = "http";
|
||||
checkURLs(matchingURLs, nonMatchingURLs, modifier, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test URL filter build with no constraints at all
|
||||
*/
|
||||
@Test
|
||||
public void testBuilURLFilterEmpty() {
|
||||
final QueryModifier modifier = new QueryModifier(0);
|
||||
final String filter = QueryParams.buildApproximateURLFilter(modifier, null);
|
||||
|
||||
Assert.assertEquals(QueryParams.catchall_pattern.toString(), filter);
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user