Improved accuracy of URLs search filters : protocol, tld, host, file ext

This commit is contained in:
luccioman 2017-12-01 11:19:31 +01:00
parent d1c7dfd852
commit 0a120787e3
4 changed files with 330 additions and 22 deletions

View File

@ -912,7 +912,7 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
}
/**
* @return the host part of this URL, Punycode encoded for Internationalized Domain Names
* @return the host part of this URL, Punycode encoded for Internationalized Domain Names. Can be null, for example for file URLs such as "file:///path/file.ext"
*/
public String getHost() {
return this.host;
@ -926,6 +926,9 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
return orga;
}
/**
* @return the top-level domain name part of this url host name, or the empty string.
*/
public String getTLD() {
if (this.host == null) return "";
int p = this.host.lastIndexOf('.');

View File

@ -46,6 +46,7 @@ import java.util.regex.PatternSyntaxException;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.RegExp;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrQuery.SortClause;
import org.apache.solr.common.params.DisMaxParams;
@ -55,6 +56,7 @@ import net.yacy.cora.document.analysis.Classification;
import net.yacy.cora.document.analysis.Classification.ContentDomain;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.solr.Ranking;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.geo.GeoLocation;
@ -217,7 +219,7 @@ public final class QueryParams {
}
this.urlMask_isCatchall = this.urlMaskString.equals(catchall_pattern.toString());
if (this.urlMask_isCatchall) {
final String filter = QueryParams.buildURLFilter(modifier, tld);
final String filter = QueryParams.buildApproximateURLFilter(modifier, tld);
if (!QueryParams.catchall_pattern.toString().equals(filter)) {
this.urlMaskString = filter;
this.urlMaskAutomaton = Automata.makeString(filter);
@ -277,6 +279,13 @@ public final class QueryParams {
}
/**
* Generate an URL filter from the query modifier and eventual tld, usable as a
* first approximation for filtering, and compatible with the yacy/search
* API.<br/>
* For truly accurate filtering, checking constraints against parsed URLs in
* MultiProtocolURL instances is easier and more reliable than building a complex regular
* expression that must be both compatible with the JDK {@link Pattern} and with Lucene {@link RegExp}.
*
* @param modifier
* query modifier with eventual protocol, sitehost and filetype
* constraints. The modifier parameter itself must not be null.
@ -285,7 +294,7 @@ public final class QueryParams {
* @return an URL filter regular expression from the provided modifier and tld
* constraints, matching anything when there are no constraints at all.
*/
protected static String buildURLFilter(final QueryModifier modifier, final String tld) {
protected static String buildApproximateURLFilter(final QueryModifier modifier, final String tld) {
final String protocolfilter = modifier.protocol == null ? ".*" : modifier.protocol;
final String defaulthostprefix = "www";
final String hostfilter;
@ -416,6 +425,61 @@ public final class QueryParams {
sb.append("]");
return sb.toString();
}
/**
 * Check the given URL against the eventual protocol, site host, top-level
 * domain and file extension constraints.
 *
 * @param modifier
 *            the query modifier with eventual constraints on protocol, host
 *            name or file extension. May be null (no modifier constraints).
 * @param tld
 *            an eventual top-level domain name to filter on. May be null.
 *            This constraint is independent from the modifier and is checked
 *            even when modifier is null.
 * @param url
 *            the url to check
 * @return the constraint that did not match ("url" when url is null,
 *         "protocol", "sitehost", "tld", or "filetype"), or the empty string
 *         when the url matches
 */
public static String matchesURL(final QueryModifier modifier, final String tld, final MultiProtocolURL url) {
    if (url == null) {
        return "url";
    }
    if (modifier != null) {
        if (modifier.protocol != null && !modifier.protocol.equalsIgnoreCase(url.getProtocol())) {
            return "protocol";
        }
        if (modifier.sitehost != null) {
            /*
             * consider to search for hosts with 'www'-prefix, if not already part of the
             * host name
             */
            final String wwwPrefix = "www.";
            final String host;
            final String hostWithWwwPrefix;
            if (modifier.sitehost.startsWith(wwwPrefix)) {
                hostWithWwwPrefix = modifier.sitehost;
                host = modifier.sitehost.substring(wwwPrefix.length());
            } else {
                hostWithWwwPrefix = wwwPrefix + modifier.sitehost;
                host = modifier.sitehost;
            }
            /* host names are case-insensitive : compare both forms with equalsIgnoreCase */
            if (!host.equalsIgnoreCase(url.getHost()) && !hostWithWwwPrefix.equalsIgnoreCase(url.getHost())) {
                return "sitehost";
            }
        }
    }
    /* the tld constraint does not depend on the modifier : check it even when modifier is null */
    if (tld != null && !tld.equalsIgnoreCase(url.getTLD())) {
        return "tld";
    }
    if (modifier != null && modifier.filetype != null
            && !modifier.filetype.equalsIgnoreCase(MultiProtocolURL.getFileExtension(url.getFileName()))) {
        return "filetype";
    }
    return "";
}
/**
* check if the given text matches with the query

View File

@ -963,6 +963,16 @@ public final class SearchEvent implements ScoreMapUpdatesListener {
try {
pollloop: for (URIMetadataNode iEntry: nodeList) {
// check url related eventual constraints (protocol, tld, sitehost, and filetype)
final String matchingResult = QueryParams.matchesURL(this.query.modifier, this.query.tld, iEntry.url());
if (!matchingResult.isEmpty()) {
if (log.isFine()) {
log.fine("dropped Node: " + matchingResult);
}
updateCountsOnSolrEntryToEvict(iEntry, facets, local, !incrementNavigators);
continue pollloop;
}
if ( !this.query.urlMask_isCatchall ) {
// check url mask
if (!iEntry.matches(this.query.urlMaskPattern)) {
@ -1019,13 +1029,6 @@ public final class SearchEvent implements ScoreMapUpdatesListener {
updateCountsOnSolrEntryToEvict(iEntry, facets, local, !incrementNavigators);
continue pollloop;
}
} else {
// filter out all domains that do not match with the site constraint
if (iEntry.url().getHost().indexOf(this.query.modifier.sitehost) < 0) {
if (log.isFine()) log.fine("dropped Node: sitehost");
updateCountsOnSolrEntryToEvict(iEntry, facets, local, !incrementNavigators);
continue pollloop;
}
}
if (this.query.modifier.language != null) {
@ -1393,6 +1396,16 @@ public final class SearchEvent implements ScoreMapUpdatesListener {
// returns from the current RWI list the best URL entry and removes this entry from the list
URIMetadataNode page;
mainloop: while ((page = pullOneRWI(skipDoubleDom)) != null) {
// check url related eventual constraints (protocol, tld, sitehost, and filetype)
final String matchingResult = QueryParams.matchesURL(this.query.modifier, this.query.tld, page.url());
if (!matchingResult.isEmpty()) {
if (log.isFine()) {
log.fine("dropped RWI: no match on " + matchingResult);
}
decrementCounts(page.word());
continue;
}
if (!this.query.urlMask_isCatchall && !page.matches(this.query.urlMaskPattern)) {
if (log.isFine()) log.fine("dropped RWI: no match with urlMask");
@ -1427,14 +1440,6 @@ public final class SearchEvent implements ScoreMapUpdatesListener {
}
// filter query modifiers variables (these are host, filetype, protocol, language, author, collection, dates_in_content(on,from,to,timezone) )
// while ( protocol, host, filetype ) currently maybe incorporated in (this.query.urlMaskPattern) queryparam
// check modifier constraint filetype (using fileextension)
if (this.query.modifier.filetype != null && !this.query.modifier.filetype.equals(ext)) {
if (log.isFine()) log.fine("dropped RWI: file type constraint = " + this.query.modifier.filetype);
decrementCounts(page.word());
continue;
}
/* check again modifier constraint (language) with the language in the full metadata,
* that may differ from the one in the reverse word reference which is already checked in addRWIs()*/
@ -1480,12 +1485,12 @@ public final class SearchEvent implements ScoreMapUpdatesListener {
// content control
if (Switchboard.getSwitchboard().getConfigBool("contentcontrol.enabled", false)) {
FilterEngine f = ContentControlFilterUpdateThread.getNetworkFilter();
if (f != null && !f.isListed(page.url(), null)) {
FilterEngine f = ContentControlFilterUpdateThread.getNetworkFilter();
if (f != null && !f.isListed(page.url(), null)) {
if (log.isFine()) log.fine("dropped RWI: url is blacklisted in contentcontrol");
decrementCounts(page.word());
decrementCounts(page.word());
continue;
}
}
}
final String pageurl = page.url().toNormalform(true);

View File

@ -0,0 +1,236 @@
// QueryParamsTest.java
// ---------------------------
// Copyright 2017 by luccioman; https://github.com/luccioman
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.search.query;
import java.net.MalformedURLException;
import org.junit.Assert;
import org.junit.Test;
import net.yacy.cora.document.id.MultiProtocolURL;
/**
 * Unit tests for the {@link QueryParams} class.
 */
public class QueryParamsTest {

    /**
     * Test URL matching with a single query constraint on top-level domain.
     * @throws MalformedURLException when a test URL is malformed. Should not happen.
     */
    @Test
    public void testMatchesURLTLD() throws MalformedURLException {
        final String[] matchingURLs = { "http://example.org", // most basic matching example
                "http://example.org/", // normalized basic example
                "http://www.example.org/", // with www domain prefix
                "http://example.org:8080", // non default port
                "http://example.org?key=value", // empty path and query string
                "http://example.org?key=value#fragment", // empty path, query string and fragment
                "http://example.org:8080?key=value#fragment", // non default port, empty path, query string and fragment
                "http://example.org:8080/?key=value#fragment", // normalized non default port, empty path, query string and fragment
                "http://example.org#fragment", // empty path and fragment
                "ftp://example.org", // another protocol than http
                "http://example.org/index.html", // with file
                "http://example.org/path/index.html", // with path and file
                "http://example.org:8090/path/index.html", // with non default port, path and file
                "http://example.org/index.html?key=value", // with file and query string
                "http://example.org/index.html?key=value#fragment", // with file, query string and url fragment
        };
        final String[] nonMatchingURLs = { "http://example.test", // basic non matching example
                "http://example.test/", // normalized basic example
                "http://org.example.net", // only subdomain matching
                "http://example.org.net", // only secondary-level domain matching
                "http://organization.test", // secondary-level starting like the filter
                "http://test.organic", // top-level domain starting like the filter
                "http://en.organization.test", // subdomain then secondary-level starting like the filter
                "http://example.test/path/file.org", // with file ending like the tld filter
                "http://example.test/?query=example.org", // with query parameter including the tld
                "http://example.test/#fragment.org", // with query parameter including the tld
                "file:///path/file.txt", // empty host name in file URL
                "http://127.0.0.1/index.html", // IPv4 address
                "http://[2001:db8::ff00:42:8329]/index.html" // IPv6 address
        };
        final QueryModifier modifier = new QueryModifier(0);
        checkURLs(matchingURLs, nonMatchingURLs, modifier, "org");
    }

    /**
     * Check matching and non matching URLs against the given query modifier and
     * eventual top-level domain name.
     *
     * @param matchingURLs
     *            array of URLs expected to be accepted
     * @param nonMatchingURLs
     *            array of URLs expected to be rejected
     * @param modifier
     *            the query modifier
     * @param tld
     *            the eventual top-level domain to filter on.
     * @throws MalformedURLException when a test URL string is malformed
     */
    private void checkURLs(final String[] matchingURLs, final String[] nonMatchingURLs, final QueryModifier modifier, final String tld) throws MalformedURLException {
        for (final String matchingURL : matchingURLs) {
            Assert.assertEquals(matchingURL + " should match", "", QueryParams.matchesURL(modifier, tld, new MultiProtocolURL(matchingURL)));
        }
        for (final String nonMatchingURL : nonMatchingURLs) {
            Assert.assertNotEquals(nonMatchingURL + " should not match", "",
                    QueryParams.matchesURL(modifier, tld, new MultiProtocolURL(nonMatchingURL)));
        }
    }

    /**
     * Test URL matching with a single query constraint on URL scheme.
     * @throws MalformedURLException when a test URL is malformed. Should not happen.
     */
    @Test
    public void testMatchesURLProtocol() throws MalformedURLException {
        final String[] matchingURLs = { "http://example.org/" };
        final String[] nonMatchingURLs = { "https://example.org/",
                "ftp://www.example.test/", "smb://localhost",
                "mailto:user@example.com", "file:///tmp/path/",
                "https://example.org/index.html?query=http", // with query parameter including the protocol
                "https://example.org/index.html#http" // with fragment string including the protocol
        };
        final QueryModifier modifier = new QueryModifier(0);
        modifier.protocol = "http";
        checkURLs(matchingURLs, nonMatchingURLs, modifier, null);
    }

    /**
     * Test URL matching with a single query constraint on host name.
     * @throws MalformedURLException when a test URL is malformed. Should not happen.
     */
    @Test
    public void testMatchesURLHostName() throws MalformedURLException {
        final String[] matchingURLs = { "http://example.org", // most basic matching example
                "http://example.org/", // normalized basic example
                "http://www.example.org/", // with www domain prefix
                "http://example.org:8080", // non default port
                "http://example.org?key=value", // empty path and query string
                "http://example.org?key=value#fragment", // empty path, query string and fragment
                "http://example.org:8080?key=value#fragment", // non default port, empty path, query string and fragment
                "http://example.org:8080/?key=value#fragment", // normalized non default port, empty path, query string and fragment
                "http://example.org#fragment", // empty path and fragment
                "ftp://example.org", // another protocol than http
                "http://example.org/index.html", // with file
                "http://example.org/path/index.html", // with path and file
                "http://example.org:8090/path/index.html", // with non default port, path and file
                "http://example.org/index.html?key=value", // with file and query string
                "http://example.org/index.html?key=value#fragment", // with file, query string and url fragment
        };
        final String[] nonMatchingURLs = { "http://domain.test", // basic non matching example
                "http://domain.test/", // normalized basic example
                "http://fr.example.org", // domain prefix different from www
                "http://example.net", // only secondary-level domain matching
                "http://test.org", // only top-level domain matching
                "http://example.organic", // domain starting like the one of the filter
                "http://unexample.org", // domain ending like the one of the filter
                "http://example.net/index.html?query=example.org", // with query including the filtered domain
                "http://example.net/index.html#example.org", // with fragment string including the filtered domain
                "file:///path/file.txt", // empty host name in file URL
                "http://127.0.0.1/index.html", // IPv4 address
                "http://[2001:db8::ff00:42:8329]/index.html" // IPv6 address
        };
        final QueryModifier modifier = new QueryModifier(0);
        modifier.sitehost = "example.org";
        checkURLs(matchingURLs, nonMatchingURLs, modifier, null);
    }

    /**
     * Test URL matching with a single query constraint on file extension.
     * @throws MalformedURLException when a test URL is malformed. Should not happen.
     */
    @Test
    public void testMatchesURLFileExt() throws MalformedURLException {
        final String[] matchingURLs = { "http://example.org/image.html", // most basic matching example
                "http://example.org/image.html#anchor", // with url fragment
                "http://example.org/image.html?key=value#anchor", // with query string and url fragment
        };
        final String[] nonMatchingURLs = { "http://example.org/file.txt", // basic non matching example
                "http://example.org/file.xhtml", // extension ending like the expected one
                "http://example.org/html/example.txt", // extension found in path
                "http://example.org/resource?key=html", // extension found as query parameter value
                "http://example.org/resource#html", // extension found as url fragment
        };
        final QueryModifier modifier = new QueryModifier(0);
        modifier.filetype = "html";
        checkURLs(matchingURLs, nonMatchingURLs, modifier, null);
    }

    /**
     * Test URL matching with combined protocol and host name query modifiers.
     * @throws MalformedURLException when a test URL is malformed. Should not happen.
     */
    @Test
    public void testMatchesURLProtocolAndHostName() throws MalformedURLException {
        final String[] matchingURLs = { "http://example.org", // most basic matching example
                "http://example.org/", // normalized basic example
                "http://www.example.org/", // with www domain prefix
                "http://example.org:8080", // non default port
                "http://example.org?key=value", // empty path and query string
                "http://example.org?key=value#fragment", // empty path, query string and fragment
                "http://example.org:8080?key=value#fragment", // non default port, empty path, query string and fragment
                "http://example.org:8080/?key=value#fragment", // normalized non default port, empty path, query string and fragment
                "http://example.org#fragment", // empty path and fragment
                "http://example.org/index.html", // with file
                "http://example.org/path/index.html", // with path and file
                "http://example.org:8090/path/index.html", // with non default port, path and file
                "http://example.org/index.html?key=value", // with file and query string
                "http://example.org/index.html?key=value#fragment", // with file, query string and url fragment
        };
        final String[] nonMatchingURLs = { "ftp://domain.test", // basic non matching example
                "ftp://domain.test/", // normalized basic example
                "ftp://example.org/", // only domain matching
                "http://fr.example.org", // domain prefix different from www
                "http://example.net", // only secondary-level domain matching
                "http://test.org", // only top-level domain matching
                "http://example.organic", // domain starting like the one of the filter
                "http://unexample.org", // domain ending like the one of the filter
                "http://example.net/index.html?query=example.org", // with query including the filtered domain
                "http://example.net/index.html#example.org", // with fragment string including the filtered domain
                "http://127.0.0.1/index.html", // IPv4 address
                "http://[2001:db8::ff00:42:8329]/index.html" // IPv6 address
        };
        final QueryModifier modifier = new QueryModifier(0);
        modifier.sitehost = "example.org";
        modifier.protocol = "http";
        checkURLs(matchingURLs, nonMatchingURLs, modifier, null);
    }

    /**
     * Test URL filter build with no constraints at all.
     */
    @Test
    public void testBuildURLFilterEmpty() {
        final QueryModifier modifier = new QueryModifier(0);
        final String filter = QueryParams.buildApproximateURLFilter(modifier, null);
        Assert.assertEquals(QueryParams.catchall_pattern.toString(), filter);
    }
}