From 0a120787e3a6ece55d38418d5adb35093c358102 Mon Sep 17 00:00:00 2001 From: luccioman Date: Fri, 1 Dec 2017 11:19:31 +0100 Subject: [PATCH] Improved accuracy of URLs search filters : protocol, tld, host, file ext --- .../cora/document/id/MultiProtocolURL.java | 5 +- source/net/yacy/search/query/QueryParams.java | 68 ++++- source/net/yacy/search/query/SearchEvent.java | 43 ++-- .../yacy/search/query/QueryParamsTest.java | 236 ++++++++++++++++++ 4 files changed, 330 insertions(+), 22 deletions(-) create mode 100644 test/java/net/yacy/search/query/QueryParamsTest.java diff --git a/source/net/yacy/cora/document/id/MultiProtocolURL.java b/source/net/yacy/cora/document/id/MultiProtocolURL.java index a0b428658..3eac3b22f 100644 --- a/source/net/yacy/cora/document/id/MultiProtocolURL.java +++ b/source/net/yacy/cora/document/id/MultiProtocolURL.java @@ -912,7 +912,7 @@ public class MultiProtocolURL implements Serializable, Comparable + * For truly accurate filtering, checking constraints against parsed URLs in + * MultiprotocolURL instances is easier and more reliable than building a complex regular + * expression that must be both compatible with the JDK {@link Pattern} and with Lucene {@link RegExp}. + * * @param modifier * query modifier with eventual protocol, sitehost and filetype * constraints. The modifier parameter itselft must not be null. @@ -285,7 +294,7 @@ public final class QueryParams { * @return an URL filter regular expression from the provided modifier and tld * constraints, matching anything when there are no constraints at all. */ - protected static String buildURLFilter(final QueryModifier modifier, final String tld) { + protected static String buildApproximateURLFilter(final QueryModifier modifier, final String tld) { final String protocolfilter = modifier.protocol == null ? 
".*" : modifier.protocol; final String defaulthostprefix = "www"; final String hostfilter; @@ -416,6 +425,61 @@ public final class QueryParams { sb.append("]"); return sb.toString(); } + + /** + * Check a URL against the protocol, host name, top-level domain and file + * extension constraints of a search query. + * + * @param modifier the query modifier with eventual constraints on protocol, + * host name or file extension. May be null. + * @param tld an eventual top-level domain name to filter on. May be null. + * @param url the url to check + * @return the constraint that did not match ("url" when url is null, + * "protocol", "sitehost", "tld", or "filetype"), or the empty string + * when the url matches + */ + public static String matchesURL(final QueryModifier modifier, final String tld, final MultiProtocolURL url) { + if (url == null) { + return "url"; + } + if (modifier != null) { + if (modifier.protocol != null) { + if (!modifier.protocol.equalsIgnoreCase(url.getProtocol())) { + return "protocol"; + } + } + if (modifier.sitehost != null) { + /* + * also accept the host with a 'www.' prefix when it is not already part of the + * filter; host names are case-insensitive, so both forms ignore case + */ + final String wwwPrefix = "www."; + final String host; + final String hostWithWwwPrefix; + if (modifier.sitehost.startsWith(wwwPrefix)) { + hostWithWwwPrefix = modifier.sitehost; + host = modifier.sitehost.substring(wwwPrefix.length()); + } else { + hostWithWwwPrefix = wwwPrefix + modifier.sitehost; + host = modifier.sitehost; + } + if (!host.equalsIgnoreCase(url.getHost()) && !hostWithWwwPrefix.equalsIgnoreCase(url.getHost())) { + return "sitehost"; + } + } + } + if (tld != null) { + if (!tld.equalsIgnoreCase(url.getTLD())) { + return "tld"; + } + } + if (modifier != null && modifier.filetype != null) { + if (!modifier.filetype.equalsIgnoreCase(MultiProtocolURL.getFileExtension(url.getFileName()))) { + return "filetype"; + } + } + return ""; + } /** * check if the given text matches with the query diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java index f27724d65..167bf9ece 100644 --- 
a/source/net/yacy/search/query/SearchEvent.java +++ b/source/net/yacy/search/query/SearchEvent.java @@ -963,6 +963,16 @@ public final class SearchEvent implements ScoreMapUpdatesListener { try { pollloop: for (URIMetadataNode iEntry: nodeList) { + // check url related eventual constraints (protocol, tld, sitehost, and filetype) + final String matchingResult = QueryParams.matchesURL(this.query.modifier, this.query.tld, iEntry.url()); + if (!matchingResult.isEmpty()) { + if (log.isFine()) { + log.fine("dropped Node: " + matchingResult); + } + updateCountsOnSolrEntryToEvict(iEntry, facets, local, !incrementNavigators); + continue pollloop; + } + if ( !this.query.urlMask_isCatchall ) { // check url mask if (!iEntry.matches(this.query.urlMaskPattern)) { @@ -1019,13 +1029,6 @@ public final class SearchEvent implements ScoreMapUpdatesListener { updateCountsOnSolrEntryToEvict(iEntry, facets, local, !incrementNavigators); continue pollloop; } - } else { - // filter out all domains that do not match with the site constraint - if (iEntry.url().getHost().indexOf(this.query.modifier.sitehost) < 0) { - if (log.isFine()) log.fine("dropped Node: sitehost"); - updateCountsOnSolrEntryToEvict(iEntry, facets, local, !incrementNavigators); - continue pollloop; - } } if (this.query.modifier.language != null) { @@ -1393,6 +1396,16 @@ public final class SearchEvent implements ScoreMapUpdatesListener { // returns from the current RWI list the best URL entry and removes this entry from the list URIMetadataNode page; mainloop: while ((page = pullOneRWI(skipDoubleDom)) != null) { + + // check url related eventual constraints (protocol, tld, sitehost, and filetype) + final String matchingResult = QueryParams.matchesURL(this.query.modifier, this.query.tld, page.url()); + if (!matchingResult.isEmpty()) { + if (log.isFine()) { + log.fine("dropped RWI: no match on " + matchingResult); + } + decrementCounts(page.word()); + continue; + } if (!this.query.urlMask_isCatchall && 
!page.matches(this.query.urlMaskPattern)) { if (log.isFine()) log.fine("dropped RWI: no match with urlMask"); @@ -1427,14 +1440,6 @@ public final class SearchEvent implements ScoreMapUpdatesListener { } // filter query modifiers variables (these are host, filetype, protocol, language, author, collection, dates_in_content(on,from,to,timezone) ) - // while ( protocol, host, filetype ) currently maybe incorporated in (this.query.urlMaskPattern) queryparam - - // check modifier constraint filetype (using fileextension) - if (this.query.modifier.filetype != null && !this.query.modifier.filetype.equals(ext)) { - if (log.isFine()) log.fine("dropped RWI: file type constraint = " + this.query.modifier.filetype); - decrementCounts(page.word()); - continue; - } /* check again modifier constraint (language) with the language in the full metadata, * that may differ from the one in the reverse word reference which is already checked in addRWIs()*/ @@ -1480,12 +1485,12 @@ public final class SearchEvent implements ScoreMapUpdatesListener { // content control if (Switchboard.getSwitchboard().getConfigBool("contentcontrol.enabled", false)) { - FilterEngine f = ContentControlFilterUpdateThread.getNetworkFilter(); - if (f != null && !f.isListed(page.url(), null)) { + FilterEngine f = ContentControlFilterUpdateThread.getNetworkFilter(); + if (f != null && !f.isListed(page.url(), null)) { if (log.isFine()) log.fine("dropped RWI: url is blacklisted in contentcontrol"); - decrementCounts(page.word()); + decrementCounts(page.word()); continue; - } + } } final String pageurl = page.url().toNormalform(true); diff --git a/test/java/net/yacy/search/query/QueryParamsTest.java b/test/java/net/yacy/search/query/QueryParamsTest.java new file mode 100644 index 000000000..8cd2a250d --- /dev/null +++ b/test/java/net/yacy/search/query/QueryParamsTest.java @@ -0,0 +1,236 @@ +// QueryParamsTest.java +// --------------------------- +// Copyright 2017 by luccioman; https://github.com/luccioman +// +// 
This is a part of YaCy, a peer-to-peer based web search engine +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +package net.yacy.search.query; + +import java.net.MalformedURLException; + +import org.junit.Assert; +import org.junit.Test; + +import net.yacy.cora.document.id.MultiProtocolURL; + +/** + * Unit tests for the {@link QueryParams} class. + */ +public class QueryParamsTest { + + /** + * Test URL matching with a single query constraint on top-level domain. + * @throws MalformedURLException when a test URL is malformed. Should not happen. 
+ */ + @Test + public void testMatchesURLTLD() throws MalformedURLException { + final String[] matchingURLs = { "http://example.org", // most basic matching example + "http://example.org/", // normalized basic example + "http://www.example.org/", // with www domain prefix + "http://example.org:8080", // non default port + "http://example.org?key=value", // empty path and query string + "http://example.org?key=value#fragment", // empty path, query string and fragment + "http://example.org:8080?key=value#fragment", // non default port, empty path, query string and fragment + "http://example.org:8080/?key=value#fragment", // normalized non default port, empty path, query string and fragment + "http://example.org#fragment", // empty path and fragment + "ftp://example.org", // another protocol than http + "http://example.org/index.html", // with file + "http://example.org/path/index.html", // with path and file + "http://example.org:8090/path/index.html", // with non default port, path and file + "http://example.org/index.html?key=value", // with file and query string + "http://example.org/index.html?key=value#fragment", // with file, query string and url fragment + }; + + final String[] nonMatchingURLs = { "http://example.test", // basic non matching example + "http://example.test/", // normalized basic example + "http://org.example.net", // only subdomain matching + "http://example.org.net", // only secondary-level domain matching + "http://organization.test", // secondary-level starting like the filter + "http://test.organic", // top-level domain starting like the filter + "http://en.organization.test", // subdomain then secondary-level starting like the filter + "http://example.test/path/file.org", // with file ending like the tld filter + "http://example.test/?query=example.org", // with query parameter including the tld + "http://example.test/#fragment.org", // with fragment including the tld + "file:///path/file.txt", // empty host name in file URL + 
"http://127.0.0.1/index.html", // IPv4 address + "http://[2001:db8::ff00:42:8329]/index.html" // IPv6 address + }; + + final QueryModifier modifier = new QueryModifier(0); + checkURLs(matchingURLs, nonMatchingURLs, modifier, "org"); + } + + /** + * Check matching and non matching URLs against the given query modifier and + * eventual top-level domain name. + * + * @param matchingURLs + * array of URLs expected to be accepted + * @param nonMatchingURLs + * array of URLs expected to be rejected + * @param modifier + * the query modifier + * @param tld + * the eventual top-level domain to filter on. + * @throws MalformedURLException when a test URL string is malformed + */ + private void checkURLs(final String[] matchingURLs, final String[] nonMatchingURLs, final QueryModifier modifier, final String tld) throws MalformedURLException { + for (final String matchingURL : matchingURLs) { + Assert.assertEquals(matchingURL + " should match", "", QueryParams.matchesURL(modifier, tld, new MultiProtocolURL(matchingURL))); + } + for (final String nonMatchingURL : nonMatchingURLs) { + Assert.assertNotEquals(nonMatchingURL + " should not match", "", + QueryParams.matchesURL(modifier, tld, new MultiProtocolURL(nonMatchingURL))); + } + } + + /** + * Test URL matching with a single query constraint on URL scheme. + * @throws MalformedURLException when a test URL is malformed. Should not happen. 
+ */ + @Test + public void testMatchesURLProtocol() throws MalformedURLException { + final String[] matchingURLs = { "http://example.org/" }; + + final String[] nonMatchingURLs = { "https://example.org/", + "ftp://www.example.test/", "smb://localhost", + "mailto:user@example.com", "file:///tmp/path/", + "https://example.org/index.html?query=http", // with query parameter including the protocol + "https://example.org/index.html#http" // with fragment string including the protocol + }; + final QueryModifier modifier = new QueryModifier(0); + modifier.protocol = "http"; + checkURLs(matchingURLs, nonMatchingURLs, modifier, null); + } + + /** + * Test URL matching with a single query constraint on host name. + * @throws MalformedURLException when a test URL is malformed. Should not happen. + */ + @Test + public void testMatchesURLHostName() throws MalformedURLException { + final String[] matchingURLs = { "http://example.org", // most basic matching example + "http://example.org/", // normalized basic example + "http://www.example.org/", // with www domain prefix + "http://example.org:8080", // non default port + "http://example.org?key=value", // empty path and query string + "http://example.org?key=value#fragment", // empty path, query string and fragment + "http://example.org:8080?key=value#fragment", // non default port, empty path, query string and fragment + "http://example.org:8080/?key=value#fragment", // normalized non default port, empty path, query string and fragment + "http://example.org#fragment", // empty path and fragment + "ftp://example.org", // another protocol than http + "http://example.org/index.html", // with file + "http://example.org/path/index.html", // with path and file + "http://example.org:8090/path/index.html", // with non default port, path and file + "http://example.org/index.html?key=value", // with file and query string + "http://example.org/index.html?key=value#fragment", // with file, query string and url fragment + }; + + final 
String[] nonMatchingURLs = { "http://domain.test", // basic non matching example + "http://domain.test/", // normalized basic example + "http://fr.example.org", // domain prefix different from www + "http://example.net", // only secondary-level domain matching + "http://test.org", // only top-level domain matching + "http://example.organic", // domain starting like the one of the filter + "http://unexample.org", // domain ending like the one of the filter + "http://example.net/index.html?query=example.org", // with query including the filtered domain + "http://example.net/index.html#example.org", // with fragment string including the filtered domain + "file:///path/file.txt", // empty host name in file URL + "http://127.0.0.1/index.html", // IPv4 address + "http://[2001:db8::ff00:42:8329]/index.html" // IPv6 address + }; + final QueryModifier modifier = new QueryModifier(0); + modifier.sitehost = "example.org"; + checkURLs(matchingURLs, nonMatchingURLs, modifier, null); + } + + /** + * Test URL matching with a single query constraint on file extension. + * @throws MalformedURLException when a test URL is malformed. Should not happen. 
+ */ + @Test + public void testMatchesURLFileExt() throws MalformedURLException { + final String[] matchingURLs = { "http://example.org/image.html", // most basic matching example + "http://example.org/image.html#anchor", // with url fragment + "http://example.org/image.html?key=value#anchor", // with query string and url fragment + }; + + final String[] nonMatchingURLs = { "http://example.org/file.txt", // basic non matching example + "http://example.org/file.xhtml", // extension ending like the expected one + "http://example.org/html/example.txt", // extension found in path + "http://example.org/resource?key=html", // extension found as query parameter value + "http://example.org/resource#html", // extension found as url fragment + }; + final QueryModifier modifier = new QueryModifier(0); + modifier.filetype = "html"; + checkURLs(matchingURLs, nonMatchingURLs, modifier, null); + } + + /** + * Test URL matching with combined protocol and host name query modifiers. + * @throws MalformedURLException when a test URL is malformed. Should not happen. 
+ */ + @Test + public void testBuildURLFilterProtocolAndHostName() throws MalformedURLException { + final String[] matchingURLs = { "http://example.org", // most basic matching example + "http://example.org/", // normalized basic example + "http://www.example.org/", // with www domain prefix + "http://example.org:8080", // non default port + "http://example.org?key=value", // empty path and query string + "http://example.org?key=value#fragment", // empty path, query string and fragment + "http://example.org:8080?key=value#fragment", // non default port, empty path, query string and fragment + "http://example.org:8080/?key=value#fragment", // normalized non default port, empty path, query string and fragment + "http://example.org#fragment", // empty path and fragment + "http://example.org/index.html", // with file + "http://example.org/path/index.html", // with path and file + "http://example.org:8090/path/index.html", // with non default port, path and file + "http://example.org/index.html?key=value", // with file and query string + "http://example.org/index.html?key=value#fragment", // with file, query string and url fragment + }; + + final String[] nonMatchingURLs = { "ftp://domain.test", // basic non matching example + "ftp://domain.test/", // normalized basic example + "ftp://example.org/", // only domain matching + "http://fr.example.org", // domain prefix different from www + "http://example.net", // only secondary-level domain matching + "http://test.org", // only top-level domain matching + "http://example.organic", // domain starting like the one of the filter + "http://unexample.org", // domain ending like the one of the filter + "http://example.net/index.html?query=example.org", // with query including the filtered domain + "http://example.net/index.html#example.org", // with fragment string including the filtered domain + "http://127.0.0.1/index.html", // IPv4 address + "http://[2001:db8::ff00:42:8329]/index.html" // IPv6 address + }; + final 
QueryModifier modifier = new QueryModifier(0); + modifier.sitehost = "example.org"; + modifier.protocol = "http"; + checkURLs(matchingURLs, nonMatchingURLs, modifier, null); + } + + /** + * Test URL filter build with no constraints at all + */ + @Test + public void testBuilURLFilterEmpty() { + final QueryModifier modifier = new QueryModifier(0); + final String filter = QueryParams.buildApproximateURLFilter(modifier, null); + + Assert.assertEquals(QueryParams.catchall_pattern.toString(), filter); + } +}