Improved accuracy of URLs search filters : protocol, tld, host, file ext

This commit is contained in:
luccioman 2017-12-01 11:19:31 +01:00
parent d1c7dfd852
commit 0a120787e3
4 changed files with 330 additions and 22 deletions

View File

@ -912,7 +912,7 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
}
/**
* @return the host part of this URL, Punycode encoded for Internationalized Domain Names
* @return the host part of this URL, Punycode encoded for Internationalized Domain Names. Can be null, for example for file URLs such as "file:///path/file.ext"
*/
public String getHost() {
return this.host;
@ -926,6 +926,9 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
return orga;
}
/**
* @return the top-level domain name part of this url host name, or the empty string.
*/
public String getTLD() {
if (this.host == null) return "";
int p = this.host.lastIndexOf('.');

View File

@ -46,6 +46,7 @@ import java.util.regex.PatternSyntaxException;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.RegExp;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrQuery.SortClause;
import org.apache.solr.common.params.DisMaxParams;
@ -55,6 +56,7 @@ import net.yacy.cora.document.analysis.Classification;
import net.yacy.cora.document.analysis.Classification.ContentDomain;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.solr.Ranking;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.geo.GeoLocation;
@ -217,7 +219,7 @@ public final class QueryParams {
}
this.urlMask_isCatchall = this.urlMaskString.equals(catchall_pattern.toString());
if (this.urlMask_isCatchall) {
final String filter = QueryParams.buildURLFilter(modifier, tld);
final String filter = QueryParams.buildApproximateURLFilter(modifier, tld);
if (!QueryParams.catchall_pattern.toString().equals(filter)) {
this.urlMaskString = filter;
this.urlMaskAutomaton = Automata.makeString(filter);
@ -277,6 +279,13 @@ public final class QueryParams {
}
/**
* Generate an URL filter from the query modifier and eventual tld, usable as a
* first approximation for filtering, and compatible with the yacy/search
* API.<br/>
* For truly accurate filtering, checking constraints against parsed URLs in
* MultiProtocolURL instances is easier and more reliable than building a complex regular
* expression that must be both compatible with the JDK {@link Pattern} and with Lucene {@link RegExp}.
*
* @param modifier
* query modifier with eventual protocol, sitehost and filetype
* constraints. The modifier parameter itself must not be null.
@ -285,7 +294,7 @@ public final class QueryParams {
* @return an URL filter regular expression from the provided modifier and tld
* constraints, matching anything when there are no constraints at all.
*/
protected static String buildURLFilter(final QueryModifier modifier, final String tld) {
protected static String buildApproximateURLFilter(final QueryModifier modifier, final String tld) {
final String protocolfilter = modifier.protocol == null ? ".*" : modifier.protocol;
final String defaulthostprefix = "www";
final String hostfilter;
@ -416,6 +425,61 @@ public final class QueryParams {
sb.append("]");
return sb.toString();
}
/**
 * Check the given URL against the eventual protocol, site host, top-level
 * domain and file extension constraints.
 *
 * @param modifier
 *            the query modifier with eventual constraints on protocol, host
 *            name or file extension. May be null (no modifier constraints).
 * @param tld
 *            an eventual top-level domain name to filter on. May be null.
 *            This constraint is independent from the modifier and is checked
 *            even when modifier is null.
 * @param url
 *            the url to check
 * @return the constraint that did not match ("url" when url is null,
 *         "protocol", "sitehost", "tld", or "filetype"), or the empty string
 *         when the url matches
 */
public static String matchesURL(final QueryModifier modifier, final String tld, final MultiProtocolURL url) {
    if (url == null) {
        return "url";
    }
    if (modifier != null) {
        if (modifier.protocol != null && !modifier.protocol.equalsIgnoreCase(url.getProtocol())) {
            return "protocol";
        }
        if (modifier.sitehost != null) {
            /*
             * consider to search for hosts with 'www'-prefix, if not already part of the
             * host name
             */
            final String wwwPrefix = "www.";
            final String host;
            final String hostWithWwwPrefix;
            if (modifier.sitehost.startsWith(wwwPrefix)) {
                hostWithWwwPrefix = modifier.sitehost;
                host = modifier.sitehost.substring(wwwPrefix.length());
            } else {
                hostWithWwwPrefix = wwwPrefix + modifier.sitehost;
                host = modifier.sitehost;
            }
            /* host names are case-insensitive : compare both forms with equalsIgnoreCase */
            if (!host.equalsIgnoreCase(url.getHost()) && !hostWithWwwPrefix.equalsIgnoreCase(url.getHost())) {
                return "sitehost";
            }
        }
    }
    /* the tld constraint does not depend on the modifier : check it even when modifier is null */
    if (tld != null && !tld.equalsIgnoreCase(url.getTLD())) {
        return "tld";
    }
    if (modifier != null && modifier.filetype != null
            && !modifier.filetype.equalsIgnoreCase(MultiProtocolURL.getFileExtension(url.getFileName()))) {
        return "filetype";
    }
    return "";
}
/**
* check if the given text matches with the query

View File

@ -963,6 +963,16 @@ public final class SearchEvent implements ScoreMapUpdatesListener {
try {
pollloop: for (URIMetadataNode iEntry: nodeList) {
// check url related eventual constraints (protocol, tld, sitehost, and filetype)
final String matchingResult = QueryParams.matchesURL(this.query.modifier, this.query.tld, iEntry.url());
if (!matchingResult.isEmpty()) {
if (log.isFine()) {
log.fine("dropped Node: " + matchingResult);
}
updateCountsOnSolrEntryToEvict(iEntry, facets, local, !incrementNavigators);
continue pollloop;
}
if ( !this.query.urlMask_isCatchall ) {
// check url mask
if (!iEntry.matches(this.query.urlMaskPattern)) {
@ -1019,13 +1029,6 @@ public final class SearchEvent implements ScoreMapUpdatesListener {
updateCountsOnSolrEntryToEvict(iEntry, facets, local, !incrementNavigators);
continue pollloop;
}
} else {
// filter out all domains that do not match with the site constraint
if (iEntry.url().getHost().indexOf(this.query.modifier.sitehost) < 0) {
if (log.isFine()) log.fine("dropped Node: sitehost");
updateCountsOnSolrEntryToEvict(iEntry, facets, local, !incrementNavigators);
continue pollloop;
}
}
if (this.query.modifier.language != null) {
@ -1393,6 +1396,16 @@ public final class SearchEvent implements ScoreMapUpdatesListener {
// returns from the current RWI list the best URL entry and removes this entry from the list
URIMetadataNode page;
mainloop: while ((page = pullOneRWI(skipDoubleDom)) != null) {
// check url related eventual constraints (protocol, tld, sitehost, and filetype)
final String matchingResult = QueryParams.matchesURL(this.query.modifier, this.query.tld, page.url());
if (!matchingResult.isEmpty()) {
if (log.isFine()) {
log.fine("dropped RWI: no match on " + matchingResult);
}
decrementCounts(page.word());
continue;
}
if (!this.query.urlMask_isCatchall && !page.matches(this.query.urlMaskPattern)) {
if (log.isFine()) log.fine("dropped RWI: no match with urlMask");
@ -1427,14 +1440,6 @@ public final class SearchEvent implements ScoreMapUpdatesListener {
}
// filter query modifiers variables (these are host, filetype, protocol, language, author, collection, dates_in_content(on,from,to,timezone) )
// while ( protocol, host, filetype ) currently maybe incorporated in (this.query.urlMaskPattern) queryparam
// check modifier constraint filetype (using fileextension)
if (this.query.modifier.filetype != null && !this.query.modifier.filetype.equals(ext)) {
if (log.isFine()) log.fine("dropped RWI: file type constraint = " + this.query.modifier.filetype);
decrementCounts(page.word());
continue;
}
/* check again modifier constraint (language) with the language in the full metadata,
* that may differ from the one in the reverse word reference which is already checked in addRWIs()*/
@ -1480,12 +1485,12 @@ public final class SearchEvent implements ScoreMapUpdatesListener {
// content control
if (Switchboard.getSwitchboard().getConfigBool("contentcontrol.enabled", false)) {
FilterEngine f = ContentControlFilterUpdateThread.getNetworkFilter();
if (f != null && !f.isListed(page.url(), null)) {
FilterEngine f = ContentControlFilterUpdateThread.getNetworkFilter();
if (f != null && !f.isListed(page.url(), null)) {
if (log.isFine()) log.fine("dropped RWI: url is blacklisted in contentcontrol");
decrementCounts(page.word());
decrementCounts(page.word());
continue;
}
}
}
final String pageurl = page.url().toNormalform(true);

View File

@ -0,0 +1,236 @@
// QueryParamsTest.java
// ---------------------------
// Copyright 2017 by luccioman; https://github.com/luccioman
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.search.query;
import java.net.MalformedURLException;
import org.junit.Assert;
import org.junit.Test;
import net.yacy.cora.document.id.MultiProtocolURL;
/**
 * Unit tests for the {@link QueryParams} class.
 */
public class QueryParamsTest {

    /**
     * Test URL matching with a single query constraint on top-level domain.
     * @throws MalformedURLException when a test URL is malformed. Should not happen.
     */
    @Test
    public void testMatchesURLTLD() throws MalformedURLException {
        final String[] matchingURLs = { "http://example.org", // most basic matching example
                "http://example.org/", // normalized basic example
                "http://www.example.org/", // with www domain prefix
                "http://example.org:8080", // non default port
                "http://example.org?key=value", // empty path and query string
                "http://example.org?key=value#fragment", // empty path, query string and fragment
                "http://example.org:8080?key=value#fragment", // non default port, empty path, query string and fragment
                "http://example.org:8080/?key=value#fragment", // normalized non default port, empty path, query string and fragment
                "http://example.org#fragment", // empty path and fragment
                "ftp://example.org", // another protocol than http
                "http://example.org/index.html", // with file
                "http://example.org/path/index.html", // with path and file
                "http://example.org:8090/path/index.html", // with non default port, path and file
                "http://example.org/index.html?key=value", // with file and query string
                "http://example.org/index.html?key=value#fragment", // with file, query string and url fragment
        };
        final String[] nonMatchingURLs = { "http://example.test", // basic non matching example
                "http://example.test/", // normalized basic example
                "http://org.example.net", // only subdomain matching
                "http://example.org.net", // only secondary-level domain matching
                "http://organization.test", // secondary-level starting like the filter
                "http://test.organic", // top-level domain starting like the filter
                "http://en.organization.test", // subdomain then secondary-level starting like the filter
                "http://example.test/path/file.org", // with file ending like the tld filter
                "http://example.test/?query=example.org", // with query parameter including the tld
                "http://example.test/#fragment.org", // with query parameter including the tld
                "file:///path/file.txt", // empty host name in file URL
                "http://127.0.0.1/index.html", // IPv4 address
                "http://[2001:db8::ff00:42:8329]/index.html" // IPv6 address
        };
        final QueryModifier modifier = new QueryModifier(0);
        checkURLs(matchingURLs, nonMatchingURLs, modifier, "org");
    }

    /**
     * Check matching and non matching URLs against the given query modifier and
     * eventual top-level domain name.
     *
     * @param matchingURLs
     *            array of URLs expected to be accepted
     * @param nonMatchingURLs
     *            array of URLs expected to be rejected
     * @param modifier
     *            the query modifier
     * @param tld
     *            the eventual top-level domain to filter on.
     * @throws MalformedURLException when a test URL string is malformed
     */
    private void checkURLs(final String[] matchingURLs, final String[] nonMatchingURLs, final QueryModifier modifier, final String tld) throws MalformedURLException {
        for (final String matchingURL : matchingURLs) {
            Assert.assertEquals(matchingURL + " should match", "", QueryParams.matchesURL(modifier, tld, new MultiProtocolURL(matchingURL)));
        }
        for (final String nonMatchingURL : nonMatchingURLs) {
            Assert.assertNotEquals(nonMatchingURL + " should not match", "",
                    QueryParams.matchesURL(modifier, tld, new MultiProtocolURL(nonMatchingURL)));
        }
    }

    /**
     * Test URL matching with a single query constraint on URL scheme.
     * @throws MalformedURLException when a test URL is malformed. Should not happen.
     */
    @Test
    public void testMatchesURLProtocol() throws MalformedURLException {
        final String[] matchingURLs = { "http://example.org/" };
        final String[] nonMatchingURLs = { "https://example.org/",
                "ftp://www.example.test/", "smb://localhost",
                "mailto:user@example.com", "file:///tmp/path/",
                "https://example.org/index.html?query=http", // with query parameter including the protocol
                "https://example.org/index.html#http" // with fragment string including the protocol
        };
        final QueryModifier modifier = new QueryModifier(0);
        modifier.protocol = "http";
        checkURLs(matchingURLs, nonMatchingURLs, modifier, null);
    }

    /**
     * Test URL matching with a single query constraint on host name.
     * @throws MalformedURLException when a test URL is malformed. Should not happen.
     */
    @Test
    public void testMatchesURLHostName() throws MalformedURLException {
        final String[] matchingURLs = { "http://example.org", // most basic matching example
                "http://example.org/", // normalized basic example
                "http://www.example.org/", // with www domain prefix
                "http://example.org:8080", // non default port
                "http://example.org?key=value", // empty path and query string
                "http://example.org?key=value#fragment", // empty path, query string and fragment
                "http://example.org:8080?key=value#fragment", // non default port, empty path, query string and fragment
                "http://example.org:8080/?key=value#fragment", // normalized non default port, empty path, query string and fragment
                "http://example.org#fragment", // empty path and fragment
                "ftp://example.org", // another protocol than http
                "http://example.org/index.html", // with file
                "http://example.org/path/index.html", // with path and file
                "http://example.org:8090/path/index.html", // with non default port, path and file
                "http://example.org/index.html?key=value", // with file and query string
                "http://example.org/index.html?key=value#fragment", // with file, query string and url fragment
        };
        final String[] nonMatchingURLs = { "http://domain.test", // basic non matching example
                "http://domain.test/", // normalized basic example
                "http://fr.example.org", // domain prefix different from www
                "http://example.net", // only secondary-level domain matching
                "http://test.org", // only top-level domain matching
                "http://example.organic", // domain starting like the one of the filter
                "http://unexample.org", // domain ending like the one of the filter
                "http://example.net/index.html?query=example.org", // with query including the filtered domain
                "http://example.net/index.html#example.org", // with fragment string including the filtered domain
                "file:///path/file.txt", // empty host name in file URL
                "http://127.0.0.1/index.html", // IPv4 address
                "http://[2001:db8::ff00:42:8329]/index.html" // IPv6 address
        };
        final QueryModifier modifier = new QueryModifier(0);
        modifier.sitehost = "example.org";
        checkURLs(matchingURLs, nonMatchingURLs, modifier, null);
    }

    /**
     * Test URL matching with a single query constraint on file extension.
     * @throws MalformedURLException when a test URL is malformed. Should not happen.
     */
    @Test
    public void testMatchesURLFileExt() throws MalformedURLException {
        final String[] matchingURLs = { "http://example.org/image.html", // most basic matching example
                "http://example.org/image.html#anchor", // with url fragment
                "http://example.org/image.html?key=value#anchor", // with query string and url fragment
        };
        final String[] nonMatchingURLs = { "http://example.org/file.txt", // basic non matching example
                "http://example.org/file.xhtml", // extension ending like the expected one
                "http://example.org/html/example.txt", // extension found in path
                "http://example.org/resource?key=html", // extension found as query parameter value
                "http://example.org/resource#html", // extension found as url fragment
        };
        final QueryModifier modifier = new QueryModifier(0);
        modifier.filetype = "html";
        checkURLs(matchingURLs, nonMatchingURLs, modifier, null);
    }

    /**
     * Test URL matching with combined protocol and host name query modifiers.
     * @throws MalformedURLException when a test URL is malformed. Should not happen.
     */
    @Test
    public void testMatchesURLProtocolAndHostName() throws MalformedURLException {
        final String[] matchingURLs = { "http://example.org", // most basic matching example
                "http://example.org/", // normalized basic example
                "http://www.example.org/", // with www domain prefix
                "http://example.org:8080", // non default port
                "http://example.org?key=value", // empty path and query string
                "http://example.org?key=value#fragment", // empty path, query string and fragment
                "http://example.org:8080?key=value#fragment", // non default port, empty path, query string and fragment
                "http://example.org:8080/?key=value#fragment", // normalized non default port, empty path, query string and fragment
                "http://example.org#fragment", // empty path and fragment
                "http://example.org/index.html", // with file
                "http://example.org/path/index.html", // with path and file
                "http://example.org:8090/path/index.html", // with non default port, path and file
                "http://example.org/index.html?key=value", // with file and query string
                "http://example.org/index.html?key=value#fragment", // with file, query string and url fragment
        };
        final String[] nonMatchingURLs = { "ftp://domain.test", // basic non matching example
                "ftp://domain.test/", // normalized basic example
                "ftp://example.org/", // only domain matching
                "http://fr.example.org", // domain prefix different from www
                "http://example.net", // only secondary-level domain matching
                "http://test.org", // only top-level domain matching
                "http://example.organic", // domain starting like the one of the filter
                "http://unexample.org", // domain ending like the one of the filter
                "http://example.net/index.html?query=example.org", // with query including the filtered domain
                "http://example.net/index.html#example.org", // with fragment string including the filtered domain
                "http://127.0.0.1/index.html", // IPv4 address
                "http://[2001:db8::ff00:42:8329]/index.html" // IPv6 address
        };
        final QueryModifier modifier = new QueryModifier(0);
        modifier.sitehost = "example.org";
        modifier.protocol = "http";
        checkURLs(matchingURLs, nonMatchingURLs, modifier, null);
    }

    /**
     * Test URL filter build with no constraints at all.
     */
    @Test
    public void testBuildURLFilterEmpty() {
        final QueryModifier modifier = new QueryModifier(0);
        final String filter = QueryParams.buildApproximateURLFilter(modifier, null);
        Assert.assertEquals(QueryParams.catchall_pattern.toString(), filter);
    }
}