diff --git a/ci/setup_conda_env.sh b/ci/setup_conda_env.sh index cf3cfcb25..a5bb3e1bc 100755 --- a/ci/setup_conda_env.sh +++ b/ci/setup_conda_env.sh @@ -5,4 +5,4 @@ conda create -n hosts python=$PYTHON_VERSION || exit 1 source activate hosts echo "Installing packages..." -conda install mock flake8 +conda install mock flake8 beautifulsoup4 lxml diff --git a/readme_template.md b/readme_template.md index 92148c5a6..6d9b8f737 100644 --- a/readme_template.md +++ b/readme_template.md @@ -39,14 +39,20 @@ folders. ## Generate your own unified hosts file +**Note** if you are using Python 3, please install the dependencies with: + + pip3 install --user -r requirements.txt + +**Note** if you are using Python 2, please install the dependencies with: + + pip2 install --user -r requirements_python2.txt + +**Note** we recommend the `--user` flag which installs the required dependencies at the user level. More information about it can be found on pip [documentation](https://pip.pypa.io/en/stable/reference/pip_install/?highlight=--user#cmdoption-user). + To run unit tests, in the top level directory, just run: python testUpdateHostsFile.py -**Note** if you are using Python 2, you must first install the `mock` library: - - pip install mock - The `updateHostsFile.py` script, which is Python 2.7 and Python 3-compatible, will generate a unified hosts file based on the sources in the local `data/` subfolder. The script will prompt you whether it should fetch updated @@ -104,9 +110,9 @@ in a subfolder. If the subfolder does not exist, it will be created. section at the top, containing lines like `127.0.0.1 localhost`. This is useful for configuring proximate DNS services on the local network. -`--compress`, or `-c`: `false` (default) or `true`, *Compress* the hosts file -ignoring non-necessary lines (empty lines and comments) and putting multiple -domains in each line. 
Reducing the number of lines of the hosts file improves +`--compress`, or `-c`: `false` (default) or `true`, *Compress* the hosts file +ignoring non-necessary lines (empty lines and comments) and putting multiple +domains in each line. Reducing the number of lines of the hosts file improves the performances under Windows (with DNS Client service enabled). `--minimise`, or `-m`: `false` (default) or `true`, like `--compress`, but puts diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 000000000..c51989f4f --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +lxml==4.1.1 +beautifulsoup4==4.6.0 +mock==2.0.0 diff --git a/requirements_python2.txt b/requirements_python2.txt new file mode 100644 index 000000000..5447edf82 --- /dev/null +++ b/requirements_python2.txt @@ -0,0 +1,3 @@ +mock==2.0.0 +lxml==4.1.1 +beautifulsoup4==4.6.0 diff --git a/testUpdateHostsFile.py b/testUpdateHostsFile.py index c1968c882..b4f4bfdca 100644 --- a/testUpdateHostsFile.py +++ b/testUpdateHostsFile.py @@ -5,25 +5,29 @@ # # Python script for testing updateHostFiles.py -from updateHostsFile import ( - Colors, PY3, colorize, display_exclusion_options, exclude_domain, - flush_dns_cache, gather_custom_exclusions, get_defaults, get_file_by_url, - is_valid_domain_format, matches_exclusions, move_hosts_file_into_place, - normalize_rule, path_join_robust, print_failure, print_success, - prompt_for_exclusions, prompt_for_move, prompt_for_flush_dns_cache, - prompt_for_update, query_yes_no, recursive_glob, remove_old_hosts_file, - supports_color, strip_rule, update_all_sources, update_readme_data, - update_sources_data, write_data, write_opening_header) - -import updateHostsFile -import unittest -import tempfile -import locale -import shutil import json -import sys +import locale import os import re +import shutil +import sys +import tempfile +import unittest + +import updateHostsFile +from updateHostsFile import (PY3, Colors, colorize, display_exclusion_options, + domain_to_idna, 
exclude_domain, flush_dns_cache, + gather_custom_exclusions, get_defaults, + get_file_by_url, is_valid_domain_format, + matches_exclusions, move_hosts_file_into_place, + normalize_rule, path_join_robust, print_failure, + print_success, prompt_for_exclusions, + prompt_for_flush_dns_cache, prompt_for_move, + prompt_for_update, query_yes_no, recursive_glob, + remove_old_hosts_file, strip_rule, supports_color, + update_all_sources, update_readme_data, + update_sources_data, write_data, + write_opening_header) if PY3: from io import BytesIO, StringIO @@ -1360,6 +1364,81 @@ def mock_url_open_decode_fail(_): return m +class DomainToIDNA(Base): + + def __init__(self, *args, **kwargs): + super(DomainToIDNA, self).__init__(*args, **kwargs) + + self.domains = [b'\xc9\xa2oogle.com', b'www.huala\xc3\xb1e.cl'] + self.expected_domains = ['xn--oogle-wmc.com', 'www.xn--hualae-0wa.cl'] + + def test_empty_line(self): + data = ["", "\r", "\n"] + + for empty in data: + expected = empty + + actual = domain_to_idna(empty) + self.assertEqual(actual, expected) + + def test_commented_line(self): + data = "# Hello World" + expected = data + actual = domain_to_idna(data) + + self.assertEqual(actual, expected) + + def test_simple_line(self): + # Test with a space as separator. + for i in range(len(self.domains)): + data = (b"0.0.0.0 " + self.domains[i]).decode('utf-8') + expected = "0.0.0.0 " + self.expected_domains[i] + + actual = domain_to_idna(data) + + self.assertEqual(actual, expected) + + # Test with a tabulation as separator. + for i in range(len(self.domains)): + data = (b"0.0.0.0\t" + self.domains[i]).decode('utf-8') + expected = "0.0.0.0\t" + self.expected_domains[i] + + actual = domain_to_idna(data) + + self.assertEqual(actual, expected) + + def test_single_line_with_comment_at_the_end(self): + # Test with a space as separator. 
def _token_to_idna(token):
    """
    Encode a single whitespace-free token with the `idna` codec.

    Parameters
    ----------
    token : str
        A candidate domain (or any other field of a hosts line).

    Returns
    -------
    token : str
        The IDNA (punycode) form of the token. Pure ASCII tokens come back
        unchanged. A token that the codec rejects (empty label, label longer
        than 63 characters, prohibited code point) is returned untouched so
        that one malformed entry cannot abort the processing of a whole
        source file.
    """

    try:
        return token.encode("IDNA").decode("UTF-8")
    except UnicodeError:
        return token


def domain_to_idna(line):
    """
    Encode the domain(s) present in a hosts line into `idna`.

    Parameters
    ----------
    line : str
        The line we have to convert.

    Returns
    -------
    line : str
        The line with every field located before the comment converted to
        its `idna` representation.

    Notes
    -----
    - Lines starting with `#` are returned as they are.
    - Only the part located before the first `#` is processed. The trailing
      comment is free text, not a domain: pushing it through the `idna`
      codec would mangle non-ASCII comments and would fail as soon as the
      comment is longer than 63 characters or contains `..`.
    - The part before the comment is cut on runs of whitespace and each
      field is encoded on its own. The whitespace itself is kept verbatim,
      so tabs, repeated spaces and line endings survive the round trip and
      the domain is converted wherever it stands in the line.
    - Pure ASCII fields such as the `0.0.0.0` or `127.0.0.1` prefix are not
      modified by the `idna` codec.
    """

    if line.startswith('#'):
        return line

    content, hash_mark, comment = line.partition('#')

    # The capturing group makes re.split keep the separators: the resulting
    # list alternates field, whitespace, field, ... and therefore the fields
    # are exactly the elements located at the even indexes.
    pieces = re.split(r'(\s+)', content)
    pieces[::2] = [_token_to_idna(field) for field in pieces[::2]]

    return ''.join(pieces) + hash_mark + comment
""" try: f = urlopen(url) - return f.read().decode("UTF-8") + soup = BeautifulSoup(f.read(), 'lxml').get_text() + return '\n'.join(list(map(domain_to_idna, soup.split('\n')))) except Exception: print("Problem getting file: ", url) @@ -1165,7 +1228,10 @@ def write_data(f, data): if PY3: f.write(bytes(data, "UTF-8")) else: - f.write(str(data).encode("UTF-8")) + try: + f.write(str(data)) + except UnicodeEncodeError: + f.write(str(data.encode("UTF-8"))) def list_dir_no_hidden(path):