Merge pull request #520 from funilrys/notDavid-protocol

Possible fix of the encoding and/or download issue(s)
This commit is contained in:
Steven Black 2018-03-03 15:11:38 -05:00 committed by GitHub
commit aa6b09561b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 191 additions and 34 deletions

View File

@ -5,4 +5,4 @@ conda create -n hosts python=$PYTHON_VERSION || exit 1
source activate hosts
echo "Installing packages..."
conda install mock flake8
conda install mock flake8 beautifulsoup4 lxml

View File

@ -39,14 +39,20 @@ folders.
## Generate your own unified hosts file
**Note** if you are using Python 3, please install the dependencies with:
pip3 install --user -r requirements.txt
**Note** if you are using Python 2, please install the dependencies with:
pip2 install --user -r requirements_python2.txt
**Note** we recommend the `--user` flag, which installs the required dependencies at the user level. More information about it can be found in the pip [documentation](https://pip.pypa.io/en/stable/reference/pip_install/?highlight=--user#cmdoption-user).
To run unit tests, in the top level directory, just run:
python testUpdateHostsFile.py
**Note** if you are using Python 2, you must first install the `mock` library:
pip install mock
The `updateHostsFile.py` script, which is Python 2.7 and Python 3-compatible,
will generate a unified hosts file based on the sources in the local `data/`
subfolder. The script will prompt you whether it should fetch updated
@ -104,9 +110,9 @@ in a subfolder. If the subfolder does not exist, it will be created.
section at the top, containing lines like `127.0.0.1 localhost`. This is
useful for configuring proximate DNS services on the local network.
`--compress`, or `-c`: `false` (default) or `true`, *Compress* the hosts file
ignoring non-necessary lines (empty lines and comments) and putting multiple
domains in each line. Reducing the number of lines of the hosts file improves
`--compress`, or `-c`: `false` (default) or `true`, *Compress* the hosts file
ignoring non-necessary lines (empty lines and comments) and putting multiple
domains in each line. Reducing the number of lines of the hosts file improves
the performance under Windows (with the DNS Client service enabled).
`--minimise`, or `-m`: `false` (default) or `true`, like `--compress`, but puts

3
requirements.txt Normal file
View File

@ -0,0 +1,3 @@
lxml==4.1.1
beautifulsoup4==4.6.0
mock==2.0.0

3
requirements_python2.txt Normal file
View File

@ -0,0 +1,3 @@
mock==2.0.0
lxml==4.1.1
beautifulsoup4==4.6.0

View File

@ -5,25 +5,29 @@
#
# Python script for testing updateHostsFile.py
from updateHostsFile import (
Colors, PY3, colorize, display_exclusion_options, exclude_domain,
flush_dns_cache, gather_custom_exclusions, get_defaults, get_file_by_url,
is_valid_domain_format, matches_exclusions, move_hosts_file_into_place,
normalize_rule, path_join_robust, print_failure, print_success,
prompt_for_exclusions, prompt_for_move, prompt_for_flush_dns_cache,
prompt_for_update, query_yes_no, recursive_glob, remove_old_hosts_file,
supports_color, strip_rule, update_all_sources, update_readme_data,
update_sources_data, write_data, write_opening_header)
import updateHostsFile
import unittest
import tempfile
import locale
import shutil
import json
import sys
import locale
import os
import re
import shutil
import sys
import tempfile
import unittest
import updateHostsFile
from updateHostsFile import (PY3, Colors, colorize, display_exclusion_options,
domain_to_idna, exclude_domain, flush_dns_cache,
gather_custom_exclusions, get_defaults,
get_file_by_url, is_valid_domain_format,
matches_exclusions, move_hosts_file_into_place,
normalize_rule, path_join_robust, print_failure,
print_success, prompt_for_exclusions,
prompt_for_flush_dns_cache, prompt_for_move,
prompt_for_update, query_yes_no, recursive_glob,
remove_old_hosts_file, strip_rule, supports_color,
update_all_sources, update_readme_data,
update_sources_data, write_data,
write_opening_header)
if PY3:
from io import BytesIO, StringIO
@ -1360,6 +1364,81 @@ def mock_url_open_decode_fail(_):
return m
class DomainToIDNA(Base):
    """Test the IDNA conversion performed by ``domain_to_idna``."""

    def __init__(self, *args, **kwargs):
        super(DomainToIDNA, self).__init__(*args, **kwargs)
        # UTF-8 byte strings of internationalized domains ...
        self.domains = [b'\xc9\xa2oogle.com', b'www.huala\xc3\xb1e.cl']
        # ... and the punycode each of them must be converted to.
        self.expected_domains = ['xn--oogle-wmc.com', 'www.xn--hualae-0wa.cl']

    def test_empty_line(self):
        # Empty or whitespace-only lines must come back unchanged.
        for empty in ["", "\r", "\n"]:
            self.assertEqual(domain_to_idna(empty), empty)

    def test_commented_line(self):
        # A full-line comment contains no domain and must be untouched.
        data = "# Hello World"
        self.assertEqual(domain_to_idna(data), data)

    def test_simple_line(self):
        # "<ip><separator><domain>" lines, with both supported separators.
        for separator in (" ", "\t"):
            for domain, expected in zip(self.domains, self.expected_domains):
                data = (b"0.0.0.0" + separator.encode("utf-8") + domain) \
                    .decode("utf-8")
                self.assertEqual(domain_to_idna(data),
                                 "0.0.0.0" + separator + expected)

    def test_single_line_with_comment_at_the_end(self):
        # A trailing comment must be preserved verbatim after conversion.
        for separator in (" ", "\t"):
            for domain, expected in zip(self.domains, self.expected_domains):
                data = (b"0.0.0.0" + separator.encode("utf-8") + domain +
                        b" # Hello World").decode("utf-8")
                self.assertEqual(domain_to_idna(data),
                                 "0.0.0.0" + separator + expected +
                                 " # Hello World")

    def test_single_line_without_prefix(self):
        # A bare domain (no IP prefix) is converted as a whole.
        for domain, expected in zip(self.domains, self.expected_domains):
            self.assertEqual(domain_to_idna(domain.decode("utf-8")), expected)
class GetFileByUrl(BaseStdout):
@mock.patch("updateHostsFile.urlopen",

View File

@ -6,23 +6,26 @@
# This Python script will combine all the host files you provide
# as sources into one, unique host file to keep you internet browsing happy.
from __future__ import (absolute_import, division,
print_function, unicode_literals)
from glob import glob
from __future__ import (absolute_import, division, print_function,
unicode_literals)
import os
import argparse
import fnmatch
import json
import locale
import os
import platform
import re
import shutil
import socket
import subprocess
import sys
import tempfile
import time
import fnmatch
import argparse
import socket
import json
from glob import glob
import lxml # noqa: F401
from bs4 import BeautifulSoup
# Detecting Python 3 for version-dependent implementations
PY3 = sys.version_info >= (3, 0)
@ -1126,6 +1129,60 @@ def remove_old_hosts_file(backup):
# End File Logic
def domain_to_idna(line):
    """
    Encode the domain of a hosts-file line to IDNA (punycode).

    Parameters
    ----------
    line : str
        The hosts-file line whose domain should be encoded.

    Returns
    -------
    line : str
        The line with its domain IDNA-encoded. The ``0.0.0.0`` /
        ``127.0.0.1`` prefix and any trailing ``#`` comment are kept
        verbatim; full-line comments are returned unchanged.

    Notes
    -----
    - Only the domain is encoded because, in most cases, the encoding
      issue comes from a domain which looks like
      ``b'\\xc9\\xa2oogle.com'.decode('idna')``.
    - The line is split so that only the domain is converted; the split
      parts are re-joined with the original separator once the domain
      has been encoded.
    """
    if line.startswith('#'):
        # Full-line comments carry no domain to convert.
        return line

    # '\t' is tested before ' ' so that a tab-separated line keeps its
    # space-prefixed trailing comment inside the second field.
    for separator in ('\t', ' '):
        if separator not in line:
            continue

        parts = line.split(separator)
        # NOTE(review): only parts[1] is converted, so a space-separated
        # bare domain followed by a comment stays unencoded — this
        # matches the historical behavior; confirm if that case matters.
        index_comment = parts[1].find('#')

        if index_comment > -1:
            # Encode only the text before the inline comment, then glue
            # the comment back on byte-for-byte.
            comment = parts[1][index_comment:]
            parts[1] = (parts[1][:index_comment]
                        .encode("IDNA").decode("UTF-8") + comment)
        else:
            parts[1] = parts[1].encode("IDNA").decode("UTF-8")

        return separator.join(parts)

    # No separator found: the whole line is a (possibly bare) domain.
    return line.encode("IDNA").decode("UTF-8")
# Helper Functions
def get_file_by_url(url):
"""
@ -1141,11 +1198,17 @@ def get_file_by_url(url):
url_data : str or None
The data retrieved at that URL from the file. Returns None if the
attempted retrieval is unsuccessful.
Note
----
- BeautifulSoup is used in this case to avoid having to search in which
format we have to encode or decode data before parsing it to UTF-8.
"""
try:
f = urlopen(url)
return f.read().decode("UTF-8")
soup = BeautifulSoup(f.read(), 'lxml').get_text()
return '\n'.join(list(map(domain_to_idna, soup.split('\n'))))
except Exception:
print("Problem getting file: ", url)
@ -1165,7 +1228,10 @@ def write_data(f, data):
if PY3:
f.write(bytes(data, "UTF-8"))
else:
f.write(str(data).encode("UTF-8"))
try:
f.write(str(data))
except UnicodeEncodeError:
f.write(str(data.encode("UTF-8")))
def list_dir_no_hidden(path):