Merge pull request #520 from funilrys/notDavid-protocol

Possible fix of the encoding and/or download issue(s)
2024-07-07 13:02:27 +02:00 · 2018-03-03 15:11:38 -05:00 · 2018-03-03 15:11:38 -05:00 · aa6b09561b
commit aa6b09561b
parent 37da299efb 1e64d1287a
6 changed files with 191 additions and 34 deletions
--- a/ci/setup_conda_env.sh
+++ b/ci/setup_conda_env.sh
@ -5,4 +5,4 @@ conda create -n hosts python=$PYTHON_VERSION || exit 1
 source activate hosts
 echo "Installing packages..."
-conda install mock flake8
+conda install mock flake8 beautifulsoup4 lxml
--- a/readme_template.md
+++ b/readme_template.md
@ -39,14 +39,20 @@ folders.
 ## Generate your own unified hosts file
 **Note** if you are using Python 3, please install the dependencies with:
    pip3 install --user -r requirements.txt
 **Note** if you are using Python 2, please install the dependencies with:
    pip2 install --user -r requirements_python2.txt
 **Note** we recommend the `--user` flag, which installs the required dependencies at the user level. More information about it can be found in the pip [documentation](https://pip.pypa.io/en/stable/reference/pip_install/?highlight=--user#cmdoption-user).
 To run unit tests, in the top level directory, just run:
    python testUpdateHostsFile.py
 **Note** if you are using Python 2, you must first install the `mock` library:
    pip install mock
 The `updateHostsFile.py` script, which is Python 2.7 and Python 3-compatible,
 will generate a unified hosts file based on the sources in the local `data/`
 subfolder.  The script will prompt you whether it should fetch updated
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,3 @@
 lxml==4.1.1
 beautifulsoup4==4.6.0
 mock==2.0.0
--- a/requirements_python2.txt
+++ b/requirements_python2.txt
@ -0,0 +1,3 @@
 mock==2.0.0
 lxml==4.1.1
 beautifulsoup4==4.6.0
--- a/testUpdateHostsFile.py
+++ b/testUpdateHostsFile.py
@ -5,25 +5,29 @@
 #
 # Python script for testing updateHostsFile.py
 from updateHostsFile import (
    Colors, PY3, colorize, display_exclusion_options, exclude_domain,
    flush_dns_cache, gather_custom_exclusions, get_defaults, get_file_by_url,
    is_valid_domain_format, matches_exclusions, move_hosts_file_into_place,
    normalize_rule, path_join_robust, print_failure, print_success,
    prompt_for_exclusions, prompt_for_move, prompt_for_flush_dns_cache,
    prompt_for_update, query_yes_no, recursive_glob, remove_old_hosts_file,
    supports_color, strip_rule, update_all_sources, update_readme_data,
    update_sources_data, write_data, write_opening_header)
 import updateHostsFile
 import unittest
 import tempfile
 import locale
 import shutil
 import json
-import sys
+import locale
 import os
 import re
 import shutil
 import sys
 import tempfile
 import unittest
 import updateHostsFile
 from updateHostsFile import (PY3, Colors, colorize, display_exclusion_options,
                             domain_to_idna, exclude_domain, flush_dns_cache,
                             gather_custom_exclusions, get_defaults,
                             get_file_by_url, is_valid_domain_format,
                             matches_exclusions, move_hosts_file_into_place,
                             normalize_rule, path_join_robust, print_failure,
                             print_success, prompt_for_exclusions,
                             prompt_for_flush_dns_cache, prompt_for_move,
                             prompt_for_update, query_yes_no, recursive_glob,
                             remove_old_hosts_file, strip_rule, supports_color,
                             update_all_sources, update_readme_data,
                             update_sources_data, write_data,
                             write_opening_header)
 if PY3:
    from io import BytesIO, StringIO
@ -1360,6 +1364,81 @@ def mock_url_open_decode_fail(_):
    return m
 class DomainToIDNA(Base):
     """Check `domain_to_idna` against the line shapes found in hosts files."""

     def __init__(self, *args, **kwargs):
         super(DomainToIDNA, self).__init__(*args, **kwargs)
         # UTF-8 encoded domains and, at the same position in the second
         # list, the IDNA form each of them is expected to be converted to.
         self.domains = [
             b'\xc9\xa2oogle.com',
             b'www.huala\xc3\xb1e.cl',
         ]
         self.expected_domains = [
             'xn--oogle-wmc.com',
             'www.xn--hualae-0wa.cl',
         ]

     def _domain_pairs(self):
         # Pair every raw domain with the IDNA form expected for it.
         return zip(self.domains, self.expected_domains)

     def test_empty_line(self):
         # Lines holding nothing but a line ending come back untouched.
         for blank in ["", "\r", "\n"]:
             self.assertEqual(domain_to_idna(blank), blank)

     def test_commented_line(self):
         # A full-line comment is not a rule and must not be modified.
         comment = "# Hello World"
         self.assertEqual(domain_to_idna(comment), comment)

     def test_simple_line(self):
         # Every domain is checked behind a space first, then behind a
         # tabulation; each prefix is given as (raw bytes, expected text).
         prefixes = (
             (b"0.0.0.0 ", "0.0.0.0 "),
             (b"0.0.0.0\t", "0.0.0.0\t"),
         )
         for raw_prefix, text_prefix in prefixes:
             for raw_domain, idna_domain in self._domain_pairs():
                 line = (raw_prefix + raw_domain).decode('utf-8')
                 wanted = text_prefix + idna_domain
                 self.assertEqual(domain_to_idna(line), wanted)

     def test_single_line_with_comment_at_the_end(self):
         # Same as the simple case, with a trailing comment that has to
         # survive the conversion unchanged.
         prefixes = (
             (b"0.0.0.0 ", "0.0.0.0 "),
             (b"0.0.0.0\t", "0.0.0.0\t"),
         )
         for raw_prefix, text_prefix in prefixes:
             for raw_domain, idna_domain in self._domain_pairs():
                 raw_line = raw_prefix + raw_domain + b" # Hello World"
                 line = raw_line.decode('utf-8')
                 wanted = text_prefix + idna_domain + " # Hello World"
                 self.assertEqual(domain_to_idna(line), wanted)

     def test_single_line_without_prefix(self):
         # A bare domain, with no address in front of it, is converted too.
         for raw_domain, idna_domain in self._domain_pairs():
             line = raw_domain.decode('utf-8')
             self.assertEqual(domain_to_idna(line), idna_domain)
 class GetFileByUrl(BaseStdout):
    @mock.patch("updateHostsFile.urlopen",
--- a/updateHostsFile.py
+++ b/updateHostsFile.py
@ -6,23 +6,26 @@
 # This Python script will combine all the host files you provide
 # as sources into one, unique host file to keep your internet browsing happy.
-from __future__ import (absolute_import, division,
+from __future__ import (absolute_import, division, print_function,
-                        print_function, unicode_literals)
+                        unicode_literals)
 from glob import glob
-import os
+import argparse
 import fnmatch
 import json
 import locale
 import os
 import platform
 import re
 import shutil
 import socket
 import subprocess
 import sys
 import tempfile
 import time
-import fnmatch
+from glob import glob
-import argparse
+
-import socket
+import lxml  # noqa: F401
-import json
+from bs4 import BeautifulSoup
 # Flag used by the version-dependent implementations: true when the
 # interpreter running this script is Python 3 or newer.
 PY3 = sys.version_info[0] >= 3
@ -1126,6 +1129,60 @@ def remove_old_hosts_file(backup):
 # End File Logic
 def domain_to_idna(line):
     """
     Encode the domain found in a hosts-file line into its `idna` form.

     Only the domain is converted: the address in front of it, the
     whitespace around it and any trailing comment are returned exactly as
     they were received.

     Parameters
     ----------
     line : str
         A single line of a hosts file, for example `0.0.0.0 example.com`,
         a bare domain, a comment or an empty line.

     Returns
     -------
     line : str
         The same line with its domain IDNA-encoded. Lines starting with
         `#` and lines holding no field at all are returned unchanged.

     Raises
     ------
     UnicodeError
         If the `idna` codec rejects the domain itself (for instance an
         empty or an over-long label).

     Notes
     -----
     - The domain is the second whitespace-delimited field of the line, or
         the only field when the line has no address prefix.
     - Everything from the first `#` onwards is a comment and is never
         handed to the codec: comment text is not a domain, so encoding it
         would either rewrite it or raise on an otherwise valid line.
     - Fields are located by runs of whitespace rather than by a single
         separator character, so repeated spaces, mixed tabulations and
         spaces, or a trailing carriage return do not hide the domain from
         the conversion nor end up inside the encoded value.
     """
     if line.startswith('#'):
         return line

     # Set the comment aside so that only the rule itself is inspected.
     comment_start = line.find('#')
     if comment_start == -1:
         comment_start = len(line)
     rule, comment = line[:comment_start], line[comment_start:]

     # The capturing group keeps every whitespace run in the result, which
     # lets the line be rebuilt exactly: fields sit at the even positions,
     # the whitespace between them at the odd ones.
     pieces = re.split(r'([ \t\r\n\f\v]+)', rule)
     fields = [index for index in range(0, len(pieces), 2) if pieces[index]]
     if not fields:
         # Blank line, or whitespace followed by a comment: nothing to do.
         return line

     target = fields[1] if len(fields) > 1 else fields[0]
     pieces[target] = pieces[target].encode("IDNA").decode("UTF-8")
     return ''.join(pieces) + comment
 # Helper Functions
 def get_file_by_url(url):
    """
@ -1141,11 +1198,17 @@ def get_file_by_url(url):
    url_data : str or None
        The data retrieved at that URL from the file. Returns None if the
        attempted retrieval is unsuccessful.
    Note
    ----
    - BeautifulSoup is used in this case to avoid having to search in which
        format we have to encode or decode data before parsing it to UTF-8.
    """
    try:
        f = urlopen(url)
-        return f.read().decode("UTF-8")
+        soup = BeautifulSoup(f.read(), 'lxml').get_text()
        return '\n'.join(list(map(domain_to_idna, soup.split('\n'))))
    except Exception:
        print("Problem getting file: ", url)
@ -1165,7 +1228,10 @@ def write_data(f, data):
    if PY3:
        f.write(bytes(data, "UTF-8"))
    else:
-        f.write(str(data).encode("UTF-8"))
+        try:
            f.write(str(data))
        except UnicodeEncodeError:
            f.write(str(data.encode("UTF-8")))
 def list_dir_no_hidden(path):