Merge pull request #520 from funilrys/notDavid-protocol

Possible fix of the encoding and/or download issue(s)
This commit is contained in:
Steven Black 2018-03-03 15:11:38 -05:00 committed by GitHub
commit aa6b09561b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 191 additions and 34 deletions

View File

@ -5,4 +5,4 @@ conda create -n hosts python=$PYTHON_VERSION || exit 1
source activate hosts
echo "Installing packages..."
conda install mock flake8
conda install mock flake8 beautifulsoup4 lxml

View File

@ -39,14 +39,20 @@ folders.
## Generate your own unified hosts file
**Note** if you are using Python 3, please install the dependencies with:
pip3 install --user -r requirements.txt
**Note** if you are using Python 2, please install the dependencies with:
pip2 install --user -r requirements_python2.txt
**Note** we recommend the `--user` flag, which installs the required dependencies at the user level. More information about it can be found in the pip [documentation](https://pip.pypa.io/en/stable/reference/pip_install/?highlight=--user#cmdoption-user).
To run unit tests, in the top level directory, just run:
python testUpdateHostsFile.py
**Note** if you are using Python 2, you must first install the `mock` library:
pip install mock
The `updateHostsFile.py` script, which is Python 2.7 and Python 3-compatible,
will generate a unified hosts file based on the sources in the local `data/`
subfolder. The script will prompt you whether it should fetch updated
@ -104,9 +110,9 @@ in a subfolder. If the subfolder does not exist, it will be created.
section at the top, containing lines like `127.0.0.1 localhost`. This is
useful for configuring proximate DNS services on the local network.
`--compress`, or `-c`: `false` (default) or `true`, *Compress* the hosts file
ignoring non-necessary lines (empty lines and comments) and putting multiple
domains in each line. Reducing the number of lines of the hosts file improves
`--compress`, or `-c`: `false` (default) or `true`, *Compress* the hosts file
ignoring non-necessary lines (empty lines and comments) and putting multiple
domains in each line. Reducing the number of lines of the hosts file improves
the performance under Windows (with the DNS Client service enabled).
`--minimise`, or `-m`: `false` (default) or `true`, like `--compress`, but puts

3
requirements.txt Normal file
View File

@ -0,0 +1,3 @@
lxml==4.1.1
beautifulsoup4==4.6.0
mock==2.0.0

3
requirements_python2.txt Normal file
View File

@ -0,0 +1,3 @@
mock==2.0.0
lxml==4.1.1
beautifulsoup4==4.6.0

View File

@ -5,25 +5,29 @@
#
# Python script for testing updateHostsFile.py
from updateHostsFile import (
Colors, PY3, colorize, display_exclusion_options, exclude_domain,
flush_dns_cache, gather_custom_exclusions, get_defaults, get_file_by_url,
is_valid_domain_format, matches_exclusions, move_hosts_file_into_place,
normalize_rule, path_join_robust, print_failure, print_success,
prompt_for_exclusions, prompt_for_move, prompt_for_flush_dns_cache,
prompt_for_update, query_yes_no, recursive_glob, remove_old_hosts_file,
supports_color, strip_rule, update_all_sources, update_readme_data,
update_sources_data, write_data, write_opening_header)
import updateHostsFile
import unittest
import tempfile
import locale
import shutil
import json
import sys
import locale
import os
import re
import shutil
import sys
import tempfile
import unittest
import updateHostsFile
from updateHostsFile import (PY3, Colors, colorize, display_exclusion_options,
domain_to_idna, exclude_domain, flush_dns_cache,
gather_custom_exclusions, get_defaults,
get_file_by_url, is_valid_domain_format,
matches_exclusions, move_hosts_file_into_place,
normalize_rule, path_join_robust, print_failure,
print_success, prompt_for_exclusions,
prompt_for_flush_dns_cache, prompt_for_move,
prompt_for_update, query_yes_no, recursive_glob,
remove_old_hosts_file, strip_rule, supports_color,
update_all_sources, update_readme_data,
update_sources_data, write_data,
write_opening_header)
if PY3:
from io import BytesIO, StringIO
@ -1360,6 +1364,81 @@ def mock_url_open_decode_fail(_):
return m
class DomainToIDNA(Base):
    """Test the IDNA conversion performed by ``domain_to_idna``."""

    def __init__(self, *args, **kwargs):
        super(DomainToIDNA, self).__init__(*args, **kwargs)
        # UTF-8 byte strings of internationalized domains ...
        self.domains = [b'\xc9\xa2oogle.com', b'www.huala\xc3\xb1e.cl']
        # ... and the punycode each of them must be converted to.
        self.expected_domains = ['xn--oogle-wmc.com', 'www.xn--hualae-0wa.cl']

    def test_empty_line(self):
        # Empty or whitespace-only lines must come back unchanged.
        for empty in ["", "\r", "\n"]:
            self.assertEqual(domain_to_idna(empty), empty)

    def test_commented_line(self):
        # A full-line comment contains no domain and must be untouched.
        data = "# Hello World"
        self.assertEqual(domain_to_idna(data), data)

    def test_simple_line(self):
        # "<ip><separator><domain>" lines, with both supported separators.
        for separator in (" ", "\t"):
            for domain, expected in zip(self.domains, self.expected_domains):
                data = (b"0.0.0.0" + separator.encode("utf-8") + domain) \
                    .decode("utf-8")
                self.assertEqual(domain_to_idna(data),
                                 "0.0.0.0" + separator + expected)

    def test_single_line_with_comment_at_the_end(self):
        # A trailing comment must be preserved verbatim after conversion.
        for separator in (" ", "\t"):
            for domain, expected in zip(self.domains, self.expected_domains):
                data = (b"0.0.0.0" + separator.encode("utf-8") + domain +
                        b" # Hello World").decode("utf-8")
                self.assertEqual(domain_to_idna(data),
                                 "0.0.0.0" + separator + expected +
                                 " # Hello World")

    def test_single_line_without_prefix(self):
        # A bare domain (no IP prefix) is converted as a whole.
        for domain, expected in zip(self.domains, self.expected_domains):
            self.assertEqual(domain_to_idna(domain.decode("utf-8")), expected)
class GetFileByUrl(BaseStdout):
@mock.patch("updateHostsFile.urlopen",

View File

@ -6,23 +6,26 @@
# This Python script will combine all the host files you provide
# as sources into one, unique host file to keep you internet browsing happy.
from __future__ import (absolute_import, division,
print_function, unicode_literals)
from glob import glob
from __future__ import (absolute_import, division, print_function,
unicode_literals)
import os
import argparse
import fnmatch
import json
import locale
import os
import platform
import re
import shutil
import socket
import subprocess
import sys
import tempfile
import time
import fnmatch
import argparse
import socket
import json
from glob import glob
import lxml # noqa: F401
from bs4 import BeautifulSoup
# Detecting Python 3 for version-dependent implementations
PY3 = sys.version_info >= (3, 0)
@ -1126,6 +1129,60 @@ def remove_old_hosts_file(backup):
# End File Logic
def domain_to_idna(line):
    """
    Encode the domain of a hosts-file line to IDNA (punycode).

    Parameters
    ----------
    line : str
        The hosts-file line whose domain should be encoded.

    Returns
    -------
    line : str
        The line with its domain IDNA-encoded. The ``0.0.0.0`` /
        ``127.0.0.1`` prefix and any trailing ``#`` comment are kept
        verbatim; full-line comments are returned unchanged.

    Notes
    -----
    - Only the domain is encoded because, in most cases, the encoding
      issue comes from a domain which looks like
      ``b'\\xc9\\xa2oogle.com'.decode('idna')``.
    - The line is split so that only the domain is converted; the split
      parts are re-joined with the original separator once the domain
      has been encoded.
    """
    if line.startswith('#'):
        # Full-line comments carry no domain to convert.
        return line

    # '\t' is tested before ' ' so that a tab-separated line keeps its
    # space-prefixed trailing comment inside the second field.
    for separator in ('\t', ' '):
        if separator not in line:
            continue

        parts = line.split(separator)
        # NOTE(review): only parts[1] is converted, so a space-separated
        # bare domain followed by a comment stays unencoded — this
        # matches the historical behavior; confirm if that case matters.
        index_comment = parts[1].find('#')

        if index_comment > -1:
            # Encode only the text before the inline comment, then glue
            # the comment back on byte-for-byte.
            comment = parts[1][index_comment:]
            parts[1] = (parts[1][:index_comment]
                        .encode("IDNA").decode("UTF-8") + comment)
        else:
            parts[1] = parts[1].encode("IDNA").decode("UTF-8")

        return separator.join(parts)

    # No separator found: the whole line is a (possibly bare) domain.
    return line.encode("IDNA").decode("UTF-8")
# Helper Functions
def get_file_by_url(url):
"""
@ -1141,11 +1198,17 @@ def get_file_by_url(url):
url_data : str or None
The data retrieved at that URL from the file. Returns None if the
attempted retrieval is unsuccessful.
Note
----
- BeautifulSoup is used in this case to avoid having to search in which
format we have to encode or decode data before parsing it to UTF-8.
"""
try:
f = urlopen(url)
return f.read().decode("UTF-8")
soup = BeautifulSoup(f.read(), 'lxml').get_text()
return '\n'.join(list(map(domain_to_idna, soup.split('\n'))))
except Exception:
print("Problem getting file: ", url)
@ -1165,7 +1228,10 @@ def write_data(f, data):
if PY3:
f.write(bytes(data, "UTF-8"))
else:
f.write(str(data).encode("UTF-8"))
try:
f.write(str(data))
except UnicodeEncodeError:
f.write(str(data.encode("UTF-8")))
def list_dir_no_hidden(path):