hosts/updateHostsFile.py

#!/usr/bin/env python

# Script by Ben Limmer
# https://github.com/l1m5
#
# This simple Python script will combine all the hosts files you provide
# as sources into one unique hosts file to keep your internet browsing happy.

# Making Python 2 compatible with Python 3
from __future__ import absolute_import, division, print_function, unicode_literals

import os
import platform
import re
import string
import subprocess
import sys
import tempfile
import glob
# Zip-file support was removed because it was never actually used.
# StringIO is not needed in Python 3.
# urlopen lives in a different module in Python 3 (handled below).

# Supporting urlopen in Python 2 and Python 3
try:
	from urllib.parse import urlparse, urlencode
	from urllib.request import urlopen, Request
	from urllib.error import HTTPError
except ImportError:
	from urlparse import urlparse
	from urllib import urlencode
	from urllib2 import urlopen, Request, HTTPError

# This function handles both Python 2 and Python 3
def getFileByUrl(url):
    """Download `url` and return its body decoded as UTF-8 text.

    Works with whichever `urlopen` was imported (Python 2 or Python 3).
    The download is best-effort: on any error a message is printed and
    None is returned, so callers must be prepared for a None result.
    """
    try:
        response = urlopen(url)
        try:
            return response.read().decode("UTF-8")
        finally:
            # Python 2's urlopen result is not a context manager, so it is
            # closed by hand to avoid leaking the connection.
            response.close()
    except Exception:
        # "except Exception" instead of a bare "except:" so that Ctrl-C
        # (KeyboardInterrupt) and SystemExit are no longer swallowed.
        print ("Problem getting file: ", url)
        return None


# In Python 3 "print" is a function, so every print call below uses parentheses

# Detecting Python 3 for version-dependent implementations
# Python3 is True on any 3.x (or newer) interpreter and False on Python 2;
# the version-dependent helpers below consult this flag.
cur_version = sys.version_info
Python3 = cur_version >= (3, 0)

# This function works in both Python 2 and Python 3
def myInput(msg=""):
    """Show `msg` as a prompt and return what the interpreter's input call yields.

    Dispatches on the module-level Python3 flag: input() on Python 3,
    raw_input() on Python 2.
    """
    if not Python3:
        return raw_input(msg)
    return input(msg)


# Cross-python writing function
def writeData(f, data):
    """Write `data` to the file object `f` as UTF-8 encoded bytes.

    `f` is expected to accept bytes (every caller opens it in binary mode).
    `data` may be text (encoded as UTF-8) or bytes (written unchanged).

    The same code path now serves Python 2 and Python 3: the former
    Python 2 branch went through str(), which fails on non-ASCII text.
    """
    if isinstance(data, bytes):
        # Already encoded (on Python 2 this is the plain str type).
        f.write(data)
    else:
        f.write(data.encode('UTF-8'))

# This function doesn't list hidden files
def listdir_nohidden(path):
    """Return the non-hidden entries of `path`, as sorted full paths.

    The '*' glob pattern skips names starting with a dot. glob.glob()
    returns matches in arbitrary order, so the result is sorted to keep
    the order of the sources stable between runs and platforms.
    Returns an empty list when `path` does not exist.
    """
    return sorted(glob.glob(os.path.join(path, '*')))

# Project Settings
# Everything is located relative to the directory containing this script.
BASEDIR_PATH = os.path.dirname(os.path.realpath(__file__))

# Inputs: the data directory, the file names looked up inside each source,
# the list of sources found in the data directory, and the whitelist.
DATA_PATH = os.path.join(BASEDIR_PATH, "data")
DATA_FILENAMES = "hosts"
UPDATE_URL_FILENAME = "update.info"
SOURCES = listdir_nohidden(DATA_PATH)
WHITELIST_FILE = os.path.join(BASEDIR_PATH, "whitelist")

# Outputs: the generated hosts file and the readme rendered from its template.
HOSTS_FILE = os.path.join(BASEDIR_PATH, "hosts")
README_TEMPLATE = os.path.join(BASEDIR_PATH, "readme_template.md")
README_FILE = os.path.join(BASEDIR_PATH, "readme.md")

# Address that every normalized rule is pointed at.
TARGET_HOST = "0.0.0.0"

# Exclusions
# Regex prefix matching zero or more sub-domain labels ("ads.", "a.b.", ...);
# the domain to exclude is appended to the end of it.
# Raw string: \d and \. are regex escapes, not Python string escapes, and a
# plain literal triggers Python's invalid-escape-sequence warning.
EXCLUSION_PATTERN = r'([a-zA-Z\d-]+\.){0,}'
# Entries read from the whitelist file are appended here at run time
EXCLUSIONS = []

# Common domains to exclude
COMMON_EXCLUSIONS = ['hulu.com']

# Global vars
# Compiled patterns for the domains the user chose to exclude
exclusionRegexs = []
# Number of unique rules written to the generated hosts file
numberOfRules = 0

def main():
    """Run the whole interactive workflow, from prompting to installing.

    Order matters: the merged file is de-duplicated, given its header and
    closed before the whitelist is applied to it on disk; only then are
    the readme and the success message produced from the final count.
    """
    promptForUpdate()
    promptForExclusions()

    finalFile = removeDups(createInitialFile())
    finalizeFile(finalFile)
    excludeFromFile()
    updateReadme(numberOfRules)

    entryCount = "{:,}".format( numberOfRules )
    printSuccess('Success! Your shiny new hosts file has been prepared.\nIt contains ' + entryCount + ' unique entries.')

    promptForMove(finalFile)

# Exclusion from file
def excludeFromFile():
    """Remove whitelisted domains from the generated hosts file.

    Does nothing when WHITELIST_FILE does not exist. Otherwise every
    non-blank whitelist line is added to EXCLUSIONS, each rule line of
    HOSTS_FILE that ends with one of those entries is dropped, the global
    numberOfRules is decremented per dropped rule, and the
    "... unique entries" header line is rewritten with the new count.

    Changes from the previous implementation:
    - indentation is consistent (the mix of tabs and spaces was a TabError
      on Python 3);
    - whitelist entries are stripped and blank lines skipped; a blank line
      used to match, and therefore delete, every line of the hosts file;
    - comment lines are never removed or counted as rules;
    - files are handled with "with" and the hosts file is written once.

    NOTE(review): matching is a plain suffix test, as it effectively was
    before, so "hulu.com" also matches "nothulu.com".
    """
    global numberOfRules
    if not os.path.isfile(WHITELIST_FILE):
        return

    with open(WHITELIST_FILE, "r") as whitelist:
        for entry in whitelist:
            entry = entry.strip()
            if entry:
                EXCLUSIONS.append(entry)

    # Guard against empty or newline-terminated entries placed in EXCLUSIONS
    # by other code: an empty suffix would match every line.
    domains = [domain.strip() for domain in EXCLUSIONS]
    domains = [domain for domain in domains if domain]

    with open(HOSTS_FILE) as hosts:
        lines = hosts.readlines()

    kept = []
    for line in lines:
        rule = line.strip()
        isRule = bool(rule) and not rule.startswith('#')
        if isRule and any(rule.endswith(domain) for domain in domains):
            numberOfRules -= 1
            continue
        kept.append(line)

    # The header was written before the exclusions were applied, so its
    # entry count has to be refreshed.
    summary = '# Merging these sources produced ' + "{:,}".format( numberOfRules ) + ' unique entries\n'
    kept = [summary if 'unique entries' in line else line for line in kept]

    with open(HOSTS_FILE, 'w') as hosts:
        hosts.writelines(kept)

# Prompt the User
def promptForUpdate():
    """Ask whether the data sources should be refreshed, and refresh them on "yes"."""
    wantsUpdate = query_yes_no("Do you want to update all data sources?") == "yes"
    if not wantsUpdate:
        print ('OK, we\'ll stick with what we\'ve  got locally.')
        return
    updateAllSources()

def promptForExclusions():
    """Ask whether any domains should be excluded and start the exclusion dialog on "yes"."""
    question = ("Do you want to exclude any domains?\n" +
                "For example, hulu.com video streaming must be able to access " +
                "its tracking and ad servers in order to play video.")
    if query_yes_no(question) != "yes":
        print ('OK, we\'ll only exclude domains in the whitelist.')
        return
    displayExclusionOptions()

def promptForMoreCustomExclusions():
    """Return True when the user wants to enter another domain, False otherwise."""
    return query_yes_no("Do you have more domains you want to enter?") == "yes"

def promptForMove(finalFile):
    """Offer to install the generated hosts file in place of the system one.

    Returns False when the user declines; after a "yes" the file is moved
    into place and nothing (None) is returned.
    """
    answer = query_yes_no("Do you want to replace your existing hosts file with the newly generated file?")
    if answer != "yes":
        return False
    moveHostsFileIntoPlace(finalFile)
# End Prompt the User

# Exclusion logic
def displayExclusionOptions():
    """Offer each entry of COMMON_EXCLUSIONS for exclusion, then offer custom domains."""
    for candidate in COMMON_EXCLUSIONS:
        question = "Do you want to exclude the domain " + candidate + " ?"
        if query_yes_no(question) == "yes":
            excludeDomain(candidate)
    if query_yes_no("Do you want to exclude any other domains?") == "yes":
        gatherCustomExclusions()

def gatherCustomExclusions():
    """Keep reading domains to exclude from the user until they decline to add more.

    Entries rejected by isValidDomainFormat() are skipped, but the user is
    still asked whether they want to continue.
    """
    while True:
        # Cross-python input
        candidate = myInput("Enter the domain you want to exclude (e.g. facebook.com): ")
        if isValidDomainFormat(candidate):
            excludeDomain(candidate)
        if not promptForMoreCustomExclusions():
            break

def excludeDomain(domain):
    """Compile an exclusion regex for `domain` and add it to exclusionRegexs.

    The domain is escaped with re.escape() so that its dots match literal
    dots only; unescaped, "hulu.com" also matched e.g. "huluXcom".
    """
    exclusionRegexs.append(re.compile(EXCLUSION_PATTERN + re.escape(domain)))

def matchesExclusions(strippedRule):
    """Return True when the hostname of `strippedRule` matches any exclusion regex.

    `strippedRule` must contain at least two whitespace-separated fields;
    the second one is taken as the hostname.
    """
    hostField = strippedRule.split()[1]
    return any(pattern.search(hostField) for pattern in exclusionRegexs)
# End Exclusion Logic

# Update Logic
def updateAllSources():
    """Re-download the hosts data of every source that has an update URL.

    Sources without an update file are skipped. A source whose download
    fails is skipped too, leaving its local data untouched: getFileByUrl()
    returns None in that case, which previously crashed on .replace().
    """
    for source in SOURCES:
        updateURL = getUpdateURLFromFile(source)
        if updateURL is None:
            continue
        print ('Updating source ' + source + ' from ' + updateURL)
        # Cross-python call
        updatedFile = getFileByUrl(updateURL)
        if updatedFile is None:
            # The failure has already been reported by getFileByUrl().
            continue
        updatedFile = updatedFile.replace('\r', '') #get rid of carriage-return symbols

        # Binary mode: writeData() takes care of the encoding on both Pythons.
        with open(os.path.join(DATA_PATH, source, DATA_FILENAMES), 'wb') as dataFile:
            writeData(dataFile, updatedFile)

def getUpdateURLFromFile(source):
    """Return the update URL stored for `source`, or None if it has no update file.

    The URL is the first line of the source's UPDATE_URL_FILENAME file,
    with surrounding whitespace removed. A warning is printed when that
    file does not exist.
    """
    pathToUpdateFile = os.path.join(DATA_PATH, source, UPDATE_URL_FILENAME)
    if not os.path.exists(pathToUpdateFile):
        printFailure('Warning: Can\'t find the update file for source ' + source + '\n' +
                     'Make sure that there\'s a file at ' + pathToUpdateFile)
        return None

    updateFile = open(pathToUpdateFile, 'r')
    firstLine = updateFile.readline()
    updateFile.close()
    return firstLine.strip()
# End Update Logic

# File Logic
def createInitialFile():
    """Concatenate the data file of every source into one temporary file.

    Each source's content is wrapped in "# Begin <source>" and
    "# End <source>" marker lines. Returns the still-open
    NamedTemporaryFile; the caller has to rewind it before reading.

    Each source file is now closed as soon as it has been copied;
    previously the handles were never closed.
    """
    mergeFile = tempfile.NamedTemporaryFile()
    for source in SOURCES:
        with open(os.path.join(DATA_PATH, source, DATA_FILENAMES), 'r') as curFile:
            #Done in a cross-python way
            writeData(mergeFile, '\n# Begin ' + source + '\n')
            writeData(mergeFile, curFile.read())
            writeData(mergeFile, '\n# End ' + source + '\n')

    return mergeFile

def removeDups(mergeFile):
    """Write the de-duplicated, normalized rules of `mergeFile` to the hosts file.

    Lines starting with '#' or with a whitespace character are copied
    through unchanged. Every other line is stripped, checked against the
    exclusion regexes, normalized, and written only if its hostname has
    not been seen yet. The global numberOfRules is incremented for each
    rule written. `mergeFile` is closed; the returned hosts file is left
    open so that the header can be added.
    """
    global numberOfRules

    # 'w+b' creates the file if needed and truncates it, while still letting
    # writeOpeningHeader() read it back. With 'r+b' a missing hosts file
    # raised an error and the tail of a longer previous hosts file survived
    # into the new one.
    finalFile = open(os.path.join(BASEDIR_PATH, 'hosts'), 'w+b')
    mergeFile.seek(0) # reset file pointer

    hostnames = set()
    hostnames.add("localhost")
    for line in mergeFile.readlines():
        # Explicit decoding: the merge file is read back as bytes
        line = line.decode("UTF-8")
        # NOTE(review): only the FIRST character is tested, so a rule that
        # starts with whitespace is copied verbatim instead of being
        # normalized and de-duplicated - confirm whether that is intended.
        if line[0] == '#' or re.match(r'^\s*$', line[0]):
            # Cross-python write
            writeData(finalFile, line)
            continue

        strippedRule = stripRule(line) #strip comments
        if matchesExclusions(strippedRule):
            continue
        hostname, normalizedRule = normalizeRule(strippedRule) # normalize rule

        # normalizedRule is None when the line could not be parsed as a rule
        if normalizedRule and (hostname not in hostnames):
            writeData(finalFile, normalizedRule)
            hostnames.add(hostname)
            numberOfRules += 1

    mergeFile.close()

    return finalFile

def normalizeRule(rule):
    """Parse a hosts rule and rebuild it with TARGET_HOST as its address.

    Returns (hostname, normalizedRule), where hostname is lowercased and
    normalizedRule is a newline-terminated line. Anything following the
    hostname is kept as a trailing comment. Returns (None, None), after
    printing the offending text, when `rule` is not of the form
    "<dotted IPv4 address> <hostname>".
    """
    result = re.search(r'^[ \t]*(\d+\.\d+\.\d+\.\d+)\s+([\w\.-]+)(.*)',rule)
    if result:
        target, hostname, suffix = result.groups()
        hostname = hostname.lower() # explicitly lowercase hostname
        # Truthiness instead of "is not ''": identity comparison with a
        # literal only works by accident of string interning.
        if suffix:
            # add suffix as comment only, not as a separate host
            return hostname, "%s %s #%s\n" % (TARGET_HOST, hostname, suffix)
        else:
            return hostname, "%s %s\n" % (TARGET_HOST, hostname)
    print ('==>%s<==' % rule)
    return None, None

def finalizeFile(finalFile):
    """Add the opening header to `finalFile` and close it.

    The file is closed even when writing the header fails, so the handle
    is not left open on error.
    """
    try:
        writeOpeningHeader(finalFile)
    finally:
        finalFile.close()

# Some sources put comments around their rules, for accuracy we need to strip them
# the comments are preserved in the output hosts file
def stripRule(line):
    """Return the first two whitespace-separated fields of `line`, joined by one space.

    This drops anything a source put after "<address> <hostname>".
    A line with fewer than two fields aborts the program with a failure
    message and a non-zero exit status (it used to exit with status 0,
    which signals success to the calling shell).
    """
    splitLine = line.split()
    if len(splitLine) < 2:
        # str() because of the differences between the bytes and string types in Python 3
        printFailure('A line in the hostfile is going to cause problems because it is nonstandard\n' +
                     'The line reads ' + str(line) + ' please check your data files. Maybe you have a comment without a #?')
        sys.exit(1)
    return splitLine[0] + ' ' + splitLine[1]

def writeOpeningHeader(finalFile):
    """Put the descriptive header in front of what `finalFile` already contains.

    The current content is read into memory, then the header, the
    localhost entry, the optional "myhosts" preamble from the script
    directory and finally the saved content are written from the start
    of the file. `finalFile` must be open for both reading and writing.
    """
    finalFile.seek(0) # rewind to read what has been written so far
    fileContents = finalFile.read()
    finalFile.seek(0) # rewind again: the header goes at the top

    headerLines = [
        '# This file is a merged collection of hosts from reputable sources,\n',
        '# with a dash of crowd sourcing via Github\n#\n',
        '# Project home page: https://github.com/StevenBlack/hosts\n#\n',
        '# Current sources:\n',
    ]
    headerLines.extend('#    ' + source + '\n' for source in SOURCES)
    headerLines.extend([
        '#\n',
        '# Merging these sources produced ' + "{:,}".format( numberOfRules ) + ' unique entries\n',
        '# ===============================================================\n',
        '\n',
        '127.0.0.1 localhost\n',
        '\n',
    ])
    for headerLine in headerLines:
        writeData(finalFile, headerLine)

    # Optional user-maintained entries that go ahead of the merged rules
    preamble = os.path.join(BASEDIR_PATH, "myhosts")
    if os.path.isfile(preamble):
        with open(preamble, "r") as f:
            writeData(finalFile, f.read())

    finalFile.write(fileContents)

def updateReadme(numberOfRules):
    """Render README_FILE from README_TEMPLATE.

    Every "@NUM_ENTRIES@" placeholder is replaced by `numberOfRules`,
    formatted with thousands separators. Both files are closed when done;
    the template handle used to be left open.
    """
    formattedCount = "{:,}".format( numberOfRules )
    with open(README_TEMPLATE) as template:
        with open(README_FILE, "wt") as out:
            for line in template:
                out.write(line.replace('@NUM_ENTRIES@', formattedCount))

def moveHostsFileIntoPlace(finalFile):
    """Install the generated hosts file as the system hosts file.

    On POSIX systems the file is copied to /etc/hosts through sudo and
    the DNS cache is flushed (mDNSResponder on macOS, nscd elsewhere if
    its init script exists). On Windows only instructions are printed.
    `finalFile` is only used for its `name` attribute.
    """
    if os.name == 'posix':
        print ('Moving the file requires administrative privileges. You might need to enter your password.')
        # subprocess.call returns the exit status: non-zero means failure
        if subprocess.call(["/usr/bin/sudo", "cp", os.path.abspath(finalFile.name), "/etc/hosts"]):
            printFailure("Moving the file failed.")
        print ('Flushing the DNS Cache to utilize new hosts file...')
        if platform.system() == 'Darwin':
            if subprocess.call(["/usr/bin/sudo", "killall", "-HUP", "mDNSResponder"]):
                printFailure("Flushing the DNS Cache failed.")
        else:
            if os.path.isfile("/etc/rc.d/init.d/nscd"):
                if subprocess.call(["/usr/bin/sudo", "/etc/rc.d/init.d/nscd", "restart"]):
                    printFailure("Flushing the DNS Cache failed.")
    elif os.name == 'nt':
        print ('Automatically moving the hosts file in place is not yet supported.')
        # Raw string: the backslashes of the Windows path are literal, not
        # (invalid) escape sequences. The printed text is unchanged.
        print (r'Please move the generated file to %SystemRoot%\system32\drivers\etc\hosts')

# End File Logic

# Helper Functions
## {{{ http://code.activestate.com/recipes/577058/ (r2)
def query_yes_no(question, default="yes"):
    """Ask a yes/no question on the terminal and return the answer.

    "question" is the text shown to the user, colorized as a prompt.
    "default" is the answer assumed when the user just hits <Enter>.
        It must be "yes" (the default), "no" or None (meaning
        an answer is required of the user).

    Returns "yes" or "no". Raises ValueError for any other "default".
    The question is repeated until a recognized answer is given.
    """
    if default is None:
        hint = " [y/n] "
    elif default == "yes":
        hint = " [Y/n] "
    elif default == "no":
        hint = " [y/N] "
    else:
        raise ValueError("invalid default answer: '%s'" % default)

    # Accepted spellings, mapped to the canonical answer
    answers = {"yes":"yes", "y":"yes", "ye":"yes",
               "no":"no", "n":"no"}

    while True:
        sys.stdout.write(colorize(question, colors.PROMPT) + hint)
        # Cross-python input
        reply = myInput().lower()
        if reply == '' and default is not None:
            return default
        if reply in answers:
            return answers[reply]
        printFailure("Please respond with 'yes' or 'no' "\
                         "(or 'y' or 'n').\n")
## end of http://code.activestate.com/recipes/577058/ }}}

def isValidDomainFormat(domain):
    """Return True when `domain` looks like a bare domain name.

    Rejects (with an explanatory message) an empty string, a leading
    "www."/"www<digits>." label and a leading "http://" or "https://"
    scheme. The scheme test requires "://" so that real domains which
    merely start with "http" (e.g. "httpbin.org") are accepted.
    """
    if domain == '':
        print ("You didn\'t enter a domain. Try again.")
        return False
    # Raw string so that \d reaches the regex engine as written
    domainRegex = re.compile(r"www\d{0,3}[.]|https?://")
    if domainRegex.match(domain):
        print ("The domain " + domain + " is not valid. Do not include www.domain.com or http(s)://domain.com. Try again.")
        return False
    return True

# Colors
class colors:
    """ANSI escape sequences used to colorize terminal output."""
    # Blue, for questions put to the user
    PROMPT = '\033[94m'
    # Green, for success messages
    SUCCESS = '\033[92m'
    # Red, for warnings and failures
    FAIL = '\033[91m'
    # Resets the terminal to its default color
    ENDC = '\033[0m'

def colorize(text, color):
    """Return `text` preceded by the `color` code and followed by colors.ENDC."""
    reset = colors.ENDC
    return color + text + reset

def printSuccess(text):
    """Print `text` in the success color."""
    message = colorize(text, colors.SUCCESS)
    print (message)

def printFailure(text):
    """Print `text` in the failure color."""
    message = colorize(text, colors.FAIL)
    print (message)
# End Helper Functions

# Run the interactive workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
	main()