hosts/updateHostsFile.py

#!/usr/bin/env python

# Script by Ben Limmer
# https://github.com/l1m5
#
# This simple Python script will combine all the host files you provide
# as sources into one, unique host file to keep you internet browsing happy.

import os
import re
import string
import sys
import tempfile
import urllib2

# Project Settings
BASEDIR_PATH = os.path.dirname(os.path.realpath(__file__))
DATA_PATH = BASEDIR_PATH + '/data'
DATA_FILENAMES = 'hosts'
UPDATE_URL_FILENAME = 'update.info'
SOURCES = os.listdir(DATA_PATH)
README_TEMPLATE = BASEDIR_PATH + '/readme_template.md'
README_FILE = BASEDIR_PATH + '/readme.md'

# Exclusions
EXCLUSION_PATTERN = '([a-zA-Z\d-]+\.){0,}' #append domain the end

# Common domains to exclude
COMMON_EXCLUSIONS = ['hulu.com']

# Global vars
exclusionRegexs = []
numberOfRules = 0

def main():
	promptForUpdate()
	promptForExclusions()
	mergeFile = createInitialFile()
	finalFile = removeDups(mergeFile)
	finalizeFile(finalFile)
	updateReadme(numberOfRules)
	printSuccess('Success! Your shiny new hosts file has been prepared.\nIt contains ' + str(numberOfRules) + ' unique entries.')
	print 'Copy the generated file to /etc/hosts or %SystemRoot%\system32\drivers\etc\hosts'

# Prompt the User
def promptForUpdate():
	response = query_yes_no("Do you want to update all data sources?")
	if (response == "yes"):
		updateAllSources()
	else:
		print 'OK, we\'ll stick with what we\'ve  got locally.'

def promptForExclusions():
	response = query_yes_no("Do you want to exclude any domains?\n" +
							"For example, hulu.com video streaming must be able to access " +
							"its tracking and ad servers in order to play video.")
	if (response == "yes"):
		displayExclusionOptions()
	else:
		print 'OK, we won\'t exclude any domains.'

def promptForMoreCustomExclusions():
	response = query_yes_no("Do you have more domains you want to enter?")
	if (response == "yes"):
		return True
	else:
		return False
# End Prompt the User

# Exclusion logic
def displayExclusionOptions():
	for exclusionOption in COMMON_EXCLUSIONS:
		response = query_yes_no("Do you want to exclude the domain " + exclusionOption + " ?")
		if (response == "yes"):
			excludeDomain(exclusionOption)
		else:
			continue
	response = query_yes_no("Do you want to exclude any other domains?")
	if (response == "yes"):
		gatherCustomExclusions()

def gatherCustomExclusions():
	while True:
		domainFromUser = raw_input("Enter the domain you want to exclude (e.g. facebook.com): ")
		if (isValidDomainFormat(domainFromUser)):
			excludeDomain(domainFromUser)
		if (promptForMoreCustomExclusions() == False):
			return

def excludeDomain(domain):
	exclusionRegexs.append(re.compile(EXCLUSION_PATTERN + domain))

def matchesExclusions(strippedRule):
	strippedDomain = strippedRule.split()[1]
	for exclusionRegex in exclusionRegexs:
		if exclusionRegex.search(strippedDomain):
			return True
	return False
# End Exclusion Logic

# Update Logic
def updateAllSources():
	for source in SOURCES:
		updateURL = getUpdateURLFromFile(source)
		if (updateURL == None):
			continue;
		print 'Updating source ' + source + ' from ' + updateURL
		updatedFile = urllib2.urlopen(updateURL)
		updatedFile = updatedFile.read()
		updatedFile = string.replace( updatedFile, '\r', '' ) #get rid of carriage-return symbols

		dataFile   = open(DATA_PATH + '/' + source + '/' + DATA_FILENAMES, 'w')
		dataFile.write(updatedFile)
		dataFile.close()

def getUpdateURLFromFile(source):
	pathToUpdateFile = DATA_PATH + '/' + source + '/' + UPDATE_URL_FILENAME
	if os.path.exists(pathToUpdateFile):
		updateFile = open(pathToUpdateFile, 'r')
		retURL = updateFile.readline().strip()
		updateFile.close()
	else:
		retURL = None
		printFailure('Warning: Can\'t find the update file for source ' + source + '\n' +
					 'Make sure that there\'s a file at ' + pathToUpdateFile)
	return retURL
# End Update Logic

# File Logic
def createInitialFile():
	mergeFile = tempfile.NamedTemporaryFile()
	for source in SOURCES:
		curFile = open(DATA_PATH + '/' + source +'/' + DATA_FILENAMES, 'r')
		mergeFile.write('\n# Begin ' + source + '\n')
		mergeFile.write(curFile.read())
		mergeFile.write('\n# End ' + source + '\n')
	return mergeFile

def removeDups(mergeFile):
	global numberOfRules

	finalFile = open(BASEDIR_PATH + '/hosts', 'w+b')
	mergeFile.seek(0) # reset file pointer

	rules_seen = set()
	for line in mergeFile.readlines():
		if line[0].startswith("#") or line[0] == '\n':
			finalFile.write(line) #maintain the comments for readability
			continue
		strippedRule = stripRule(line) #strip comments
		if matchesExclusions(strippedRule):
			continue
		if strippedRule not in rules_seen:
			finalFile.write(line)
			rules_seen.add(strippedRule)
			numberOfRules += 1

	mergeFile.close()

	return finalFile

def finalizeFile(finalFile):
	writeOpeningHeader(finalFile)
	finalFile.close()

# Some sources put comments around their rules, for accuracy we need to strip them
# the comments are preserved in the output hosts file
def stripRule(line):
	splitLine = line.split()
	if (len(splitLine) < 2) :
		printFailure('A line in the hostfile is going to cause problems because it is nonstandard\n' +
					 'The line reads ' + line + ' please check your data files. Maybe you have a comment without a #?')
		sys.exit()
	return splitLine[0] + ' ' + splitLine[1]

def writeOpeningHeader(finalFile):
	global numberOfRules
	finalFile.seek(0) #reset file pointer
	fileContents = finalFile.read(); #save content
	finalFile.seek(0) #write at the top
	finalFile.write('# This file is a merged collection of hosts from reputable sources,\n')
	finalFile.write('# with a dash of crowd sourcing via Github\n#\n')
	finalFile.write('# Project home page: https://github.com/StevenBlack/hosts\n#\n')
	finalFile.write('# Current sources:\n')
	for source in SOURCES:
		finalFile.write('#    ' + source + '\n')
	finalFile.write('#\n')
	finalFile.write('# Merging these sources produced ' + str(numberOfRules) + ' unique entries\n')
	finalFile.write('# ===============================================================\n')
	finalFile.write(fileContents)

def updateReadme(numberOfRules):
	with open(README_FILE, "wt") as out:
		for line in open(README_TEMPLATE):
			out.write(line.replace('@NUM_ENTRIES@', str(numberOfRules)))

# End File Logic

# Helper Functions
## {{{ http://code.activestate.com/recipes/577058/ (r2)
def query_yes_no(question, default="yes"):
    """Ask a yes/no question via raw_input() and return their answer.
    
    "question" is a string that is presented to the user.
    "default" is the presumed answer if the user just hits <Enter>.
        It must be "yes" (the default), "no" or None (meaning
        an answer is required of the user).

    The "answer" return value is one of "yes" or "no".
    """
    valid = {"yes":"yes",   "y":"yes",  "ye":"yes",
             "no":"no",     "n":"no"}
    if default == None:
        prompt = " [y/n] "
    elif default == "yes":
        prompt = " [Y/n] "
    elif default == "no":
        prompt = " [y/N] "
    else:
        raise ValueError("invalid default answer: '%s'" % default)

    while 1:
        sys.stdout.write(colorize(question, colors.PROMPT) + prompt)
        choice = raw_input().lower()
        if default is not None and choice == '':
            return default
        elif choice in valid.keys():
            return valid[choice]
        else:
            printFailure("Please respond with 'yes' or 'no' "\
                             "(or 'y' or 'n').\n")
## end of http://code.activestate.com/recipes/577058/ }}}

def isValidDomainFormat(domain):
	if (domain == ''):
		print "You didn\'t enter a domain. Try again."
		return False
	domainRegex = re.compile("www\d{0,3}[.]|https?")
	if (domainRegex.match(domain)):
		print "The domain " + domain + " is not valid. Do not include www.domain.com or http(s)://domain.com. Try again."
		return False
	else:
		return True

# Colors
class colors:
    PROMPT 	= '\033[94m'
    SUCCESS = '\033[92m'
    FAIL 	= '\033[91m'
    ENDC 	= '\033[0m'

def colorize(text, color):
	return color + text + colors.ENDC

def printSuccess(text):
	print colorize(text, colors.SUCCESS)

def printFailure(text):
	print colorize(text, colors.FAIL)
# End Helper Functions

if __name__ == "__main__":
	main()