hosts/updateHostsFile.py

264 lines
8.1 KiB
Python
Raw Normal View History

#!/usr/bin/env python
# Script by Ben Limmer
# https://github.com/l1m5
#
# This simple Python script will combine all the host files you provide
# as sources into one unique host file to keep your internet browsing happy.
import os
import re
import string
import sys
import tempfile
import urllib2
# Project Settings
# Directory containing this script; every other path is derived from it.
BASEDIR_PATH = os.path.dirname(os.path.realpath(__file__))
# Directory holding one sub-directory per hosts source.
DATA_PATH = BASEDIR_PATH + '/data'
# Name of the hosts file stored inside each source directory.
DATA_FILENAMES = 'hosts'
# Name of the per-source file whose first line is the download URL.
UPDATE_URL_FILENAME = 'update.info'
# Every entry found in DATA_PATH is treated as a source.
SOURCES = os.listdir(DATA_PATH)
README_TEMPLATE = BASEDIR_PATH + '/readme_template.md'
README_FILE = BASEDIR_PATH + '/readme.md'

# Exclusions
# Regex prefix matching zero or more dot-terminated labels; the domain to
# exclude is appended to the end of this pattern. Written as a raw string so
# the backslashes reach the regex engine untouched.
EXCLUSION_PATTERN = r'([a-zA-Z\d-]+\.){0,}'

# Common domains to exclude
COMMON_EXCLUSIONS = ['hulu.com']

# Global vars
exclusionRegexs = []  # compiled exclusion patterns, filled by excludeDomain
numberOfRules = 0  # number of unique rules written, updated by removeDups
def main():
    """Run the interactive workflow that builds the merged hosts file.

    Optionally refreshes the sources and collects exclusions, then merges the
    source files, drops duplicate rules, prepends the header, regenerates the
    readme and reports how many unique entries were written.
    """
    promptForUpdate()
    promptForExclusions()

    dedupedFile = removeDups(createInitialFile())
    finalizeFile(dedupedFile)
    updateReadme(numberOfRules)

    summary = ('Success! Your shiny new hosts file has been prepared.\nIt contains '
               + str(numberOfRules) + ' unique entries.')
    printSuccess(summary)
    print(r'Copy the generated file to /etc/hosts or %SystemRoot%\system32\drivers\etc\hosts')
# Prompt the User
def promptForUpdate():
    """Ask whether to refresh every data source and act on the answer."""
    wantsUpdate = query_yes_no("Do you want to update all data sources?") == "yes"
    if not wantsUpdate:
        print("OK, we'll stick with what we've got locally.")
        return
    updateAllSources()
def promptForExclusions():
    """Ask whether any domains should be excluded and start that dialogue."""
    question = ("Do you want to exclude any domains?\n"
                "For example, hulu.com video streaming must be able to access "
                "its tracking and ad servers in order to play video.")
    if query_yes_no(question) != "yes":
        print("OK, we won't exclude any domains.")
        return
    displayExclusionOptions()
def promptForMoreCustomExclusions():
    """Return True when the user wants to enter another domain to exclude."""
    return query_yes_no("Do you have more domains you want to enter?") == "yes"
# End Prompt the User
# Exclusion logic
def displayExclusionOptions():
    """Offer each common exclusion, then optionally collect custom ones."""
    for commonDomain in COMMON_EXCLUSIONS:
        answer = query_yes_no("Do you want to exclude the domain " + commonDomain + " ?")
        if answer == "yes":
            excludeDomain(commonDomain)

    # Asked once, after every common exclusion has been offered.
    if query_yes_no("Do you want to exclude any other domains?") == "yes":
        gatherCustomExclusions()
def gatherCustomExclusions():
    """Prompt for domains to exclude until the user has no more to enter.

    An entry that fails isValidDomainFormat is not registered, but the user is
    still asked whether they want to enter another one.
    """
    keepAsking = True
    while keepAsking:
        candidate = raw_input("Enter the domain you want to exclude (e.g. facebook.com): ")
        if isValidDomainFormat(candidate):
            excludeDomain(candidate)
        keepAsking = promptForMoreCustomExclusions()
def excludeDomain(domain):
    """Register *domain* as excluded from the generated hosts file.

    The compiled pattern is EXCLUSION_PATTERN followed by the literal domain.
    The domain is passed through re.escape so that its dots (and any other
    regex metacharacters the user typed) match only themselves; without that,
    'hulu.com' would also match strings such as 'huluXcom'.
    """
    exclusionRegexs.append(re.compile(EXCLUSION_PATTERN + re.escape(domain)))
def matchesExclusions(strippedRule):
    """Return True when the rule's host matches any registered exclusion.

    Only the second whitespace-separated field of strippedRule is tested.
    """
    hostname = strippedRule.split()[1]
    return any(pattern.search(hostname) for pattern in exclusionRegexs)
# End Exclusion Logic
# Update Logic
def updateAllSources():
    """Download a fresh hosts file for every source that declares an update URL.

    For each entry in SOURCES the URL is read via getUpdateURLFromFile; sources
    without a usable URL are skipped. The downloaded content has its
    carriage-return characters removed and overwrites the source's local
    hosts file.
    """
    for source in SOURCES:
        updateURL = getUpdateURLFromFile(source)
        if updateURL is None:
            # getUpdateURLFromFile has already printed a warning.
            continue
        if not updateURL:
            # An empty first line would make urlopen raise; skip it instead.
            printFailure('Warning: The update file for source ' + source + ' contains no URL')
            continue
        print('Updating source ' + source + ' from ' + updateURL)

        response = urllib2.urlopen(updateURL)
        try:
            updatedData = response.read()
        finally:
            # Release the connection even when the read fails.
            response.close()

        updatedData = updatedData.replace('\r', '')  # get rid of carriage-return symbols
        with open(DATA_PATH + '/' + source + '/' + DATA_FILENAMES, 'w') as dataFile:
            dataFile.write(updatedData)
def getUpdateURLFromFile(source):
    """Return the update URL recorded for *source*, or None if there is none.

    The URL is the stripped first line of the source's UPDATE_URL_FILENAME
    file. When that file does not exist a warning is printed and None is
    returned.
    """
    infoPath = DATA_PATH + '/' + source + '/' + UPDATE_URL_FILENAME
    if not os.path.exists(infoPath):
        printFailure('Warning: Can\'t find the update file for source ' + source + '\n' +
                     'Make sure that there\'s a file at ' + infoPath)
        return None

    infoFile = open(infoPath, 'r')
    firstLine = infoFile.readline()
    infoFile.close()
    return firstLine.strip()
# End Update Logic
# File Logic
def createInitialFile():
    """Concatenate every source's hosts file into one temporary file.

    Each source's content is wrapped in '# Begin <source>' / '# End <source>'
    marker lines. The temporary file is returned still open; the caller is
    responsible for rewinding and closing it.
    """
    mergeFile = tempfile.NamedTemporaryFile()
    for source in SOURCES:
        # The context manager closes each source file as soon as it is copied.
        with open(DATA_PATH + '/' + source + '/' + DATA_FILENAMES, 'r') as curFile:
            mergeFile.write('\n# Begin ' + source + '\n')
            mergeFile.write(curFile.read())
            mergeFile.write('\n# End ' + source + '\n')
    return mergeFile
def removeDups(mergeFile):
    """Copy mergeFile into BASEDIR_PATH/hosts, dropping duplicate and excluded rules.

    Comment lines and blank lines are copied unchanged. Every other line is
    reduced with stripRule; it is written only if it does not match an
    exclusion and its stripped form has not been seen before. The global
    numberOfRules is incremented for each rule written.

    mergeFile is rewound before reading and closed afterwards. The output
    file is returned still open so the header can be added by the caller.
    """
    global numberOfRules
    finalFile = open(BASEDIR_PATH + '/hosts', 'w+b')
    mergeFile.seek(0)  # reset file pointer
    rules_seen = set()
    for line in mergeFile:
        content = line.strip()
        # Test the stripped text so that whitespace-only lines, '\r\n' blank
        # lines and indented comments are passed through instead of reaching
        # stripRule, which would abort the whole run.
        if not content or content.startswith('#'):
            finalFile.write(line)  # maintain the comments for readability
            continue
        strippedRule = stripRule(line)  # strip comments
        if matchesExclusions(strippedRule):
            continue
        if strippedRule not in rules_seen:
            finalFile.write(line)
            rules_seen.add(strippedRule)
            numberOfRules += 1
    mergeFile.close()
    return finalFile
def finalizeFile(finalFile):
    """Prepend the opening header to finalFile, then close it."""
    writeOpeningHeader(finalFile)
    finalFile.close()
# Some sources put comments around their rules; for accuracy we need to strip them.
# The comments are preserved in the output hosts file.
def stripRule(line):
    """Reduce a hosts line to its first two whitespace-separated fields.

    Returns the two fields joined by a single space, which discards anything
    that follows them on the line. A line with fewer than two fields is
    reported with printFailure and the script exits.
    """
    fields = line.split()
    if len(fields) >= 2:
        return ' '.join(fields[:2])

    printFailure('A line in the hostfile is going to cause problems because it is nonstandard\n' +
                 'The line reads ' + line + ' please check your data files. Maybe you have a comment without a #?')
    sys.exit()
def writeOpeningHeader(finalFile):
    """Insert the descriptive header at the top of finalFile.

    The current content is read into memory, the header (project note, list
    of SOURCES and the numberOfRules count) is written from the start of the
    file, and the saved content is written back after it.
    """
    finalFile.seek(0)  # reset file pointer
    savedBody = finalFile.read()  # save content

    headerParts = [
        '# This file is a merged collection of hosts from reputable sources,\n',
        '# with a dash of crowd sourcing via Github\n#\n',
        '# Project home page: https://github.com/StevenBlack/hosts\n#\n',
        '# Current sources:\n',
    ]
    headerParts.extend('# ' + source + '\n' for source in SOURCES)
    headerParts.append('#\n')
    headerParts.append('# Merging these sources produced ' + str(numberOfRules) + ' unique entries\n')
    headerParts.append('# ===============================================================\n')

    finalFile.seek(0)  # write at the top
    finalFile.write(''.join(headerParts))
    finalFile.write(savedBody)
def updateReadme(numberOfRules):
    """Regenerate README_FILE from README_TEMPLATE.

    Every occurrence of the '@NUM_ENTRIES@' placeholder is replaced with
    numberOfRules.

    The template is read completely before the readme is opened for writing,
    so a missing or unreadable template raises before the existing readme is
    truncated, and the template handle is always closed.
    """
    with open(README_TEMPLATE) as template:
        templateLines = template.readlines()

    entryCount = str(numberOfRules)
    with open(README_FILE, "wt") as out:
        for line in templateLines:
            out.write(line.replace('@NUM_ENTRIES@', entryCount))
# End File Logic
# Helper Functions
## {{{ http://code.activestate.com/recipes/577058/ (r2)
def query_yes_no(question, default="yes"):
    """Ask a yes/no question via raw_input() and return the answer.

    "question" is the text shown to the user.
    "default" is the answer assumed when the user just hits <Enter>.
        It must be "yes" (the default), "no" or None (meaning
        an answer is required of the user).

    Returns either "yes" or "no". Raises ValueError for any other default.
    """
    answers = {"yes": "yes", "y": "yes", "ye": "yes",
               "no": "no", "n": "no"}
    promptFor = {None: " [y/n] ", "yes": " [Y/n] ", "no": " [y/N] "}

    try:
        prompt = promptFor[default]
    except (KeyError, TypeError):
        raise ValueError("invalid default answer: '%s'" % default)

    # Keep asking until a recognised answer (or an empty one, if allowed).
    while True:
        sys.stdout.write(colorize(question, colors.PROMPT) + prompt)
        choice = raw_input().lower()
        if choice == '' and default is not None:
            return default
        if choice in answers:
            return answers[choice]
        printFailure("Please respond with 'yes' or 'no' (or 'y' or 'n').\n")
## end of http://code.activestate.com/recipes/577058/ }}}
def isValidDomainFormat(domain):
    """Return True when *domain* looks like a bare domain name.

    Rejects, with an explanatory message, an empty string, a name starting
    with a 'www'/'wwwN.' prefix, and a URL starting with 'http://' or
    'https://'. The scheme check requires the '://' so that legitimate
    domains that merely begin with "http" (e.g. httpbin.org) are accepted.
    """
    if domain == '':
        print("You didn't enter a domain. Try again.")
        return False

    if re.match(r"www\d{0,3}[.]|https?://", domain):
        print("The domain " + domain + " is not valid. Do not include www.domain.com or http(s)://domain.com. Try again.")
        return False

    return True
# Colors
class colors:
    """Terminal escape sequences used to colour the script's output."""
    ENDC = '\033[0m'      # appended after coloured text by colorize
    FAIL = '\033[91m'     # used by printFailure
    SUCCESS = '\033[92m'  # used by printSuccess
    PROMPT = '\033[94m'   # used for yes/no questions
def colorize(text, color):
    """Return *text* preceded by *color* and followed by colors.ENDC."""
    return ''.join((color, text, colors.ENDC))
def printSuccess(text):
    """Print *text* wrapped in the SUCCESS colour."""
    message = colorize(text, colors.SUCCESS)
    print(message)
def printFailure(text):
    """Print *text* wrapped in the FAIL colour."""
    message = colorize(text, colors.FAIL)
    print(message)
# End Helper Functions
# Start the interactive workflow only when the file is run as a script.
if __name__ == "__main__":
    main()