#!/usr/bin/env python

# Script by Ben Limmer
# https://github.com/l1m5
#
# This Python script will combine all the host files you provide
# as sources into one unique hosts file to keep your internet browsing happy.

# Making Python 2 compatible with Python 3
from __future__ import absolute_import, division, print_function, unicode_literals

import os
import platform
import re
import string
import subprocess
import sys
import tempfile
import glob
# zip files are not used actually, support deleted
# StringIO is not needed in Python 3
# Python 3 works differently with urlopen

# Supporting urlopen in Python 2 and Python 3
try:
	from urllib.parse import urlparse, urlencode
	from urllib.request import urlopen, Request
	from urllib.error import HTTPError
except ImportError:
	from urlparse import urlparse
	from urllib import urlencode
	from urllib2 import urlopen, Request, HTTPError

# This function handles both Python 2 and Python 3
def getFileByUrl(url):
	"""Download `url` and return its content decoded as UTF-8 text.

	Returns None, after printing a diagnostic, when the download or the
	decoding fails; callers must check for None before using the result.
	"""
	try:
		f = urlopen( url )
		try:
			return f.read().decode( "UTF-8" )
		finally:
			# Explicit close releases the connection even if read/decode fails.
			f.close()
	except Exception:
		# Exception rather than a bare except, so that Ctrl-C and SystemExit
		# are not reported as download problems.
		print ( "Problem getting file: ", url )
		return None

# In Python 3, "print" is a function, so parentheses are added everywhere

# Detecting Python 3 for version-dependent implementations
# Python3 is True when running on Python 3 or newer; the cross-python
# helpers in this file branch on it.
cur_version = sys.version_info
Python3     = cur_version >= ( 3, 0 )

# This function works in both Python 2 and Python 3
def myInput( msg = "" ):
	"""Prompt with `msg` and return the line typed by the user.

	Uses input() on Python 3 and raw_input() on Python 2.
	"""
	# Only the selected branch is evaluated, so the name raw_input is never
	# looked up on Python 3.
	reader = input if Python3 else raw_input
	return reader( msg )


# Cross-python writing function
def writeData( f, data ):
	"""Write `data` to the binary-mode file object `f` as UTF-8 bytes.

	Text is encoded as UTF-8 before writing; data that is already a byte
	string is written unchanged. The type check behaves the same on
	Python 2 and Python 3, so no interpreter-version switch is needed and
	non-ASCII text no longer fails on Python 2 (where str() of a unicode
	object attempts an ASCII encode).
	"""
	if not isinstance( data, bytes ):
		data = data.encode( 'UTF-8' )
	f.write( data )

# This function doesn't list hidden files
def listdir_nohidden( path ):
	"""Return the entries directly under `path`, each joined with `path`.

	The '*' glob pattern does not match names starting with a dot, so
	hidden files are left out.
	"""
	visiblePattern = os.path.join( path, '*' )
	return glob.glob( visiblePattern )

# Project Settings
# Every location is resolved relative to the directory holding this script.
BASEDIR_PATH = os.path.dirname( os.path.realpath( __file__ ))

# The readme is regenerated from its template by updateReadme().
README_TEMPLATE = os.path.join( BASEDIR_PATH, 'readme_template.md' )
README_FILE = os.path.join( BASEDIR_PATH, 'readme.md' )

# Optional file read by removeDupsAndExcl() to fill EXCLUSIONS.
WHITELIST_FILE = os.path.join( BASEDIR_PATH, 'whitelist' )

# Each entry under data/ is one source: its rules live in DATA_FILENAMES and
# its download URL in UPDATE_URL_FILENAME.
DATA_PATH = os.path.join( BASEDIR_PATH, 'data' )
DATA_FILENAMES = 'hosts'
UPDATE_URL_FILENAME = 'update.info'
SOURCES = listdir_nohidden( DATA_PATH )

# Address written in front of every hostname by normalizeRule().
TARGET_HOST = '0.0.0.0'

# Exclusions
# Regex prefix matching zero or more leading sub-domain labels ("ads.",
# "a.b."); the domain to exclude is appended at the end by excludeDomain().
# Raw string: "\d" and "\." are regex escapes, not string escapes.
EXCLUSION_PATTERN = r'([a-zA-Z\d-]+\.){0,}'
# Entries loaded from the whitelist file by removeDupsAndExcl()
EXCLUSIONS        = []
# Common domains to exclude
COMMON_EXCLUSIONS = ['hulu.com']

# Global vars
exclusionRegexs = []	# compiled patterns appended by excludeDomain()
numberOfRules   = 0		# incremented by removeDupsAndExcl() per rule written

def main():
	"""Run the interactive workflow from prompts to the optional install.

	Asks about updating sources and excluding domains, merges all sources,
	rebuilds the de-duplicated hosts file with its header, refreshes the
	readme, reports the rule count and finally offers to install the file.
	"""
	promptForUpdate()
	promptForExclusions()
	mergedSources = createInitialFile()
	removeOldHostsFile()
	hostsFile = removeDupsAndExcl( mergedSources )
	finalizeFile( hostsFile )
	updateReadme( numberOfRules )

	entryCount = "{:,}".format( numberOfRules )
	printSuccess( 'Success! Your new hosts file has been prepared.\nIt contains ' + entryCount + ' unique entries.' )

	promptForMove( hostsFile )

# Prompt the User
def promptForUpdate():
	"""Make sure a local 'hosts' file exists, then offer to update the sources.

	Creates an empty 'hosts' file next to this script when there is none,
	then asks whether all data sources should be updated and calls
	updateAllSources() when the answer is "yes".
	"""
	hostsPath = os.path.join( BASEDIR_PATH, 'hosts' )
	# Create the hosts file if it doesn't exist
	if not os.path.isfile( hostsPath ):
		try:
			open( hostsPath, 'w+' ).close()
		except ( IOError, OSError ):
			# Only file-system failures are reported here; a bare except
			# would also swallow Ctrl-C and programming errors.
			printFailure( "ERROR: No 'hosts' file in the folder, try creating one manually" )

	response = query_yes_no( "Do you want to update all data sources?" )
	if response == "yes":
		updateAllSources()
	else:
		print ( 'OK, we\'ll stick with what we\'ve  got locally.' )

def promptForExclusions():
	"""Ask whether any domains should be excluded and, if so, collect them."""
	question = ( "Do you want to exclude any domains?\n" +
				"For example, hulu.com video streaming must be able to access " +
				"its tracking and ad servers in order to play video." )
	if query_yes_no( question ) != "yes":
		print ( 'OK, we\'ll only exclude domains in the whitelist.' )
		return
	displayExclusionOptions()

def promptForMoreCustomExclusions():
	"""Return True when the user wants to enter another domain, else False."""
	return query_yes_no( "Do you have more domains you want to enter?" ) == "yes"

def promptForMove( finalFile ):
	"""Offer to install the generated hosts file in place of the system one.

	Returns False when the user declines; otherwise hands `finalFile` to
	moveHostsFileIntoPlace() and returns None.
	"""
	answer = query_yes_no( "Do you want to replace your existing hosts file with the newly generated file?" )
	if answer != "yes":
		return False
	moveHostsFileIntoPlace( finalFile )
# End Prompt the User

# Exclusion logic
def displayExclusionOptions():
	"""Offer each common exclusion in turn, then offer custom exclusions."""
	for commonDomain in COMMON_EXCLUSIONS:
		question = "Do you want to exclude the domain " + commonDomain + " ?"
		if query_yes_no( question ) == "yes":
			excludeDomain( commonDomain )
	if query_yes_no( "Do you want to exclude any other domains?" ) == "yes":
		gatherCustomExclusions()

def gatherCustomExclusions():
	"""Keep reading domains from the user until they have no more to add.

	Each entry that passes isValidDomainFormat() is registered through
	excludeDomain(); after every entry the user is asked whether to go on.
	"""
	while True:
		# Cross-python input
		candidate = myInput( "Enter the domain you want to exclude (e.g. facebook.com): " )
		if isValidDomainFormat( candidate ):
			excludeDomain( candidate )
		if not promptForMoreCustomExclusions():
			break

def excludeDomain( domain ):
	"""Register `domain` so that rules for it are left out of the hosts file.

	Compiles EXCLUSION_PATTERN followed by the domain and appends the result
	to the global exclusionRegexs list consulted by matchesExclusions().

	The domain is passed through re.escape() so that it is matched
	literally; unescaped, the "." in "hulu.com" is a regex wildcard and
	would also match e.g. "huluxcom".
	"""
	exclusionRegexs.append( re.compile( EXCLUSION_PATTERN + re.escape( domain )))

def matchesExclusions( strippedRule ):
	"""Return True when the rule's hostname matches any registered exclusion.

	`strippedRule` must contain at least two whitespace-separated fields;
	the second one is the hostname that is tested.
	"""
	hostField = strippedRule.split()[1]
	return any( pattern.search( hostField ) for pattern in exclusionRegexs )
# End Exclusion Logic

# Update Logic
def updateAllSources():
	"""Download every source that has an update URL and store it locally.

	For each entry in SOURCES the URL is looked up with
	getUpdateURLFromFile(); sources without one are skipped. Downloaded
	text has its carriage returns removed and is written to the source's
	data file. A source whose download fails keeps its existing data file.
	"""
	for source in SOURCES:
		updateURL = getUpdateURLFromFile( source )
		if updateURL is None:
			continue
		print ( 'Updating source ' + source + ' from ' + updateURL )
		# Cross-python call
		updatedFile = getFileByUrl( updateURL )
		if updatedFile is None:
			# getFileByUrl() prints a message and returns None on failure;
			# skip this source instead of crashing on None.replace().
			continue
		updatedFile = updatedFile.replace( '\r', '' ) # get rid of carriage-return symbols

		# Binary mode plus writeData() keeps the write cross-python; the
		# with-statement closes the file even if the write fails.
		with open( os.path.join( DATA_PATH, source, DATA_FILENAMES ), 'wb' ) as dataFile:
			writeData( dataFile, updatedFile )

def getUpdateURLFromFile( source ):
	"""Return the update URL recorded for `source`, or None if it has none.

	The URL is the first line, stripped of surrounding whitespace, of the
	UPDATE_URL_FILENAME file inside the source's data directory. When that
	file does not exist a warning is printed and None is returned.
	"""
	pathToUpdateFile = os.path.join( DATA_PATH, source, UPDATE_URL_FILENAME )
	if not os.path.exists( pathToUpdateFile ):
		printFailure( 'Warning: Can\'t find the update file for source ' + source + '\n' +
					 'Make sure that there\'s a file at ' + pathToUpdateFile )
		return None
	# The with-statement closes the file even if reading fails.
	with open( pathToUpdateFile, 'r' ) as updateFile:
		return updateFile.readline().strip()
# End Update Logic

# File Logic
def createInitialFile():
	"""Concatenate the data file of every source into one temporary file.

	Returns the still-open NamedTemporaryFile holding the merged content.
	The file position is left at the end, so the caller must seek to the
	start before reading, and is responsible for closing the file.
	"""
	mergeFile = tempfile.NamedTemporaryFile()
	for source in SOURCES:
		# The with-statement closes each source file as soon as it has been
		# copied instead of leaving one open handle per source.
		with open( os.path.join( DATA_PATH, source, DATA_FILENAMES ), 'r' ) as curFile:
			# Done in a cross-python way
			writeData( mergeFile, curFile.read() )

	return mergeFile

def removeDupsAndExcl( mergeFile ):
	"""Filter the merged sources into the final 'hosts' file.

	Reads `mergeFile` from the start and writes to the 'hosts' file next to
	this script:
	  * blank lines and comment lines are copied through unchanged;
	  * lines containing '::1' are dropped;
	  * rules matching a registered exclusion regex are dropped;
	  * rules whose line contains a whitelist entry are dropped;
	  * every remaining rule is normalized and written once per hostname.

	Whitelist entries are read from WHITELIST_FILE (when present) into the
	global EXCLUSIONS list, and the global numberOfRules is incremented for
	each rule written. `mergeFile` is closed before returning.

	Returns the open hosts file object (binary read/write mode).
	"""
	global numberOfRules
	if os.path.isfile( WHITELIST_FILE ):
		with open( WHITELIST_FILE, "r" ) as ins:
			for line in ins:
				# Strip the line ending before storing the entry: an
				# unstripped blank line is "\n", which is contained in every
				# rule line and would therefore exclude every rule.
				entry = line.strip()
				if entry and not entry.startswith( '#' ):
					EXCLUSIONS.append( entry )

	# Binary read/write mode, since writeData() emits encoded bytes
	finalFile = open( os.path.join( BASEDIR_PATH, 'hosts' ), 'r+b' )
	mergeFile.seek( 0 ) # reset file pointer

	hostnames = set()
	hostnames.add( "localhost" )
	for line in mergeFile.readlines():
		# Explicit encoding
		line = line.decode( "UTF-8" )
		content = line.strip()
		# Comments and blank lines are kept as they are. The whole line is
		# tested, not just its first character, so that a rule preceded by
		# whitespace is still normalized and de-duplicated.
		if not content or content.startswith( '#' ):
			# Cross-python write
			writeData( finalFile, line )
			continue
		if '::1' in line:
			continue

		strippedRule = stripRule( line ) # strip comments
		if len( strippedRule ) == 0:
			continue
		if matchesExclusions( strippedRule ):
			continue
		hostname, normalizedRule = normalizeRule( strippedRule ) # normalize rule
		if not normalizedRule or hostname in hostnames:
			continue
		# Whitelist entries are matched as substrings of the source line
		if any( exclude in line for exclude in EXCLUSIONS ):
			continue
		writeData( finalFile, normalizedRule )
		hostnames.add( hostname )
		numberOfRules += 1

	mergeFile.close()

	return finalFile

def normalizeRule(rule):
	"""Parse a hosts rule and rewrite it to point at TARGET_HOST.

	`rule` is expected to look like "<ipv4 address> <hostname>[suffix]".
	The hostname is lowercased and the original address is replaced by
	TARGET_HOST; any text after the hostname is kept as a trailing comment.

	Returns a (hostname, rule line) tuple, or (None, None) after printing
	the offending text when `rule` does not have the expected form.
	"""
	result = re.search(r'^[ \t]*(\d+\.\d+\.\d+\.\d+)\s+([\w\.-]+)(.*)', rule )
	if result:
		# The address found in the source is discarded in favor of TARGET_HOST.
		_target, hostname, suffix = result.groups()
		hostname = hostname.lower() # explicitly lowercase hostname
		# Compare by value: "is not ''" tests object identity, which only
		# happens to work when the interpreter shares empty strings.
		if suffix != '':
			# add suffix as comment only, not as a separate host
			return hostname, "%s %s #%s\n" % ( TARGET_HOST, hostname, suffix )
		return hostname, "%s %s\n" % ( TARGET_HOST, hostname )
	print ( '==>%s<==' % rule )
	return None, None

def finalizeFile( finalFile ):
	"""Prepend the opening header to the generated hosts file and close it.

	The file is closed even when writing the header fails, so whatever was
	written so far is flushed and the handle is not left open.
	"""
	try:
		writeOpeningHeader( finalFile )
	finally:
		finalFile.close()

# Some sources put comments around their rules, for accuracy we need to strip them
# the comments are preserved in the output hosts file
def stripRule( line ):
	"""Return the first two whitespace-separated fields of `line`.

	The fields are joined by a single space, which drops anything that
	follows them. Lines with fewer than two fields yield ''.
	"""
	fields = line.split()
	if len( fields ) >= 2:
		return ' '.join( fields[:2] )
	# not a rule: just return blank
	return ''

def writeOpeningHeader(finalFile):
	"""Insert the standard header in front of the rules already in the file.

	The current content of `finalFile` is read, the header (plus the content
	of the optional 'myhosts' file next to this script) is written from the
	start of the file, and the saved content is written back after it.
	"""
	finalFile.seek( 0 ) # reset file pointer
	existingRules = finalFile.read() # save content
	finalFile.seek( 0 ) # write at the top

	headerChunks = (
		'# This file is a merged collection of hosts from reputable sources,\n',
		'# with a dash of crowd sourcing via Github\n#\n',
		'# Project home page: https://github.com/StevenBlack/hosts\n#\n',
		'# ===============================================================\n',
		'\n',
		'127.0.0.1 localhost\n',
		'::1 localhost\n',
		'\n',
	)
	for chunk in headerChunks:
		writeData( finalFile, chunk )

	# User-maintained entries go between the header and the generated rules
	customHostsPath = os.path.join( BASEDIR_PATH, "myhosts" )
	if os.path.isfile( customHostsPath ):
		with open( customHostsPath, "r" ) as customHosts:
			writeData( finalFile, customHosts.read() )

	# The saved content is already bytes, so it is written directly
	finalFile.write( existingRules )

def updateReadme( numberOfRules ):
	"""Regenerate the readme from its template with the current rule count.

	Every '@NUM_ENTRIES@' placeholder in README_TEMPLATE is replaced by
	`numberOfRules` formatted with thousands separators, and the result is
	written to README_FILE.
	"""
	entryCount = "{:,}".format( numberOfRules )
	# Open the template first: if it cannot be read, the existing readme is
	# not truncated. Both files are closed by the with-statements.
	with open( README_TEMPLATE ) as template:
		with open( README_FILE, "wt" ) as out:
			for line in template:
				out.write( line.replace( '@NUM_ENTRIES@', entryCount ))

def moveHostsFileIntoPlace( finalFile ):
	"""Install the generated hosts file as the system hosts file.

	On POSIX systems the file is copied to /etc/hosts with sudo and the DNS
	cache is then flushed (mDNSResponder on macOS, otherwise nscd when its
	init script exists). When the copy fails nothing is flushed. On Windows
	only instructions for moving the file manually are printed.
	"""
	if ( os.name == 'posix' ):
		print ( 'Moving the file requires administrative privileges. You might need to enter your password.' )
		if subprocess.call( ["/usr/bin/sudo", "cp", os.path.abspath( finalFile.name ), "/etc/hosts"] ):
			printFailure( "Moving the file failed." )
			# No new hosts file is in place, so there is nothing to flush.
			return
		print ('Flushing the DNS Cache to utilize new hosts file...' )
		if ( platform.system() == 'Darwin' ):
			if subprocess.call( ["/usr/bin/sudo", "killall", "-HUP", "mDNSResponder"] ):
				printFailure( "Flushing the DNS Cache failed." )
		else:
			if os.path.isfile( "/etc/rc.d/init.d/nscd" ):
				if subprocess.call( ["/usr/bin/sudo", "/etc/rc.d/init.d/nscd", "restart"] ):
					printFailure( "Flushing the DNS Cache failed." )
	elif ( os.name == 'nt' ):
		print ( 'Automatically moving the hosts file in place is not yet supported.' )
		# Backslashes are escaped explicitly; the printed text is unchanged.
		print ( 'Please move the generated file to %SystemRoot%\\system32\\drivers\\etc\\hosts' )

def removeOldHostsFile():
	"""Replace the 'hosts' file next to this script with an empty one.

	Hotfix: merging with an already existing hosts file leads to artefacts
	and duplicates, so the old file is discarded before a new one is built.
	"""
	hostsPath = os.path.join( BASEDIR_PATH, 'hosts' )
	# Make sure the file exists so that the removal cannot fail on a missing file
	with open( hostsPath, 'a' ):
		pass
	os.remove( hostsPath )
	# Start over with a new, empty hosts file
	with open( hostsPath, 'a' ):
		pass

# End File Logic

# Helper Functions
## {{{ http://code.activestate.com/recipes/577058/ (r2)
def query_yes_no( question, default = "yes" ):
    """Ask a yes/no question on the terminal and return the answer.

    "question" is the text shown to the user.
    "default" is the answer assumed when the user just hits <Enter>.
        It must be "yes" (the default), "no" or None (meaning
        an explicit answer is required).

    Returns "yes" or "no"; asks again until the reply is recognized.
    Raises ValueError when "default" is none of the accepted values.
    """
    answers = {"yes":"yes", "ye":"yes", "y":"yes",
               "no":"no", "n":"no"}
    hints = ( ( None, " [y/n] " ), ( "yes", " [Y/n] " ), ( "no", " [y/N] " ) )
    for accepted, hint in hints:
        if default == accepted:
            prompt = hint
            break
    else:
        # no accepted default matched
        raise ValueError( "invalid default answer: '%s'" % default )

    while True:
        sys.stdout.write( colorize( question, colors.PROMPT ) + prompt )
        # Cross-python input
        reply = myInput().lower()
        if reply == '' and default is not None:
            return default
        if reply in answers:
            return answers[reply]
        printFailure( "Please respond with 'yes' or 'no' (or 'y' or 'n').\n" )
## end of http://code.activestate.com/recipes/577058/ }}}

def isValidDomainFormat( domain ):
	"""Return True when `domain` looks like a bare domain name.

	Empty input, a leading "www." / "wwwN." label and a leading "http:" or
	"https:" scheme are rejected with an explanatory message. Only the
	scheme itself is rejected, so domains that merely start with "http"
	(e.g. httpbin.org) are accepted.
	"""
	if ( domain == '' ):
		print ( "You didn\'t enter a domain. Try again." )
		return False
	# Raw string: "\d" is a regex escape, not a string escape.
	domainRegex = re.compile( r"www\d{0,3}[.]|https?:" )
	if domainRegex.match( domain ):
		print ( "The domain " + domain + " is not valid. Do not include www.domain.com or http(s)://domain.com. Try again." )
		return False
	return True

# Colors
class colors:
    """Terminal escape sequences used to color the script's messages."""
    # Appended after colored text by colorize()
    ENDC = '\033[0m'
    # Used by printFailure()
    FAIL = '\033[91m'
    # Used by printSuccess()
    SUCCESS = '\033[92m'
    # Used for the questions asked by query_yes_no()
    PROMPT = '\033[94m'

def colorize( text, color ):
	"""Return `text` preceded by `color` and followed by colors.ENDC."""
	return ''.join( ( color, text, colors.ENDC ) )

def printSuccess( text ):
	"""Print `text` wrapped in the success color."""
	highlighted = colorize( text, colors.SUCCESS )
	print ( highlighted )

def printFailure( text ):
	"""Print `text` wrapped in the failure color."""
	highlighted = colorize( text, colors.FAIL )
	print ( highlighted )
# End Helper Functions

# Run the interactive workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
	main()