mirror of
https://github.com/StevenBlack/hosts.git
synced 2024-07-12 07:12:52 +02:00
Normalizing hosts and better duplicate detection added
This commit is contained in:
parent
c030fbd223
commit
f459f1a765
@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
This repo consolidates several reputable `hosts` files and consolidates them into a single hosts file that you can use.
|
This repo consolidates several reputable `hosts` files and consolidates them into a single hosts file that you can use.
|
||||||
|
|
||||||
**Currently this hosts file contains 469741 unique entries.**
|
**Currently this hosts file contains 465030 unique entries.**
|
||||||
|
|
||||||
## Source of host data amalgamated here
|
## Source of host data amalgamated here
|
||||||
|
|
||||||
|
@ -25,6 +25,7 @@ UPDATE_URL_FILENAME = 'update.info'
|
|||||||
SOURCES = os.listdir(DATA_PATH)
|
SOURCES = os.listdir(DATA_PATH)
|
||||||
README_TEMPLATE = BASEDIR_PATH + '/readme_template.md'
|
README_TEMPLATE = BASEDIR_PATH + '/readme_template.md'
|
||||||
README_FILE = BASEDIR_PATH + '/readme.md'
|
README_FILE = BASEDIR_PATH + '/readme.md'
|
||||||
|
TARGET_HOST = '0.0.0.0'
|
||||||
|
|
||||||
# Exclusions
|
# Exclusions
|
||||||
EXCLUSION_PATTERN = '([a-zA-Z\d-]+\.){0,}' #append domain the end
|
EXCLUSION_PATTERN = '([a-zA-Z\d-]+\.){0,}' #append domain the end
|
||||||
@ -162,7 +163,7 @@ def removeDups(mergeFile):
|
|||||||
finalFile = open(BASEDIR_PATH + '/hosts', 'w+b')
|
finalFile = open(BASEDIR_PATH + '/hosts', 'w+b')
|
||||||
mergeFile.seek(0) # reset file pointer
|
mergeFile.seek(0) # reset file pointer
|
||||||
|
|
||||||
rules_seen = set()
|
hostnames = set()
|
||||||
for line in mergeFile.readlines():
|
for line in mergeFile.readlines():
|
||||||
if line[0].startswith("#") or line[0] == '\n':
|
if line[0].startswith("#") or line[0] == '\n':
|
||||||
finalFile.write(line) #maintain the comments for readability
|
finalFile.write(line) #maintain the comments for readability
|
||||||
@ -170,15 +171,27 @@ def removeDups(mergeFile):
|
|||||||
strippedRule = stripRule(line) #strip comments
|
strippedRule = stripRule(line) #strip comments
|
||||||
if matchesExclusions(strippedRule):
|
if matchesExclusions(strippedRule):
|
||||||
continue
|
continue
|
||||||
if strippedRule not in rules_seen:
|
hostname, normalizedRule = normalizeRule(strippedRule) # normalize rule
|
||||||
finalFile.write(line)
|
|
||||||
rules_seen.add(strippedRule)
|
if normalizedRule and hostname not in hostnames:
|
||||||
|
finalFile.write(normalizedRule)
|
||||||
|
hostnames.add(hostname)
|
||||||
numberOfRules += 1
|
numberOfRules += 1
|
||||||
|
else:
|
||||||
|
finalFile.write(line)
|
||||||
|
|
||||||
mergeFile.close()
|
mergeFile.close()
|
||||||
|
|
||||||
return finalFile
|
return finalFile
|
||||||
|
|
||||||
|
def normalizeRule(rule):
    """Rewrite a hosts rule so that it points at TARGET_HOST.

    The rule must look like ``<a.b.c.d> <hostname>[rest]``: optional leading
    whitespace, a dotted-quad numeric address, whitespace, then a hostname
    made of word characters, dots and hyphens. Anything after the hostname
    on the same line is kept as the suffix.

    Returns:
        A ``(hostname, normalizedRule)`` tuple, where ``normalizedRule`` is
        ``TARGET_HOST``, a tab, the hostname, the suffix and a newline.
        If the rule does not match the expected shape, it is echoed to
        stdout between ``==>`` and ``<==`` and ``(None, None)`` is returned.
    """
    result = re.search(r'^\s*(\d+\.\d+\.\d+\.\d+)\s+([\w\.-]+)(.*)', rule)
    if result:
        # Group 1 (the address used by the source list) is deliberately
        # dropped: every emitted rule is redirected to TARGET_HOST instead.
        hostname, suffix = result.group(2, 3)
        return hostname, "%s\t%s%s\n" % (TARGET_HOST, hostname, suffix)
    # Parenthesised single-argument form prints the same text under the
    # Python 2 print statement and the Python 3 print() function.
    print('==>%s<==' % rule)
    return None, None
|
||||||
|
|
||||||
def finalizeFile(finalFile):
|
def finalizeFile(finalFile):
|
||||||
writeOpeningHeader(finalFile)
|
writeOpeningHeader(finalFile)
|
||||||
finalFile.close()
|
finalFile.close()
|
||||||
|
Loading…
Reference in New Issue
Block a user