mirror of
https://github.com/StevenBlack/hosts.git
synced 2024-07-06 12:32:35 +02:00
Normalizing hosts and better duplicate detection added
This commit is contained in:
parent
c030fbd223
commit
f459f1a765
@ -2,7 +2,7 @@
|
||||
|
||||
This repo collects several reputable `hosts` files and merges them into a single hosts file that you can use.
|
||||
|
||||
**Currently this hosts file contains 469741 unique entries.**
|
||||
**Currently this hosts file contains 465030 unique entries.**
|
||||
|
||||
## Source of host data amalgamated here
|
||||
|
||||
|
@ -25,6 +25,7 @@ UPDATE_URL_FILENAME = 'update.info'
|
||||
SOURCES = os.listdir(DATA_PATH)
|
||||
README_TEMPLATE = BASEDIR_PATH + '/readme_template.md'
|
||||
README_FILE = BASEDIR_PATH + '/readme.md'
|
||||
TARGET_HOST = '0.0.0.0'
|
||||
|
||||
# Exclusions
|
||||
EXCLUSION_PATTERN = '([a-zA-Z\d-]+\.){0,}' #append domain the end
|
||||
@ -162,7 +163,7 @@ def removeDups(mergeFile):
|
||||
finalFile = open(BASEDIR_PATH + '/hosts', 'w+b')
|
||||
mergeFile.seek(0) # reset file pointer
|
||||
|
||||
rules_seen = set()
|
||||
hostnames = set()
|
||||
for line in mergeFile.readlines():
|
||||
if line[0].startswith("#") or line[0] == '\n':
|
||||
finalFile.write(line) #maintain the comments for readability
|
||||
@ -170,15 +171,27 @@ def removeDups(mergeFile):
|
||||
strippedRule = stripRule(line) #strip comments
|
||||
if matchesExclusions(strippedRule):
|
||||
continue
|
||||
if strippedRule not in rules_seen:
|
||||
finalFile.write(line)
|
||||
rules_seen.add(strippedRule)
|
||||
hostname, normalizedRule = normalizeRule(strippedRule) # normalize rule
|
||||
|
||||
if normalizedRule and hostname not in hostnames:
|
||||
finalFile.write(normalizedRule)
|
||||
hostnames.add(hostname)
|
||||
numberOfRules += 1
|
||||
else:
|
||||
finalFile.write(line)
|
||||
|
||||
mergeFile.close()
|
||||
|
||||
return finalFile
|
||||
|
||||
def normalizeRule(rule):
    """Rewrite a hosts rule so that it points at TARGET_HOST.

    A rule is expected to look like ``<dotted-quad IP> <hostname>[suffix]``,
    optionally preceded by whitespace.

    Args:
        rule: A single rule line as a string.

    Returns:
        A ``(hostname, normalized_rule)`` tuple when the rule matches, where
        ``normalized_rule`` is ``TARGET_HOST``, a tab, the hostname, whatever
        followed the hostname on the line, and a trailing newline.
        ``(None, None)`` when the rule does not match; the offending rule is
        echoed to stdout in that case.
    """
    result = re.search(r'^\s*(\d+\.\d+\.\d+\.\d+)\s+([\w\.-]+)(.*)', rule)
    if result:
        # Group 1 (the rule's own target IP) is intentionally discarded: every
        # emitted rule is redirected to TARGET_HOST instead.
        hostname, suffix = result.group(2, 3)
        return hostname, "%s\t%s%s\n" % (TARGET_HOST, hostname, suffix)

    # Parenthesised single-argument form prints identically whether `print`
    # is a statement (Python 2) or a function (Python 3).
    print('==>%s<==' % rule)
    return None, None
|
||||
|
||||
def finalizeFile(finalFile):
|
||||
writeOpeningHeader(finalFile)
|
||||
finalFile.close()
|
||||
|
Loading…
Reference in New Issue
Block a user