Normalizing hosts and better duplicate detection added

Peter Naudus 2014-05-16 08:13:11 -04:00
parent c030fbd223
commit f459f1a765
3 changed files with 469882 additions and 465036 deletions

hosts (934895 changes)

File diff suppressed because it is too large.

readme.md

@@ -2,7 +2,7 @@
 This repo consolidates several reputable `hosts` files into a single hosts file that you can use.
 
-**Currently this hosts file contains 469741 unique entries.**
+**Currently this hosts file contains 465030 unique entries.**
 
 ## Source of host data amalgamated here

updateHostsFile.py

@@ -25,6 +25,7 @@ UPDATE_URL_FILENAME = 'update.info'
 SOURCES = os.listdir(DATA_PATH)
 README_TEMPLATE = BASEDIR_PATH + '/readme_template.md'
 README_FILE = BASEDIR_PATH + '/readme.md'
+TARGET_HOST = '0.0.0.0'
 
 # Exclusions
 EXCLUSION_PATTERN = '([a-zA-Z\d-]+\.){0,}' # the excluded domain is appended to the end
@@ -162,7 +163,7 @@ def removeDups(mergeFile):
     finalFile = open(BASEDIR_PATH + '/hosts', 'w+b')
 
     mergeFile.seek(0) # reset file pointer
-    rules_seen = set()
+    hostnames = set()
     for line in mergeFile.readlines():
         if line[0].startswith("#") or line[0] == '\n':
             finalFile.write(line) # maintain the comments for readability
@@ -170,15 +171,27 @@ def removeDups(mergeFile):
         strippedRule = stripRule(line) # strip comments
         if matchesExclusions(strippedRule):
             continue
-        if strippedRule not in rules_seen:
-            finalFile.write(line)
-            rules_seen.add(strippedRule)
+        hostname, normalizedRule = normalizeRule(strippedRule) # normalize rule
+        if normalizedRule and hostname not in hostnames:
+            finalFile.write(normalizedRule)
+            hostnames.add(hostname)
             numberOfRules += 1
         else:
             finalFile.write(line)
 
     mergeFile.close()
 
     return finalFile
 
+def normalizeRule(rule):
+    result = re.search(r'^\s*(\d+\.\d+\.\d+\.\d+)\s+([\w\.-]+)(.*)', rule)
+    if result:
+        target, hostname, suffix = result.groups()
+        return hostname, "%s\t%s%s\n" % (TARGET_HOST, hostname, suffix)
+    print '==>%s<==' % rule # log any rule the regex fails to parse
+    return None, None
+
 def finalizeFile(finalFile):
     writeOpeningHeader(finalFile)
     finalFile.close()
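
For readers skimming the diff, here is a minimal standalone sketch of the technique this commit introduces: every rule is rewritten to point at TARGET_HOST, and duplicates are detected by extracted hostname rather than by the raw line. The regex and the TARGET_HOST value are taken from the diff above; the sample input lines and the Python 3 syntax are illustrative assumptions, not the repo's actual script.

```python
import re

TARGET_HOST = '0.0.0.0'  # same canonical target the commit adds

def normalizeRule(rule):
    # Same pattern as the diff: target IP, hostname, optional trailing text.
    result = re.search(r'^\s*(\d+\.\d+\.\d+\.\d+)\s+([\w\.-]+)(.*)', rule)
    if result:
        target, hostname, suffix = result.groups()
        # Rewrite the rule to TARGET_HOST no matter which IP the source used.
        return hostname, "%s\t%s%s\n" % (TARGET_HOST, hostname, suffix)
    return None, None  # unparseable rule

# Hypothetical input: the same host listed under two different target IPs.
lines = [
    "127.0.0.1 ads.example.com",
    "0.0.0.0  ads.example.com",
    "127.0.0.1 tracker.example.net",
]

hostnames = set()
for line in lines:
    hostname, normalizedRule = normalizeRule(line)
    if normalizedRule and hostname not in hostnames:
        hostnames.add(hostname)
        print(normalizedRule, end='')
# -> 0.0.0.0   ads.example.com
#    0.0.0.0   tracker.example.net
```

Keying the seen-set on the extracted hostname is what drops the readme's entry count from 469741 to 465030: before this change, `127.0.0.1 ads.example.com` and `0.0.0.0 ads.example.com` were different strings, so both counted as unique entries.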