Normalized hosts and added better duplicate detection

This commit is contained in:
Peter Naudus 2014-05-16 08:13:11 -04:00
parent c030fbd223
commit f459f1a765
3 changed files with 469882 additions and 465036 deletions

934895
hosts

File diff suppressed because it is too large Load Diff

View File

@ -2,7 +2,7 @@
This repo consolidates several reputable `hosts` files and consolidates them into a single hosts file that you can use. This repo consolidates several reputable `hosts` files and consolidates them into a single hosts file that you can use.
**Currently this hosts file contains 469741 unique entries.** **Currently this hosts file contains 465030 unique entries.**
## Source of host data amalgamated here ## Source of host data amalgamated here

View File

@ -25,6 +25,7 @@ UPDATE_URL_FILENAME = 'update.info'
SOURCES = os.listdir(DATA_PATH) SOURCES = os.listdir(DATA_PATH)
README_TEMPLATE = BASEDIR_PATH + '/readme_template.md' README_TEMPLATE = BASEDIR_PATH + '/readme_template.md'
README_FILE = BASEDIR_PATH + '/readme.md' README_FILE = BASEDIR_PATH + '/readme.md'
TARGET_HOST = '0.0.0.0'
# Exclusions # Exclusions
EXCLUSION_PATTERN = '([a-zA-Z\d-]+\.){0,}' #append domain the end EXCLUSION_PATTERN = '([a-zA-Z\d-]+\.){0,}' #append domain the end
@ -162,7 +163,7 @@ def removeDups(mergeFile):
finalFile = open(BASEDIR_PATH + '/hosts', 'w+b') finalFile = open(BASEDIR_PATH + '/hosts', 'w+b')
mergeFile.seek(0) # reset file pointer mergeFile.seek(0) # reset file pointer
rules_seen = set() hostnames = set()
for line in mergeFile.readlines(): for line in mergeFile.readlines():
if line[0].startswith("#") or line[0] == '\n': if line[0].startswith("#") or line[0] == '\n':
finalFile.write(line) #maintain the comments for readability finalFile.write(line) #maintain the comments for readability
@ -170,15 +171,27 @@ def removeDups(mergeFile):
strippedRule = stripRule(line) #strip comments strippedRule = stripRule(line) #strip comments
if matchesExclusions(strippedRule): if matchesExclusions(strippedRule):
continue continue
if strippedRule not in rules_seen: hostname, normalizedRule = normalizeRule(strippedRule) # normalize rule
finalFile.write(line)
rules_seen.add(strippedRule) if normalizedRule and hostname not in hostnames:
finalFile.write(normalizedRule)
hostnames.add(hostname)
numberOfRules += 1 numberOfRules += 1
else:
finalFile.write(line)
mergeFile.close() mergeFile.close()
return finalFile return finalFile
def normalizeRule(rule):
result = re.search(r'^\s*(\d+\.\d+\.\d+\.\d+)\s+([\w\.-]+)(.*)',rule)
if result:
target, hostname, suffix = result.groups()
return hostname, "%s\t%s%s\n" % (TARGET_HOST, hostname, suffix)
print '==>%s<==' % rule
return None, None
def finalizeFile(finalFile): def finalizeFile(finalFile):
writeOpeningHeader(finalFile) writeOpeningHeader(finalFile)
finalFile.close() finalFile.close()