filter-repo: group repo analysis functions into a class

Signed-off-by: Elijah Newren <newren@gmail.com>
This commit is contained in:
Elijah Newren 2018-12-25 21:54:16 -08:00
parent 9887dd5cbe
commit 4e2110136e

View File

@ -1891,10 +1891,15 @@ class FilteringOptions(object):
FilteringOptions.sanity_check_args(args)
return args
def analyze_commit(stats, graph, commit, parents, date, file_changes):
def equiv_class(filename):
class RepoAnalyze(object):
# First, several helper functions for analyze_commit()
# Look up the equivalence class recorded for `filename`.
#   stats:    mapping; only stats['equivalence'] is read here
#   filename: the name to look up
# Returns stats['equivalence'][filename] when an entry exists, otherwise the
# one-element tuple (filename,), so the result can always be iterated.
@staticmethod
def equiv_class(stats, filename):
return stats['equivalence'].get(filename, (filename,))
@staticmethod
def setup_equivalence_for_rename(stats, oldname, newname):
# if A is renamed to B and B is renamed to C, then the user thinks of
# A, B, and C as all being different names for the same 'file'. We record
@ -1911,18 +1916,22 @@ def analyze_commit(stats, graph, commit, parents, date, file_changes):
for f in new_tuple:
stats['equivalence'][f] = new_tuple
# Add `commit` to the set kept under stats['rename_history'][oldname].
#   stats:   mapping; only stats['rename_history'] is read and written here
#   commit:  value added to the set
#   oldname: key into stats['rename_history']
#   newname: accepted but not used in this body
@staticmethod
def setup_or_update_rename_history(stats, commit, oldname, newname):
# Reuse the set already stored for oldname, or start from an empty one.
rename_commits = stats['rename_history'].get(oldname, set())
rename_commits.add(commit)
# Store it back so that a newly created set is retained in stats.
stats['rename_history'][oldname] = rename_commits
# For every entry of change_types equal to 'R', pass the rename to both
# setup_equivalence_for_rename and setup_or_update_rename_history.
#   stats, commit: forwarded unchanged to the two helpers
#   change_types:  iterated with enumerate; each entry is compared to 'R'
#   filenames:     filenames[index] is used as the old name for entry `index`
#                  and filenames[-1] as the new name
@staticmethod
def handle_renames(stats, commit, change_types, filenames):
for index, change_type in enumerate(change_types):
if change_type == 'R':
oldname, newname = filenames[index], filenames[-1]
# NOTE(review): this text is a diff with its +/- markers stripped. The two
# unqualified calls below appear to be the removed lines and the
# RepoAnalyze.-qualified calls their replacements -- confirm against the
# actual file before treating all four calls as live code.
setup_equivalence_for_rename(stats, oldname, newname)
setup_or_update_rename_history(stats, commit, oldname, newname)
RepoAnalyze.setup_equivalence_for_rename(stats, oldname, newname)
RepoAnalyze.setup_or_update_rename_history(stats, commit,
oldname, newname)
@staticmethod
def handle_file(stats, graph, commit, modes, shas, filenames):
mode, sha, filename = modes[-1], shas[-1], filenames[-1]
@ -1936,7 +1945,7 @@ def analyze_commit(stats, graph, commit, parents, date, file_changes):
# If the file (or equivalence class of files) was recorded as deleted,
# clearly it isn't anymore
equiv = equiv_class(filename)
equiv = RepoAnalyze.equiv_class(stats, filename)
for f in equiv:
stats[delmode].pop(f, None)
@ -1954,6 +1963,8 @@ def analyze_commit(stats, graph, commit, parents, date, file_changes):
if f in stats['equivalence']:
del stats['equivalence'][f]
@staticmethod
def analyze_commit(stats, graph, commit, parents, date, file_changes):
graph.add_commit_and_parents(commit, parents)
for change in file_changes:
modes, shas, change_types, filenames = change
@ -1962,30 +1973,32 @@ def analyze_commit(stats, graph, commit, parents, date, file_changes):
if modes[-1] == '160000':
continue
elif modes[-1] == '000000':
# Track when files/directories are deleted; see 'R' below about equiv_class
for f in equiv_class(filenames[-1]):
# Track when files/directories are deleted
for f in RepoAnalyze.equiv_class(stats, filenames[-1]):
if any(x == '040000' for x in modes[0:-1]):
stats['tree_deletions'][f] = date
else:
stats['file_deletions'][f] = date
elif change_types.strip('AMT') == '':
handle_file(stats, graph, commit, modes, shas, filenames)
RepoAnalyze.handle_file(stats, graph, commit, modes, shas, filenames)
elif modes[-1] == '040000' and change_types.strip('RAM') == '':
handle_file(stats, graph, commit, modes, shas, filenames)
RepoAnalyze.handle_file(stats, graph, commit, modes, shas, filenames)
elif change_types.strip('RAM') == '':
handle_file(stats, graph, commit, modes, shas, filenames)
handle_renames(stats, commit, change_types, filenames)
RepoAnalyze.handle_file(stats, graph, commit, modes, shas, filenames)
RepoAnalyze.handle_renames(stats, commit, change_types, filenames)
else:
raise SystemExit("Unhandled change type(s): {} (in commit {})"
.format(change_types, commit))
def gather_data(args):
@staticmethod
def gather_data(args):
blob_size_progress = ProgressWriter()
num_blobs = 0
# Get sizes of blobs by sha1
a='--batch-check=%(objectname) %(objecttype) %(objectsize) %(objectsize:disk)'
cf = subprocess.Popen(['git', 'cat-file', '--batch-all-objects', a],
cmd = '--batch-check=%(objectname) %(objecttype) ' + \
'%(objectsize) %(objectsize:disk)'
cf = subprocess.Popen(['git', 'cat-file', '--batch-all-objects', cmd],
bufsize = -1,
stdout = subprocess.PIPE)
unpacked_size = {}
@ -2013,7 +2026,9 @@ def gather_data(args):
# Setup the rev-list/diff-tree process
commit_parse_progress = ProgressWriter()
num_commits = 0
cmd = 'git rev-list --topo-order --reverse {} | git diff-tree --stdin --always --root --format="%H%n%P%n%cd" --date=short -M -t -c --raw --combined-all-paths'.format(' '.join(args.refs))
cmd = ('git rev-list --topo-order --reverse {}'.format(' '.join(args.refs)) +
' | git diff-tree --stdin --always --root --format=%H%n%P%n%cd' +
' --date=short -M -t -c --raw --combined-all-paths')
dtp = subprocess.Popen(cmd, shell=True, bufsize=-1, stdout=subprocess.PIPE)
f = dtp.stdout
line = f.next()
@ -2056,7 +2071,8 @@ def gather_data(args):
file_changes.append([modes, shas, change_types, filenames])
# Analyze this commit and update progress
analyze_commit(stats, graph, commit, parents, date, file_changes)
RepoAnalyze.analyze_commit(stats, graph, commit, parents, date,
file_changes)
num_commits += 1
commit_parse_progress.show("Processed {} commits".format(num_commits))
@ -2071,7 +2087,8 @@ def gather_data(args):
return stats
def write_report(reportdir, stats):
@staticmethod
def write_report(reportdir, stats):
def datestr(datetimestr):
return datetimestr if datetimestr else '<present>'
@ -2312,7 +2329,8 @@ def write_report(reportdir, stats):
size,
names_with_sha))
def do_analysis(args, git_dir):
@staticmethod
def run(args, git_dir):
# Create the report directory as necessary
results_tmp_dir = os.path.join(git_dir, 'filter-repo')
if not os.path.isdir(results_tmp_dir):
@ -2324,12 +2342,12 @@ def do_analysis(args, git_dir):
os.mkdir(reportdir)
# Gather the data we need
stats = gather_data(args)
stats = RepoAnalyze.gather_data(args)
# Write the reports
sys.stdout.write("Writing reports to {}...".format(reportdir))
sys.stdout.flush()
write_report(reportdir, stats)
RepoAnalyze.write_report(reportdir, stats)
sys.stdout.write("done.\n")
def sanity_check(refs, is_bare):
@ -2506,7 +2524,7 @@ def run_fast_filter():
# Do analysis, if requested
if args.analyze:
do_analysis(args, git_dir)
RepoAnalyze.run(args, git_dir)
return
# Do sanity checks