filter-repo: switch --analyze to use rev-list|diff-tree pipeline

As suggested by Peff, use a rev-list & diff-tree pipeline to get the
information we need, instead of relying on fast-export (with some
out-of-tree patches) to get it.

Signed-off-by: Elijah Newren <newren@gmail.com>
Author: Elijah Newren
Date:   2018-11-20 10:15:46 -08:00
parent  beff0b958f
commit  554c7e39af

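For reference, this is the pipeline the new gather_data() spawns, together with the shape of the stream analyze_commit() consumes. The ref, hashes, date, and paths below are invented for illustration, and --combined-all-paths requires a diff-tree that knows that option (an out-of-tree patch at the time of this commit; it was later upstreamed in git 2.22):

    $ git rev-list --topo-order --reverse master |
        git diff-tree --stdin --always --root --format="%H%n%P%n%cd" \
                      --date=short -M -t -c --raw --combined-all-paths

    1fc6c95d1e6f...                    <- commit         (%H)
    9c8e3f2410ab... 3b1d5a7c22ef...    <- parents        (%P)
    2018-11-20                         <- committer date (%cd)

    ::100644 100644 100644 fabadb8 cc95eb0 4866510 MM	desc.c	desc.c	desc.c

Each commit contributes three header lines, a blank separator, and zero or more colon-prefixed raw change lines (doubled colons and extra mode/sha/path columns for merges).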

@@ -1824,52 +1824,93 @@ def get_refs():
     output = ''
   return dict(reversed(x.split()) for x in output.splitlines())
 
-def analyze_commit(args, commit):
+def analyze_commit(stats, graph, commit, parents, date, file_changes):
   def equiv_class(filename):
-    return args.stats['equivalence'].get(filename, (filename,))
+    return stats['equivalence'].get(filename, (filename,))
 
-  for change in commit.file_changes:
-    if change.mode == '160000':
-      continue
-    if change.type == 'D':
-      # Track when files are deleted; see 'R' below about equiv_class
-      for f in equiv_class(change.filename):
-        args.stats['deletions'][f] = commit.committer_date
-    elif change.type == 'R':
-      # Since we want to know when files are deleted, renames make it slightly
-      # harder to track.  When we have a rename, track that the files are
-      # equivalent; i.e. that they refer to different versions of same file.
-      oldname, newname = change.filename
-      old_tuple = args.stats['equivalence'].get(oldname, ())
-      if newname in old_tuple:
-        continue
-      if old_tuple:
-        new_tuple = tuple(list(old_tuple)+[newname])
-      else:
-        new_tuple = (oldname, newname)
-      for f in new_tuple:
-        args.stats['equivalence'][f] = new_tuple
-      # Note, we require that we get an 'M' for every 'R' since the rename
-      # comes without information about sha1sum.  So we can handle setting
-      # a few things for newname in the 'M' section below.
-    elif change.type == 'M':
-      args.stats['names'][change.blob_id].add(change.filename)
-      args.stats['allnames'].add(change.filename)
-      # If we get an 'M', clearly the file isn't deleted anymore
-      equiv = equiv_class(change.filename)
-      for f in equiv:
-        args.stats['deletions'].pop(f, None)
-      # If we get an 'M' for a file that wasn't the latest in a rename chain,
-      # then that equivalence class isn't valid anymore.
-      if equiv[-1] != change.filename:
-        for f in equiv:
-          if f in args.stats['equivalence']:
-            del args.stats['equivalence'][f]
-    else:
-      raise SystemExit("Unhandled change type: {}".format(change.type))
-
-  # We're just gathering data; don't spend time dumping the commit
-  commit.dumped = 2
+  def setup_equivalence_for_rename(stats, oldname, newname):
+    # if A is renamed to B and B is renamed to C, then the user thinks of
+    # A, B, and C as all being different names for the same 'file'.  We record
+    # this as an equivalence class:
+    #   stats['equivalence'][name] = (A,B,C)
+    # for name being each of A, B, and C.
+    old_tuple = stats['equivalence'].get(oldname, ())
+    if newname in old_tuple:
+      return
+    elif old_tuple:
+      new_tuple = tuple(list(old_tuple)+[newname])
+    else:
+      new_tuple = (oldname, newname)
+    for f in new_tuple:
+      stats['equivalence'][f] = new_tuple
+
+  def setup_or_update_rename_history(stats, commit, oldname, newname):
+    rename_commits = stats['rename_history'].get(oldname, set())
+    rename_commits.add(commit)
+    stats['rename_history'][oldname] = rename_commits
+
+  def handle_renames(stats, commit, change_types, filenames):
+    for index, change_type in enumerate(change_types):
+      if change_type == 'R':
+        oldname, newname = filenames[index], filenames[-1]
+        setup_equivalence_for_rename(stats, oldname, newname)
+        setup_or_update_rename_history(stats, commit, oldname, newname)
+
+  def handle_file(stats, graph, commit, modes, shas, filenames):
+    mode, sha, filename = modes[-1], shas[-1], filenames[-1]
+
+    # Figure out kind of deletions to undo for this file, and update lists
+    # of all-names-by-sha and all-filenames
+    delmode = 'tree_deletions'
+    if mode != '040000':
+      delmode = 'file_deletions'
+    stats['names'][sha].add(filename)
+    stats['allnames'].add(filename)
+
+    # If the file (or equivalence class of files) was recorded as deleted,
+    # clearly it isn't anymore
+    equiv = equiv_class(filename)
+    for f in equiv:
+      stats[delmode].pop(f, None)
+
+    # If we get a modify/add for a path that was renamed, we may need to break
+    # the equivalence class.  However, if the modify/add was on a branch that
+    # doesn't have the rename in its history, we are still okay.
+    need_to_break_equivalence = False
+    if equiv[-1] != filename:
+      for rename_commit in stats['rename_history'][filename]:
+        if graph.is_ancestor(rename_commit, commit):
+          need_to_break_equivalence = True
+
+    if need_to_break_equivalence:
+      for f in equiv:
+        if f in stats['equivalence']:
+          del stats['equivalence'][f]
+
+  graph.add_commit_and_parents(commit, parents)
+  for change in file_changes:
+    modes, shas, change_types, filenames = change
+    if len(parents) == 1 and change_types.startswith('R'):
+      change_types = 'R'  # remove the rename score; we don't care
+    if modes[-1] == '160000':
+      continue
+    elif modes[-1] == '000000':
+      # Track when files/directories are deleted; see 'R' below about equiv_class
+      for f in equiv_class(filenames[-1]):
+        if any(x == '040000' for x in modes[0:-1]):
+          stats['tree_deletions'][f] = date
+        else:
+          stats['file_deletions'][f] = date
+    elif change_types.strip('AMT') == '':
+      handle_file(stats, graph, commit, modes, shas, filenames)
+    elif modes[-1] == '040000' and change_types.strip('RAM') == '':
+      handle_file(stats, graph, commit, modes, shas, filenames)
+    elif change_types.strip('RAM') == '':
+      handle_file(stats, graph, commit, modes, shas, filenames)
+      handle_renames(stats, commit, change_types, filenames)
+    else:
+      raise SystemExit("Unhandled change type(s): {} (in commit {})"
+                       .format(change_types, commit))
 
 def gather_data(args):
   blob_size_progress = ProgressWriter()
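The rewritten analyze_commit() consults an AncestryGraph (constructed in gather_data() below) to decide whether a modify/add on one branch actually has a given rename in its ancestry. That class sits outside the hunks shown here; the following is a minimal sketch of just the interface used above, assuming a plain parent-pointer walk (filter-repo's real implementation is smarter about pruning):

    class AncestryGraph(object):
      def __init__(self):
        self._parents = {}  # commit -> list of parent commits

      def add_commit_and_parents(self, commit, parents):
        self._parents[commit] = list(parents)

      def is_ancestor(self, possible_ancestor, commit):
        # Depth-first walk of recorded parent links.
        todo, seen = [commit], set()
        while todo:
          current = todo.pop()
          if current == possible_ancestor:
            return True
          if current not in seen:
            seen.add(current)
            todo.extend(self._parents.get(current, []))
        return False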
@@ -1893,36 +1934,74 @@ def gather_data(args):
   blob_size_progress.finish()
   stats = {'names': collections.defaultdict(set),
            'allnames' : set(),
-           'deletions': {},
+           'file_deletions': {},
+           'tree_deletions': {},
            'equivalence': {},
+           'rename_history': collections.defaultdict(set),
            'unpacked_size': unpacked_size,
-           'packed_size': packed_size}
+           'packed_size': packed_size,
+           'num_commits': 0}
 
-  # Setup the fast-export process
-  fep_cmd = ['git', 'fast-export',
-             '-M',
-             '--no-data',
-             '--show-original-ids',
-             '--always-show-modify-after-rename',
-             '--signed-tags=strip',
-             '--tag-of-filtered-object=rewrite',
-             '--use-done-feature'] + args.refs
-  fep = subprocess.Popen(fep_cmd, stdout=subprocess.PIPE)
-  input = fep.stdout
-  output = open(os.devnull, 'w')
-
-  # Create and run the filter
-  setattr(args, 'stats', stats)
-  analyze_filter = FastExportFilter(
-      commit_callback = lambda c : analyze_commit(args, c),
-      )
-  analyze_filter.run(input, output, quiet = args.quiet)
-  setattr(args, 'num_commits', analyze_filter.num_commits_parsed())
-
-  # Close the output, ensure fast-export have completed
-  output.close()
-  if fep.wait():
-    raise SystemExit("Error: fast-export failed; see above.")
+  # Setup the rev-list/diff-tree process
+  commit_parse_progress = ProgressWriter()
+  num_commits = 0
+  cmd = 'git rev-list --topo-order --reverse {} | git diff-tree --stdin --always --root --format="%H%n%P%n%cd" --date=short -M -t -c --raw --combined-all-paths'.format(' '.join(args.refs))
+  dtp = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
+  f = dtp.stdout
+  line = f.next()
+  cont = bool(line)
+  graph = AncestryGraph()
+  while cont:
+    commit = line.rstrip()
+    parents = f.next().split()
+    date = f.next().rstrip()
+
+    # We expect a blank line next; if we get a non-blank line then
+    # this commit modified no files and we need to move on to the next.
+    # If there is no line, we've reached end-of-input.
+    try:
+      line = f.next().rstrip()
+      cont = True
+    except StopIteration:
+      cont = False
+
+    # If we haven't reached end of input, and we got a blank line meaning
+    # a commit that has modified files, then get the file changes associated
+    # with this commit.
+    file_changes = []
+    if cont and not line:
+      cont = False
+      for line in f:
+        if not line.startswith(':'):
+          cont = True
+          break
+        n = 1+max(1, len(parents))
+        assert line.startswith(':'*(n-1))
+        relevant = line[n-1:-1]
+        splits = relevant.split(None, n)
+        modes = splits[0:n]
+        splits = splits[n].split(None, n)
+        shas = splits[0:n]
+        splits = splits[n].split('\t')
+        change_types = splits[0]
+        filenames = [PathQuoting.dequote(x) for x in splits[1:]]
+        file_changes.append([modes, shas, change_types, filenames])
+
+    # Analyze this commit and update progress
+    analyze_commit(stats, graph, commit, parents, date, file_changes)
+    num_commits += 1
+    commit_parse_progress.show("Processed {} commits".format(num_commits))
+
+  # Show the final commits processed message and record the number of commits
+  commit_parse_progress.finish()
+  stats['num_commits'] = num_commits
+
+  # Close the output, ensure rev-list|diff-tree pipeline completed successfully
+  dtp.stdout.close()
+  if dtp.wait():
+    raise SystemExit("Error: rev-list|diff-tree pipeline failed; see above.")
+  return stats
 
 def do_analysis(args, git_dir):
   # Create the report file as necessary
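To make the raw-line parsing in the loop above concrete, here is the same arithmetic applied to a sample --combined-all-paths line for a two-parent merge (the '::' example is adapted from the git-diff-tree documentation; in the real code each filename additionally passes through PathQuoting.dequote()):

    line = ('::100644 100644 100644 fabadb8 cc95eb0 4866510 MM\t'
            'desc.c\tdesc.c\tdesc.c\n')
    parents = ['parent1', 'parent2']        # a merge commit, so two parents

    n = 1 + max(1, len(parents))            # n == 3
    assert line.startswith(':' * (n - 1))   # merge lines begin with '::'
    relevant = line[n - 1:-1]               # strip leading colons and newline
    splits = relevant.split(None, n)
    modes = splits[0:n]                     # ['100644', '100644', '100644']
    splits = splits[n].split(None, n)
    shas = splits[0:n]                      # ['fabadb8', 'cc95eb0', '4866510']
    splits = splits[n].split('\t')
    change_types = splits[0]                # 'MM' (one status char per parent)
    filenames = splits[1:]                  # ['desc.c', 'desc.c', 'desc.c']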
@@ -1936,10 +2015,10 @@ def do_analysis(args, git_dir):
     os.mkdir(reportdir)
 
   # Now gather the data we need
-  gather_data(args)
+  stats = gather_data(args)
 
-  def datestr(datetimeobj):
-    return datetimeobj.strftime('%F') if datetimeobj else '<present>'
+  def datestr(datetimestr):
+    return datetimestr if datetimestr else '<present>'
 
   def dirnames(path):
     while True:
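A small behavioral note on the datestr() change: dates now arrive pre-formatted as strings from diff-tree's --date=short output rather than as datetime objects, so the function reduces to a null check. Illustrative only:

    datestr('2018-11-20')   # -> '2018-11-20'  (already short-format)
    datestr(None)           # -> '<present>'   (path never deleted)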
@@ -1956,11 +2035,11 @@ def do_analysis(args, git_dir):
               'unpacked': collections.defaultdict(int)}
   dir_size = {'packed': collections.defaultdict(int),
               'unpacked': collections.defaultdict(int)}
-  for sha in args.stats['names']:
-    size = {'packed': args.stats['packed_size'][sha],
-            'unpacked': args.stats['unpacked_size'][sha]}
+  for sha in stats['names']:
+    size = {'packed': stats['packed_size'][sha],
+            'unpacked': stats['unpacked_size'][sha]}
     for which in ('packed', 'unpacked'):
-      for name in args.stats['names'][sha]:
+      for name in stats['names'][sha]:
         total_size[which] += size[which]
         path_size[which][name] += size[which]
         basename, ext = os.path.splitext(name)
@@ -1970,9 +2049,8 @@ def do_analysis(args, git_dir):
 
   # Determine if and when extensions and directories were deleted
   ext_deleted_data = {}
-  dir_deleted_data = {}
-  for name in args.stats['allnames']:
-    when = args.stats['deletions'].get(name, None)
+  for name in stats['allnames']:
+    when = stats['file_deletions'].get(name, None)
 
     # Update the extension
     basename, ext = os.path.splitext(name)
@@ -1984,20 +2062,14 @@ def do_analysis(args, git_dir):
     else:
       ext_deleted_data[ext] = when
 
-    # Update the dirs
-    for dirname in dirnames(name):
-      if when is None:
-        dir_deleted_data[dirname] = None
-      elif dirname in dir_deleted_data:
-        if dir_deleted_data[dirname] is not None:
-          dir_deleted_data[dirname] = max(dir_deleted_data[dirname], when)
-      else:
-        dir_deleted_data[dirname] = when
+  dir_deleted_data = {}
+  for name in dir_size['packed']:
+    dir_deleted_data[name] = stats['tree_deletions'].get(name, None)
 
   with open(os.path.join(reportdir, "README"), 'w') as f:
     # Give a basic overview of this file
     f.write("== Overal Statistics ==\n")
-    f.write("  Number of commits: {}\n".format(args.num_commits))
+    f.write("  Number of commits: {}\n".format(stats['num_commits']))
     f.write("  Number of filenames: {}\n".format(len(path_size['packed'])))
     f.write("  Number of directories: {}\n".format(len(dir_size['packed'])))
     f.write("  Number of file extensions: {}\n".format(len(ext_size['packed'])))
@@ -2071,17 +2143,6 @@ def do_analysis(args, git_dir):
       anymore.  We could try to portray this to the user, but it's easier
       for the user to just break the pairing and only report unbroken
       rename pairings to the user.
-    * Since modifying a renamed file on the side of history that doesn't
-      rename it should be expected to be common (unlike modifying a deleted
-      file on the side of history that doesn't delete it), tracking history
-      becomes more important to avoid incorrectly breaking rename chains.
-      This has not yet been implemented.  This seriously raises the risk
-      of erroneously breaking rename pairings; a future release may address
-      this shortcoming.
-    * We only use rename detection, not copy detection.  However, that
-      means that if some commit in history renamed two files into the same
-      location, we won't pick up one of the two renames and will instead
-      report that branch as having been deleted.
     * The ability for users to rename files differently in different
       branches means that our chains of renames will not necessarily be
       linear but may branch out.
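A hypothetical illustration of that last bullet, treating the nested setup_equivalence_for_rename() helper from the first hunk as if it were callable standalone:

    stats = {'equivalence': {}}
    setup_equivalence_for_rename(stats, 'util.py', 'helpers.py')   # branch1
    setup_equivalence_for_rename(stats, 'util.py', 'lib/util.py')  # branch2
    # stats['equivalence']['util.py'] is now
    #   ('util.py', 'helpers.py', 'lib/util.py'),
    # a 'chain' that never existed linearly on any single branch.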
@@ -2093,7 +2154,7 @@ def do_analysis(args, git_dir):
   # too.
   with open(os.path.join(reportdir, "renames.txt"), 'w') as f:
     seen = set()
-    for pathname,equiv_group in sorted(args.stats['equivalence'].iteritems(),
+    for pathname,equiv_group in sorted(stats['equivalence'].iteritems(),
                                        key=lambda x:x[1]):
       if equiv_group in seen:
         continue
@@ -2156,7 +2217,7 @@ def do_analysis(args, git_dir):
     f.write("Format: unpacked size, packed size, date deleted, path name(s)\n")
     for pathname, size in sorted(path_size['packed'].iteritems(),
                                  key=lambda x:x[1], reverse=True):
-      when = args.stats['deletions'].get(pathname, None)
+      when = stats['file_deletions'].get(pathname, None)
       if when:
         f.write("  {:10d} {:10d} {:10s} {}\n"
                 .format(path_size['unpacked'][pathname],
@@ -2169,7 +2230,7 @@ def do_analysis(args, git_dir):
     f.write("Format: unpacked size, packed size, date deleted, pathectory name\n")
     for pathname, size in sorted(path_size['packed'].iteritems(),
                                  key=lambda x:x[1], reverse=True):
-      when = args.stats['deletions'].get(pathname, None)
+      when = stats['file_deletions'].get(pathname, None)
       f.write("  {:10d} {:10d} {:10s} {}\n"
               .format(path_size['unpacked'][pathname],
                       size,
@@ -2180,19 +2241,19 @@ def do_analysis(args, git_dir):
 
   with open(os.path.join(reportdir, "blob-shas-and-paths.txt"), 'w') as f:
     f.write("== Files by sha and associated pathnames in reverse size ==\n")
     f.write("Format: sha, unpacked size, packed size, filename(s) object stored as\n")
-    for sha, size in sorted(args.stats['packed_size'].iteritems(),
+    for sha, size in sorted(stats['packed_size'].iteritems(),
                             key=lambda x:x[1], reverse=True):
-      if sha not in args.stats['names']:
+      if sha not in stats['names']:
         # Some objects in the repository might not be referenced, or not
         # referenced by the branches/tags the user cares about; skip them.
         continue
-      names_with_sha = args.stats['names'][sha]
+      names_with_sha = stats['names'][sha]
       if len(names_with_sha) == 1:
         names_with_sha = names_with_sha.pop()
       else:
         names_with_sha = sorted(list(names_with_sha))
       f.write("  {} {:10d} {:10d} {}\n".format(sha,
-              args.stats['unpacked_size'][sha],
+              stats['unpacked_size'][sha],
               size,
               names_with_sha))
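For a sense of the resulting report, a hypothetical excerpt of blob-shas-and-paths.txt as written by the code above (shas, sizes, and filenames invented):

    == Files by sha and associated pathnames in reverse size ==
    Format: sha, unpacked size, packed size, filename(s) object stored as
      8f6e5c4d2b1a0f...    2048576     524288 ['art/logo.psd', 'assets/logo.psd']
      3b1d5a7c22ef90...     102400      20480 src/main.c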