Mirror of https://github.com/newren/git-filter-repo.git, synced 2024-07-06 18:32:14 +02:00
filter-repo: switch --analyze to use rev-list|diff-tree pipeline
As suggested by Peff, use rev-list & diff-tree to get the information we need, instead of relying on fast-export (with some out-of-tree patches) to get that information.

Signed-off-by: Elijah Newren <newren@gmail.com>
parent beff0b958f
commit 554c7e39af
git-filter-repo
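For context before the diff: with --format="%H%n%P%n%cd", diff-tree prints each commit's hash, parent list, and committer date on three lines, followed by that commit's ':'-prefixed raw change records. A minimal, hypothetical Python sketch of driving such a pipeline (illustration only, not the code in this commit; the --all ref selection and the printing are made up):

import subprocess

# Stream commits oldest-first together with their raw file changes.
# --format makes diff-tree print hash, parents, and date on three lines
# before each commit's ':'-prefixed raw change records.
cmd = ('git rev-list --topo-order --reverse --all'
       ' | git diff-tree --stdin --always --root'
       ' --format="%H%n%P%n%cd" --date=short -M -t -c --raw'
       ' --combined-all-paths')
proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE,
                        universal_newlines=True)
for line in proc.stdout:
    if line.startswith(':'):
        print('change: ' + line.rstrip())  # raw file-change record
    elif line.rstrip():
        print('header: ' + line.rstrip())  # hash, parent list, or date
proc.stdout.close()
if proc.wait():
    raise SystemExit('rev-list|diff-tree pipeline failed')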
@@ -1824,52 +1824,93 @@ def get_refs():
     output = ''
   return dict(reversed(x.split()) for x in output.splitlines())
 
-def analyze_commit(args, commit):
+def analyze_commit(stats, graph, commit, parents, date, file_changes):
   def equiv_class(filename):
-    return args.stats['equivalence'].get(filename, (filename,))
+    return stats['equivalence'].get(filename, (filename,))
 
-  for change in commit.file_changes:
-    if change.mode == '160000':
-      continue
-    if change.type == 'D':
-      # Track when files are deleted; see 'R' below about equiv_class
-      for f in equiv_class(change.filename):
-        args.stats['deletions'][f] = commit.committer_date
-    elif change.type == 'R':
-      # Since we want to know when files are deleted, renames make it slightly
-      # harder to track.  When we have a rename, track that the files are
-      # equivalent; i.e. that they refer to different versions of same file.
-      oldname, newname = change.filename
-      old_tuple = args.stats['equivalence'].get(oldname, ())
-      if newname in old_tuple:
-        continue
-      if old_tuple:
-        new_tuple = tuple(list(old_tuple)+[newname])
-      else:
-        new_tuple = (oldname, newname)
-      for f in new_tuple:
-        args.stats['equivalence'][f] = new_tuple
-      # Note, we require that we get an 'M' for every 'R' since the rename
-      # comes without information about sha1sum.  So we can handle setting
-      # a few things for newname in the 'M' section below.
-    elif change.type == 'M':
-      args.stats['names'][change.blob_id].add(change.filename)
-      args.stats['allnames'].add(change.filename)
-      # If we get an 'M', clearly the file isn't deleted anymore
-      equiv = equiv_class(change.filename)
-      for f in equiv:
-        args.stats['deletions'].pop(f, None)
-      # If we get an 'M' for a file that wasn't the latest in a rename chain,
-      # then that equivalence class isn't valid anymore.
-      if equiv[-1] != change.filename:
-        for f in equiv:
-          if f in args.stats['equivalence']:
-            del args.stats['equivalence'][f]
-    else:
-      raise SystemExit("Unhandled change type: {}".format(change.type))
-
-  # We're just gathering data; don't spend time dumping the commit
-  commit.dumped = 2
+  def setup_equivalence_for_rename(stats, oldname, newname):
+    # if A is renamed to B and B is renamed to C, then the user thinks of
+    # A, B, and C as all being different names for the same 'file'.  We record
+    # this as an equivalence class:
+    #   stats['equivalence'][name] = (A,B,C)
+    # for name being each of A, B, and C.
+    old_tuple = stats['equivalence'].get(oldname, ())
+    if newname in old_tuple:
+      return
+    elif old_tuple:
+      new_tuple = tuple(list(old_tuple)+[newname])
+    else:
+      new_tuple = (oldname, newname)
+    for f in new_tuple:
+      stats['equivalence'][f] = new_tuple
+
+  def setup_or_update_rename_history(stats, commit, oldname, newname):
+    rename_commits = stats['rename_history'].get(oldname, set())
+    rename_commits.add(commit)
+    stats['rename_history'][oldname] = rename_commits
+
+  def handle_renames(stats, commit, change_types, filenames):
+    for index, change_type in enumerate(change_types):
+      if change_type == 'R':
+        oldname, newname = filenames[index], filenames[-1]
+        setup_equivalence_for_rename(stats, oldname, newname)
+        setup_or_update_rename_history(stats, commit, oldname, newname)
+
+  def handle_file(stats, graph, commit, modes, shas, filenames):
+    mode, sha, filename = modes[-1], shas[-1], filenames[-1]
+
+    # Figure out kind of deletions to undo for this file, and update lists
+    # of all-names-by-sha and all-filenames
+    delmode = 'tree_deletions'
+    if mode != '040000':
+      delmode = 'file_deletions'
+      stats['names'][sha].add(filename)
+      stats['allnames'].add(filename)
+
+    # If the file (or equivalence class of files) was recorded as deleted,
+    # clearly it isn't anymore
+    equiv = equiv_class(filename)
+    for f in equiv:
+      stats[delmode].pop(f, None)
+
+    # If we get a modify/add for a path that was renamed, we may need to break
+    # the equivalence class.  However, if the modify/add was on a branch that
+    # doesn't have the rename in its history, we are still okay.
+    need_to_break_equivalence = False
+    if equiv[-1] != filename:
+      for rename_commit in stats['rename_history'][filename]:
+        if graph.is_ancestor(rename_commit, commit):
+          need_to_break_equivalence = True
+
+    if need_to_break_equivalence:
+      for f in equiv:
+        if f in stats['equivalence']:
+          del stats['equivalence'][f]
+
+  graph.add_commit_and_parents(commit, parents)
+  for change in file_changes:
+    modes, shas, change_types, filenames = change
+    if len(parents) == 1 and change_types.startswith('R'):
+      change_types = 'R'  # remove the rename score; we don't care
+    if modes[-1] == '160000':
+      continue
+    elif modes[-1] == '000000':
+      # Track when files/directories are deleted; see 'R' below about equiv_class
+      for f in equiv_class(filenames[-1]):
+        if any(x == '040000' for x in modes[0:-1]):
+          stats['tree_deletions'][f] = date
+        else:
+          stats['file_deletions'][f] = date
+    elif change_types.strip('AMT') == '':
+      handle_file(stats, graph, commit, modes, shas, filenames)
+    elif modes[-1] == '040000' and change_types.strip('RAM') == '':
+      handle_file(stats, graph, commit, modes, shas, filenames)
+    elif change_types.strip('RAM') == '':
+      handle_file(stats, graph, commit, modes, shas, filenames)
+      handle_renames(stats, commit, change_types, filenames)
+    else:
+      raise SystemExit("Unhandled change type(s): {} (in commit {})"
+                       .format(change_types, commit))
 
 def gather_data(args):
   blob_size_progress = ProgressWriter()
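To see the equivalence bookkeeping in setup_equivalence_for_rename at work, here is the same logic as a hypothetical standalone function, walked through a two-step rename chain A -> B -> C (filenames made up for illustration):

# Standalone copy of the equivalence logic from the hunk above.
def setup_equivalence_for_rename(stats, oldname, newname):
    old_tuple = stats['equivalence'].get(oldname, ())
    if newname in old_tuple:
        return
    elif old_tuple:
        new_tuple = tuple(list(old_tuple) + [newname])
    else:
        new_tuple = (oldname, newname)
    for f in new_tuple:
        stats['equivalence'][f] = new_tuple

stats = {'equivalence': {}}
setup_equivalence_for_rename(stats, 'A', 'B')
# stats['equivalence'] == {'A': ('A', 'B'), 'B': ('A', 'B')}
setup_equivalence_for_rename(stats, 'B', 'C')
# Now every name maps to the full chain:
# {'A': ('A', 'B', 'C'), 'B': ('A', 'B', 'C'), 'C': ('A', 'B', 'C')}
print(stats['equivalence'])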
@@ -1893,36 +1934,74 @@ def gather_data(args):
   blob_size_progress.finish()
   stats = {'names': collections.defaultdict(set),
            'allnames' : set(),
-           'deletions': {},
+           'file_deletions': {},
+           'tree_deletions': {},
            'equivalence': {},
+           'rename_history': collections.defaultdict(set),
            'unpacked_size': unpacked_size,
-           'packed_size': packed_size}
+           'packed_size': packed_size,
+           'num_commits': 0}
 
-  # Setup the fast-export process
-  fep_cmd = ['git', 'fast-export',
-             '-M',
-             '--no-data',
-             '--show-original-ids',
-             '--always-show-modify-after-rename',
-             '--signed-tags=strip',
-             '--tag-of-filtered-object=rewrite',
-             '--use-done-feature'] + args.refs
-  fep = subprocess.Popen(fep_cmd, stdout=subprocess.PIPE)
-  input = fep.stdout
-  output = open(os.devnull, 'w')
-
-  # Create and run the filter
-  setattr(args, 'stats', stats)
-  analyze_filter = FastExportFilter(
-    commit_callback = lambda c : analyze_commit(args, c),
-  )
-  analyze_filter.run(input, output, quiet = args.quiet)
-  setattr(args, 'num_commits', analyze_filter.num_commits_parsed())
-
-  # Close the output, ensure fast-export has completed
-  output.close()
-  if fep.wait():
-    raise SystemExit("Error: fast-export failed; see above.")
+  # Setup the rev-list/diff-tree process
+  commit_parse_progress = ProgressWriter()
+  num_commits = 0
+  cmd = 'git rev-list --topo-order --reverse {} | git diff-tree --stdin --always --root --format="%H%n%P%n%cd" --date=short -M -t -c --raw --combined-all-paths'.format(' '.join(args.refs))
+  dtp = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
+  f = dtp.stdout
+  line = f.next()
+  cont = bool(line)
+  graph = AncestryGraph()
+  while cont:
+    commit = line.rstrip()
+    parents = f.next().split()
+    date = f.next().rstrip()
+
+    # We expect a blank line next; if we get a non-blank line then
+    # this commit modified no files and we need to move on to the next.
+    # If there is no line, we've reached end-of-input.
+    try:
+      line = f.next().rstrip()
+      cont = True
+    except StopIteration:
+      cont = False
+
+    # If we haven't reached end of input, and we got a blank line meaning
+    # a commit that has modified files, then get the file changes associated
+    # with this commit.
+    file_changes = []
+    if cont and not line:
+      cont = False
+      for line in f:
+        if not line.startswith(':'):
+          cont = True
+          break
+        n = 1+max(1, len(parents))
+        assert line.startswith(':'*(n-1))
+        relevant = line[n-1:-1]
+        splits = relevant.split(None, n)
+        modes = splits[0:n]
+        splits = splits[n].split(None, n)
+        shas = splits[0:n]
+        splits = splits[n].split('\t')
+        change_types = splits[0]
+        filenames = [PathQuoting.dequote(x) for x in splits[1:]]
+        file_changes.append([modes, shas, change_types, filenames])
+
+    # Analyze this commit and update progress
+    analyze_commit(stats, graph, commit, parents, date, file_changes)
+    num_commits += 1
+    commit_parse_progress.show("Processed {} commits".format(num_commits))
+
+  # Show the final commits processed message and record the number of commits
+  commit_parse_progress.finish()
+  stats['num_commits'] = num_commits
+
+  # Close the output, ensure rev-list|diff-tree pipeline completed successfully
+  dtp.stdout.close()
+  if dtp.wait():
+    raise SystemExit("Error: rev-list|diff-tree pipeline failed; see above.")
+
+  return stats
 
 def do_analysis(args, git_dir):
   # Create the report file as necessary
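As a worked example of the parsing loop in the hunk above: for a merge commit with two parents, n = 3, and a combined raw line carries n-1 leading colons, n modes, n shas, the combined status letters, and then tab-separated path(s). A hedged standalone sketch with a made-up line (real shas are 40 hex digits, and the real code also runs PathQuoting.dequote on each path):

# Hypothetical combined raw line for a two-parent merge commit (n = 3).
line = '::100644 100644 100644 aaaaaaa bbbbbbb ccccccc MM\tREADME.md\n'
parents = ['parent1', 'parent2']

n = 1 + max(1, len(parents))
assert line.startswith(':' * (n - 1))
relevant = line[n - 1:-1]           # strip leading colons and the newline
splits = relevant.split(None, n)
modes = splits[0:n]                 # ['100644', '100644', '100644']
splits = splits[n].split(None, n)
shas = splits[0:n]                  # ['aaaaaaa', 'bbbbbbb', 'ccccccc']
splits = splits[n].split('\t')
change_types = splits[0]            # 'MM': one status letter per parent
filenames = splits[1:]              # ['README.md']
print([modes, shas, change_types, filenames])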
@@ -1936,10 +2015,10 @@ def do_analysis(args, git_dir):
     os.mkdir(reportdir)
 
   # Now gather the data we need
-  gather_data(args)
+  stats = gather_data(args)
 
-  def datestr(datetimeobj):
-    return datetimeobj.strftime('%F') if datetimeobj else '<present>'
+  def datestr(datetimestr):
+    return datetimestr if datetimestr else '<present>'
 
   def dirnames(path):
     while True:
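The datestr rewrite above follows from the change of data source: fast-export parsing yielded datetime objects that needed strftime, while %cd with --date=short already arrives as a 'YYYY-MM-DD' string. A small illustration (dates made up):

import datetime

# Old pipeline: committer dates were datetime objects.
old_when = datetime.datetime(2019, 2, 13)
print(old_when.strftime('%F') if old_when else '<present>')  # 2019-02-13

# New pipeline: --date=short already gives 'YYYY-MM-DD' strings;
# None/empty still means "not deleted", hence '<present>'.
new_when = '2019-02-13'
print(new_when if new_when else '<present>')                 # 2019-02-13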
@@ -1956,11 +2035,11 @@
               'unpacked': collections.defaultdict(int)}
   dir_size = {'packed': collections.defaultdict(int),
               'unpacked': collections.defaultdict(int)}
-  for sha in args.stats['names']:
-    size = {'packed': args.stats['packed_size'][sha],
-            'unpacked': args.stats['unpacked_size'][sha]}
+  for sha in stats['names']:
+    size = {'packed': stats['packed_size'][sha],
+            'unpacked': stats['unpacked_size'][sha]}
     for which in ('packed', 'unpacked'):
-      for name in args.stats['names'][sha]:
+      for name in stats['names'][sha]:
         total_size[which] += size[which]
         path_size[which][name] += size[which]
         basename, ext = os.path.splitext(name)
@@ -1970,9 +2049,8 @@
 
   # Determine if and when extensions and directories were deleted
   ext_deleted_data = {}
-  dir_deleted_data = {}
-  for name in args.stats['allnames']:
-    when = args.stats['deletions'].get(name, None)
+  for name in stats['allnames']:
+    when = stats['file_deletions'].get(name, None)
 
     # Update the extension
     basename, ext = os.path.splitext(name)
@@ -1984,20 +2062,14 @@
     else:
       ext_deleted_data[ext] = when
 
-    # Update the dirs
-    for dirname in dirnames(name):
-      if when is None:
-        dir_deleted_data[dirname] = None
-      elif dirname in dir_deleted_data:
-        if dir_deleted_data[dirname] is not None:
-          dir_deleted_data[dirname] = max(dir_deleted_data[dirname], when)
-      else:
-        dir_deleted_data[dirname] = when
+  dir_deleted_data = {}
+  for name in dir_size['packed']:
+    dir_deleted_data[name] = stats['tree_deletions'].get(name, None)
 
   with open(os.path.join(reportdir, "README"), 'w') as f:
     # Give a basic overview of this file
     f.write("== Overall Statistics ==\n")
-    f.write("  Number of commits: {}\n".format(args.num_commits))
+    f.write("  Number of commits: {}\n".format(stats['num_commits']))
     f.write("  Number of filenames: {}\n".format(len(path_size['packed'])))
     f.write("  Number of directories: {}\n".format(len(dir_size['packed'])))
     f.write("  Number of file extensions: {}\n".format(len(ext_size['packed'])))
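The simplification above works because diff-tree is invoked with -t, so directory (tree) deletions land directly in stats['tree_deletions'] during analyze_commit; deletion dates no longer have to be inferred by walking each file's dirnames. A small illustration with made-up data:

# Made-up data: one deleted directory, one live one.
stats = {'tree_deletions': {'src/old': '2019-01-31'}}
dir_size = {'packed': {'src/old': 1234, 'src/new': 5678}}

dir_deleted_data = {}
for name in dir_size['packed']:
    dir_deleted_data[name] = stats['tree_deletions'].get(name, None)
print(dir_deleted_data)  # {'src/old': '2019-01-31', 'src/new': None}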
@@ -2071,17 +2143,6 @@
         anymore.  We could try to portray this to the user, but it's easier
         for the user to just break the pairing and only report unbroken
         rename pairings to the user.
-      * Since modifying a renamed file on the side of history that doesn't
-        rename it should be expected to be common (unlike modifying a deleted
-        file on the side of history that doesn't delete it), tracking history
-        becomes more important to avoid incorrectly breaking rename chains.
-        This has not yet been implemented.  This seriously raises the risk
-        of erroneously breaking rename pairings; a future release may address
-        this shortcoming.
-      * We only use rename detection, not copy detection.  However, that
-        means that if some commit in history renamed two files into the same
-        location, we won't pick up one of the two renames and will instead
-        report that branch as having been deleted.
       * The ability for users to rename files differently in different
         branches means that our chains of renames will not necessarily be
         linear but may branch out.
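The removed caveat above ("tracking history ... has not yet been implemented") is exactly what the new AncestryGraph addresses: stats['rename_history'] remembers which commits performed each rename, and handle_file only breaks an equivalence class when one of those commits is an ancestor of the commit doing the modify. A minimal sketch of that idea; git-filter-repo's actual AncestryGraph is a more elaborate structure, so treat this as an assumption-laden stand-in:

# Simplified illustrative ancestry graph; NOT git-filter-repo's
# real AncestryGraph implementation.
class SimpleAncestryGraph:
    def __init__(self):
        self._parents = {}

    def add_commit_and_parents(self, commit, parents):
        self._parents[commit] = list(parents)

    def is_ancestor(self, possible_ancestor, check):
        # Walk ancestors of 'check' until we find (or exhaust) candidates.
        todo = [check]
        seen = set()
        while todo:
            commit = todo.pop()
            if commit == possible_ancestor:
                return True
            if commit not in seen:
                seen.add(commit)
                todo.extend(self._parents.get(commit, []))
        return False

graph = SimpleAncestryGraph()
graph.add_commit_and_parents('A', [])
graph.add_commit_and_parents('B', ['A'])
graph.add_commit_and_parents('C', ['A'])  # sibling branch
print(graph.is_ancestor('A', 'C'))  # True
print(graph.is_ancestor('B', 'C'))  # False: a rename on B's branch does not
                                    # break equivalence for a modify on C's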
@@ -2093,7 +2154,7 @@
   # too.
   with open(os.path.join(reportdir, "renames.txt"), 'w') as f:
     seen = set()
-    for pathname,equiv_group in sorted(args.stats['equivalence'].iteritems(),
+    for pathname,equiv_group in sorted(stats['equivalence'].iteritems(),
                                        key=lambda x:x[1]):
       if equiv_group in seen:
         continue
@@ -2156,7 +2217,7 @@
     f.write("Format: unpacked size, packed size, date deleted, path name(s)\n")
     for pathname, size in sorted(path_size['packed'].iteritems(),
                                  key=lambda x:x[1], reverse=True):
-      when = args.stats['deletions'].get(pathname, None)
+      when = stats['file_deletions'].get(pathname, None)
       if when:
         f.write("  {:10d} {:10d} {:10s} {}\n"
                 .format(path_size['unpacked'][pathname],
@@ -2169,7 +2230,7 @@
     f.write("Format: unpacked size, packed size, date deleted, directory name\n")
     for pathname, size in sorted(path_size['packed'].iteritems(),
                                  key=lambda x:x[1], reverse=True):
-      when = args.stats['deletions'].get(pathname, None)
+      when = stats['file_deletions'].get(pathname, None)
       f.write("  {:10d} {:10d} {:10s} {}\n"
               .format(path_size['unpacked'][pathname],
                       size,
@@ -2180,19 +2241,19 @@
   with open(os.path.join(reportdir, "blob-shas-and-paths.txt"), 'w') as f:
     f.write("== Files by sha and associated pathnames in reverse size ==\n")
     f.write("Format: sha, unpacked size, packed size, filename(s) object stored as\n")
-    for sha, size in sorted(args.stats['packed_size'].iteritems(),
+    for sha, size in sorted(stats['packed_size'].iteritems(),
                             key=lambda x:x[1], reverse=True):
-      if sha not in args.stats['names']:
+      if sha not in stats['names']:
         # Some objects in the repository might not be referenced, or not
         # referenced by the branches/tags the user cares about; skip them.
         continue
-      names_with_sha = args.stats['names'][sha]
+      names_with_sha = stats['names'][sha]
       if len(names_with_sha) == 1:
         names_with_sha = names_with_sha.pop()
      else:
         names_with_sha = sorted(list(names_with_sha))
       f.write("  {} {:10d} {:10d} {}\n".format(sha,
-                                               args.stats['unpacked_size'][sha],
+                                               stats['unpacked_size'][sha],
                                                size,
                                                names_with_sha))
 