mirror of
https://github.com/newren/git-filter-repo.git
synced 2024-07-06 18:32:14 +02:00
filter-repo: group repo analysis functions into a class
Signed-off-by: Elijah Newren <newren@gmail.com>
This commit is contained in:
parent
9887dd5cbe
commit
4e2110136e
734
git-filter-repo
734
git-filter-repo
@ -1891,10 +1891,15 @@ class FilteringOptions(object):
|
|||||||
FilteringOptions.sanity_check_args(args)
|
FilteringOptions.sanity_check_args(args)
|
||||||
return args
|
return args
|
||||||
|
|
||||||
def analyze_commit(stats, graph, commit, parents, date, file_changes):
|
class RepoAnalyze(object):
|
||||||
def equiv_class(filename):
|
|
||||||
|
# First, several helper functions for analyze_commit()
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def equiv_class(stats, filename):
|
||||||
return stats['equivalence'].get(filename, (filename,))
|
return stats['equivalence'].get(filename, (filename,))
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
def setup_equivalence_for_rename(stats, oldname, newname):
|
def setup_equivalence_for_rename(stats, oldname, newname):
|
||||||
# if A is renamed to B and B is renamed to C, then the user thinks of
|
# if A is renamed to B and B is renamed to C, then the user thinks of
|
||||||
# A, B, and C as all being different names for the same 'file'. We record
|
# A, B, and C as all being different names for the same 'file'. We record
|
||||||
@ -1911,18 +1916,22 @@ def analyze_commit(stats, graph, commit, parents, date, file_changes):
|
|||||||
for f in new_tuple:
|
for f in new_tuple:
|
||||||
stats['equivalence'][f] = new_tuple
|
stats['equivalence'][f] = new_tuple
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
def setup_or_update_rename_history(stats, commit, oldname, newname):
|
def setup_or_update_rename_history(stats, commit, oldname, newname):
|
||||||
rename_commits = stats['rename_history'].get(oldname, set())
|
rename_commits = stats['rename_history'].get(oldname, set())
|
||||||
rename_commits.add(commit)
|
rename_commits.add(commit)
|
||||||
stats['rename_history'][oldname] = rename_commits
|
stats['rename_history'][oldname] = rename_commits
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
def handle_renames(stats, commit, change_types, filenames):
|
def handle_renames(stats, commit, change_types, filenames):
|
||||||
for index, change_type in enumerate(change_types):
|
for index, change_type in enumerate(change_types):
|
||||||
if change_type == 'R':
|
if change_type == 'R':
|
||||||
oldname, newname = filenames[index], filenames[-1]
|
oldname, newname = filenames[index], filenames[-1]
|
||||||
setup_equivalence_for_rename(stats, oldname, newname)
|
RepoAnalyze.setup_equivalence_for_rename(stats, oldname, newname)
|
||||||
setup_or_update_rename_history(stats, commit, oldname, newname)
|
RepoAnalyze.setup_or_update_rename_history(stats, commit,
|
||||||
|
oldname, newname)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
def handle_file(stats, graph, commit, modes, shas, filenames):
|
def handle_file(stats, graph, commit, modes, shas, filenames):
|
||||||
mode, sha, filename = modes[-1], shas[-1], filenames[-1]
|
mode, sha, filename = modes[-1], shas[-1], filenames[-1]
|
||||||
|
|
||||||
@ -1936,7 +1945,7 @@ def analyze_commit(stats, graph, commit, parents, date, file_changes):
|
|||||||
|
|
||||||
# If the file (or equivalence class of files) was recorded as deleted,
|
# If the file (or equivalence class of files) was recorded as deleted,
|
||||||
# clearly it isn't anymore
|
# clearly it isn't anymore
|
||||||
equiv = equiv_class(filename)
|
equiv = RepoAnalyze.equiv_class(stats, filename)
|
||||||
for f in equiv:
|
for f in equiv:
|
||||||
stats[delmode].pop(f, None)
|
stats[delmode].pop(f, None)
|
||||||
|
|
||||||
@ -1954,383 +1963,392 @@ def analyze_commit(stats, graph, commit, parents, date, file_changes):
|
|||||||
if f in stats['equivalence']:
|
if f in stats['equivalence']:
|
||||||
del stats['equivalence'][f]
|
del stats['equivalence'][f]
|
||||||
|
|
||||||
graph.add_commit_and_parents(commit, parents)
|
@staticmethod
|
||||||
for change in file_changes:
|
def analyze_commit(stats, graph, commit, parents, date, file_changes):
|
||||||
modes, shas, change_types, filenames = change
|
graph.add_commit_and_parents(commit, parents)
|
||||||
if len(parents) == 1 and change_types.startswith('R'):
|
for change in file_changes:
|
||||||
change_types = 'R' # remove the rename score; we don't care
|
modes, shas, change_types, filenames = change
|
||||||
if modes[-1] == '160000':
|
if len(parents) == 1 and change_types.startswith('R'):
|
||||||
continue
|
change_types = 'R' # remove the rename score; we don't care
|
||||||
elif modes[-1] == '000000':
|
if modes[-1] == '160000':
|
||||||
# Track when files/directories are deleted; see 'R' below about equiv_class
|
|
||||||
for f in equiv_class(filenames[-1]):
|
|
||||||
if any(x == '040000' for x in modes[0:-1]):
|
|
||||||
stats['tree_deletions'][f] = date
|
|
||||||
else:
|
|
||||||
stats['file_deletions'][f] = date
|
|
||||||
elif change_types.strip('AMT') == '':
|
|
||||||
handle_file(stats, graph, commit, modes, shas, filenames)
|
|
||||||
elif modes[-1] == '040000' and change_types.strip('RAM') == '':
|
|
||||||
handle_file(stats, graph, commit, modes, shas, filenames)
|
|
||||||
elif change_types.strip('RAM') == '':
|
|
||||||
handle_file(stats, graph, commit, modes, shas, filenames)
|
|
||||||
handle_renames(stats, commit, change_types, filenames)
|
|
||||||
else:
|
|
||||||
raise SystemExit("Unhandled change type(s): {} (in commit {})"
|
|
||||||
.format(change_types, commit))
|
|
||||||
|
|
||||||
def gather_data(args):
|
|
||||||
blob_size_progress = ProgressWriter()
|
|
||||||
num_blobs = 0
|
|
||||||
|
|
||||||
# Get sizes of blobs by sha1
|
|
||||||
a='--batch-check=%(objectname) %(objecttype) %(objectsize) %(objectsize:disk)'
|
|
||||||
cf = subprocess.Popen(['git', 'cat-file', '--batch-all-objects', a],
|
|
||||||
bufsize = -1,
|
|
||||||
stdout = subprocess.PIPE)
|
|
||||||
unpacked_size = {}
|
|
||||||
packed_size = {}
|
|
||||||
for line in cf.stdout:
|
|
||||||
sha, objtype, objsize, objdisksize = line.split()
|
|
||||||
objsize, objdisksize = int(objsize), int(objdisksize)
|
|
||||||
if objtype == 'blob':
|
|
||||||
unpacked_size[sha] = objsize
|
|
||||||
packed_size[sha] = objdisksize
|
|
||||||
num_blobs += 1
|
|
||||||
blob_size_progress.show("Processed {} blob sizes".format(num_blobs))
|
|
||||||
cf.wait()
|
|
||||||
blob_size_progress.finish()
|
|
||||||
stats = {'names': collections.defaultdict(set),
|
|
||||||
'allnames' : set(),
|
|
||||||
'file_deletions': {},
|
|
||||||
'tree_deletions': {},
|
|
||||||
'equivalence': {},
|
|
||||||
'rename_history': collections.defaultdict(set),
|
|
||||||
'unpacked_size': unpacked_size,
|
|
||||||
'packed_size': packed_size,
|
|
||||||
'num_commits': 0}
|
|
||||||
|
|
||||||
# Setup the rev-list/diff-tree process
|
|
||||||
commit_parse_progress = ProgressWriter()
|
|
||||||
num_commits = 0
|
|
||||||
cmd = 'git rev-list --topo-order --reverse {} | git diff-tree --stdin --always --root --format="%H%n%P%n%cd" --date=short -M -t -c --raw --combined-all-paths'.format(' '.join(args.refs))
|
|
||||||
dtp = subprocess.Popen(cmd, shell=True, bufsize=-1, stdout=subprocess.PIPE)
|
|
||||||
f = dtp.stdout
|
|
||||||
line = f.next()
|
|
||||||
cont = bool(line)
|
|
||||||
graph = AncestryGraph()
|
|
||||||
while cont:
|
|
||||||
commit = line.rstrip()
|
|
||||||
parents = f.next().split()
|
|
||||||
date = f.next().rstrip()
|
|
||||||
|
|
||||||
# We expect a blank line next; if we get a non-blank line then
|
|
||||||
# this commit modified no files and we need to move on to the next.
|
|
||||||
# If there is no line, we've reached end-of-input.
|
|
||||||
try:
|
|
||||||
line = f.next().rstrip()
|
|
||||||
cont = True
|
|
||||||
except StopIteration:
|
|
||||||
cont = False
|
|
||||||
|
|
||||||
# If we haven't reached end of input, and we got a blank line meaning
|
|
||||||
# a commit that has modified files, then get the file changes associated
|
|
||||||
# with this commit.
|
|
||||||
file_changes = []
|
|
||||||
if cont and not line:
|
|
||||||
cont = False
|
|
||||||
for line in f:
|
|
||||||
if not line.startswith(':'):
|
|
||||||
cont = True
|
|
||||||
break
|
|
||||||
n = 1+max(1, len(parents))
|
|
||||||
assert line.startswith(':'*(n-1))
|
|
||||||
relevant = line[n-1:-1]
|
|
||||||
splits = relevant.split(None, n)
|
|
||||||
modes = splits[0:n]
|
|
||||||
splits = splits[n].split(None, n)
|
|
||||||
shas = splits[0:n]
|
|
||||||
splits = splits[n].split('\t')
|
|
||||||
change_types = splits[0]
|
|
||||||
filenames = [PathQuoting.dequote(x) for x in splits[1:]]
|
|
||||||
file_changes.append([modes, shas, change_types, filenames])
|
|
||||||
|
|
||||||
# Analyze this commit and update progress
|
|
||||||
analyze_commit(stats, graph, commit, parents, date, file_changes)
|
|
||||||
num_commits += 1
|
|
||||||
commit_parse_progress.show("Processed {} commits".format(num_commits))
|
|
||||||
|
|
||||||
# Show the final commits processed message and record the number of commits
|
|
||||||
commit_parse_progress.finish()
|
|
||||||
stats['num_commits'] = num_commits
|
|
||||||
|
|
||||||
# Close the output, ensure rev-list|diff-tree pipeline completed successfully
|
|
||||||
dtp.stdout.close()
|
|
||||||
if dtp.wait():
|
|
||||||
raise SystemExit("Error: rev-list|diff-tree pipeline failed; see above.")
|
|
||||||
|
|
||||||
return stats
|
|
||||||
|
|
||||||
def write_report(reportdir, stats):
|
|
||||||
def datestr(datetimestr):
|
|
||||||
return datetimestr if datetimestr else '<present>'
|
|
||||||
|
|
||||||
def dirnames(path):
|
|
||||||
while True:
|
|
||||||
path = os.path.dirname(path)
|
|
||||||
yield path
|
|
||||||
if path == '':
|
|
||||||
break
|
|
||||||
|
|
||||||
# Compute aggregate size information for paths, extensions, and dirs
|
|
||||||
total_size = {'packed': 0, 'unpacked': 0}
|
|
||||||
path_size = {'packed': collections.defaultdict(int),
|
|
||||||
'unpacked': collections.defaultdict(int)}
|
|
||||||
ext_size = {'packed': collections.defaultdict(int),
|
|
||||||
'unpacked': collections.defaultdict(int)}
|
|
||||||
dir_size = {'packed': collections.defaultdict(int),
|
|
||||||
'unpacked': collections.defaultdict(int)}
|
|
||||||
for sha in stats['names']:
|
|
||||||
size = {'packed': stats['packed_size'][sha],
|
|
||||||
'unpacked': stats['unpacked_size'][sha]}
|
|
||||||
for which in ('packed', 'unpacked'):
|
|
||||||
for name in stats['names'][sha]:
|
|
||||||
total_size[which] += size[which]
|
|
||||||
path_size[which][name] += size[which]
|
|
||||||
basename, ext = os.path.splitext(name)
|
|
||||||
ext_size[which][ext] += size[which]
|
|
||||||
for dirname in dirnames(name):
|
|
||||||
dir_size[which][dirname] += size[which]
|
|
||||||
|
|
||||||
# Determine if and when extensions and directories were deleted
|
|
||||||
ext_deleted_data = {}
|
|
||||||
for name in stats['allnames']:
|
|
||||||
when = stats['file_deletions'].get(name, None)
|
|
||||||
|
|
||||||
# Update the extension
|
|
||||||
basename, ext = os.path.splitext(name)
|
|
||||||
if when is None:
|
|
||||||
ext_deleted_data[ext] = None
|
|
||||||
elif ext in ext_deleted_data:
|
|
||||||
if ext_deleted_data[ext] is not None:
|
|
||||||
ext_deleted_data[ext] = max(ext_deleted_data[ext], when)
|
|
||||||
else:
|
|
||||||
ext_deleted_data[ext] = when
|
|
||||||
|
|
||||||
dir_deleted_data = {}
|
|
||||||
for name in dir_size['packed']:
|
|
||||||
dir_deleted_data[name] = stats['tree_deletions'].get(name, None)
|
|
||||||
|
|
||||||
with open(os.path.join(reportdir, "README"), 'w') as f:
|
|
||||||
# Give a basic overview of this file
|
|
||||||
f.write("== Overal Statistics ==\n")
|
|
||||||
f.write(" Number of commits: {}\n".format(stats['num_commits']))
|
|
||||||
f.write(" Number of filenames: {}\n".format(len(path_size['packed'])))
|
|
||||||
f.write(" Number of directories: {}\n".format(len(dir_size['packed'])))
|
|
||||||
f.write(" Number of file extensions: {}\n".format(len(ext_size['packed'])))
|
|
||||||
f.write("\n")
|
|
||||||
f.write(" Total unpacked size (bytes): {:10d}\n"
|
|
||||||
.format(total_size['unpacked']))
|
|
||||||
f.write(" Total packed size (bytes): {:10d}\n"
|
|
||||||
.format(total_size['packed']))
|
|
||||||
f.write("\n")
|
|
||||||
|
|
||||||
# Mention issues with the report
|
|
||||||
f.write("== Caveats ==\n")
|
|
||||||
f.write("=== Sizes ===\n")
|
|
||||||
f.write(textwrap.dedent("""
|
|
||||||
Packed size represents what size your repository would be if no
|
|
||||||
trees, commits, tags, or other metadata were included (though it may
|
|
||||||
fail to represent de-duplication; see below). It also represents the
|
|
||||||
current packing, which may be suboptimal if you haven't gc'ed for a
|
|
||||||
while.
|
|
||||||
|
|
||||||
Unpacked size represents what size your repository would be if no if
|
|
||||||
no trees, commits, tags, or other metadata were included AND if no
|
|
||||||
files were packed; i.e., without delta-ing or compression.
|
|
||||||
|
|
||||||
Both unpacked and packed sizes can be slightly misleading. Deleting
|
|
||||||
a blob from history not save as much space as the unpacked size,
|
|
||||||
because it is obviously normally stored in packed form. Also,
|
|
||||||
deleting a blob from history may not save as much space as its packed
|
|
||||||
size either, because another blob could be stored as a delta against
|
|
||||||
that blob, so when you remove one blob another blob's packed size may
|
|
||||||
grow.
|
|
||||||
|
|
||||||
Also, the sum of the packed sizes can add up to more than the
|
|
||||||
repository size; if the same contents appeared in the repository in
|
|
||||||
multiple places, git will automatically de-dupe and store only one
|
|
||||||
copy, while the way sizes are added in this analysis adds the size
|
|
||||||
for each file path that has those contents. Further, if a file is
|
|
||||||
ever reverted to a previous version's contents, the previous
|
|
||||||
version's size will be counted multiple times in this analysis, even
|
|
||||||
though git will only store it once.
|
|
||||||
"""[1:]))
|
|
||||||
f.write("\n")
|
|
||||||
f.write("=== Deletions ===\n")
|
|
||||||
f.write(textwrap.dedent("""
|
|
||||||
Whether a file is deleted is not a binary quality, since it can be
|
|
||||||
deleted on some branches but still exist in others. Also, it might
|
|
||||||
exist in an old tag, but have been deleted in versions newer than
|
|
||||||
that. More thorough tracking could be done, including looking at
|
|
||||||
merge commits where one side of history deleted and the other modified,
|
|
||||||
in order to give a more holistic picture of deletions. However, that
|
|
||||||
algorithm would not only be more complex to implement, it'd also be
|
|
||||||
quite difficult to present and interpret by users. Since --analyze
|
|
||||||
is just about getting a high-level rough picture of history, it instead
|
|
||||||
implements the simplistic rule that is good enough for 98% of cases:
|
|
||||||
A file is marked as deleted if the last commit in the fast-export
|
|
||||||
stream that mentions the file lists it as deleted.
|
|
||||||
This makes it dependent on topological ordering, but generally gives
|
|
||||||
the "right" answer.
|
|
||||||
"""[1:]))
|
|
||||||
f.write("\n")
|
|
||||||
f.write("=== Renames ===\n")
|
|
||||||
f.write(textwrap.dedent("""
|
|
||||||
Renames share the same non-binary nature that deletions do, plus
|
|
||||||
additional challenges:
|
|
||||||
* If the renamed file is renamed again, instead of just two names for
|
|
||||||
a path you can have three or more.
|
|
||||||
* Rename pairs of the form (oldname, newname) that we consider to be
|
|
||||||
different names of the "same file" might only be valid over certain
|
|
||||||
commit ranges. For example, if a new commit reintroduces a file
|
|
||||||
named oldname, then new versions of oldname aren't the "same file"
|
|
||||||
anymore. We could try to portray this to the user, but it's easier
|
|
||||||
for the user to just break the pairing and only report unbroken
|
|
||||||
rename pairings to the user.
|
|
||||||
* The ability for users to rename files differently in different
|
|
||||||
branches means that our chains of renames will not necessarily be
|
|
||||||
linear but may branch out.
|
|
||||||
"""[1:]))
|
|
||||||
f.write("\n")
|
|
||||||
|
|
||||||
# Equivalence classes for names, so if folks only want to keep a
|
|
||||||
# certain set of paths, they know the old names they want to include
|
|
||||||
# too.
|
|
||||||
with open(os.path.join(reportdir, "renames.txt"), 'w') as f:
|
|
||||||
seen = set()
|
|
||||||
for pathname,equiv_group in sorted(stats['equivalence'].iteritems(),
|
|
||||||
key=lambda x:x[1]):
|
|
||||||
if equiv_group in seen:
|
|
||||||
continue
|
continue
|
||||||
seen.add(equiv_group)
|
elif modes[-1] == '000000':
|
||||||
f.write("{} ->\n ".format(equiv_group[0]) +
|
# Track when files/directories are deleted
|
||||||
"\n ".join(equiv_group[1:]) +
|
for f in RepoAnalyze.equiv_class(stats, filenames[-1]):
|
||||||
"\n")
|
if any(x == '040000' for x in modes[0:-1]):
|
||||||
|
stats['tree_deletions'][f] = date
|
||||||
|
else:
|
||||||
|
stats['file_deletions'][f] = date
|
||||||
|
elif change_types.strip('AMT') == '':
|
||||||
|
RepoAnalyze.handle_file(stats, graph, commit, modes, shas, filenames)
|
||||||
|
elif modes[-1] == '040000' and change_types.strip('RAM') == '':
|
||||||
|
RepoAnalyze.handle_file(stats, graph, commit, modes, shas, filenames)
|
||||||
|
elif change_types.strip('RAM') == '':
|
||||||
|
RepoAnalyze.handle_file(stats, graph, commit, modes, shas, filenames)
|
||||||
|
RepoAnalyze.handle_renames(stats, commit, change_types, filenames)
|
||||||
|
else:
|
||||||
|
raise SystemExit("Unhandled change type(s): {} (in commit {})"
|
||||||
|
.format(change_types, commit))
|
||||||
|
|
||||||
# List directories in reverse sorted order of unpacked size
|
@staticmethod
|
||||||
with open(os.path.join(reportdir, "directories-deleted-sizes.txt"), 'w') as f:
|
def gather_data(args):
|
||||||
f.write("=== Deleted directories by reverse size ===\n")
|
blob_size_progress = ProgressWriter()
|
||||||
f.write("Format: unpacked size, packed size, date deleted, directory name\n")
|
num_blobs = 0
|
||||||
for dirname, size in sorted(dir_size['packed'].iteritems(),
|
|
||||||
key=lambda x:x[1], reverse=True):
|
# Get sizes of blobs by sha1
|
||||||
if (dir_deleted_data[dirname]):
|
cmd = '--batch-check=%(objectname) %(objecttype) ' + \
|
||||||
|
'%(objectsize) %(objectsize:disk)'
|
||||||
|
cf = subprocess.Popen(['git', 'cat-file', '--batch-all-objects', cmd],
|
||||||
|
bufsize = -1,
|
||||||
|
stdout = subprocess.PIPE)
|
||||||
|
unpacked_size = {}
|
||||||
|
packed_size = {}
|
||||||
|
for line in cf.stdout:
|
||||||
|
sha, objtype, objsize, objdisksize = line.split()
|
||||||
|
objsize, objdisksize = int(objsize), int(objdisksize)
|
||||||
|
if objtype == 'blob':
|
||||||
|
unpacked_size[sha] = objsize
|
||||||
|
packed_size[sha] = objdisksize
|
||||||
|
num_blobs += 1
|
||||||
|
blob_size_progress.show("Processed {} blob sizes".format(num_blobs))
|
||||||
|
cf.wait()
|
||||||
|
blob_size_progress.finish()
|
||||||
|
stats = {'names': collections.defaultdict(set),
|
||||||
|
'allnames' : set(),
|
||||||
|
'file_deletions': {},
|
||||||
|
'tree_deletions': {},
|
||||||
|
'equivalence': {},
|
||||||
|
'rename_history': collections.defaultdict(set),
|
||||||
|
'unpacked_size': unpacked_size,
|
||||||
|
'packed_size': packed_size,
|
||||||
|
'num_commits': 0}
|
||||||
|
|
||||||
|
# Setup the rev-list/diff-tree process
|
||||||
|
commit_parse_progress = ProgressWriter()
|
||||||
|
num_commits = 0
|
||||||
|
cmd = ('git rev-list --topo-order --reverse {}'.format(' '.join(args.refs)) +
|
||||||
|
' | git diff-tree --stdin --always --root --format=%H%n%P%n%cd' +
|
||||||
|
' --date=short -M -t -c --raw --combined-all-paths')
|
||||||
|
dtp = subprocess.Popen(cmd, shell=True, bufsize=-1, stdout=subprocess.PIPE)
|
||||||
|
f = dtp.stdout
|
||||||
|
line = f.next()
|
||||||
|
cont = bool(line)
|
||||||
|
graph = AncestryGraph()
|
||||||
|
while cont:
|
||||||
|
commit = line.rstrip()
|
||||||
|
parents = f.next().split()
|
||||||
|
date = f.next().rstrip()
|
||||||
|
|
||||||
|
# We expect a blank line next; if we get a non-blank line then
|
||||||
|
# this commit modified no files and we need to move on to the next.
|
||||||
|
# If there is no line, we've reached end-of-input.
|
||||||
|
try:
|
||||||
|
line = f.next().rstrip()
|
||||||
|
cont = True
|
||||||
|
except StopIteration:
|
||||||
|
cont = False
|
||||||
|
|
||||||
|
# If we haven't reached end of input, and we got a blank line meaning
|
||||||
|
# a commit that has modified files, then get the file changes associated
|
||||||
|
# with this commit.
|
||||||
|
file_changes = []
|
||||||
|
if cont and not line:
|
||||||
|
cont = False
|
||||||
|
for line in f:
|
||||||
|
if not line.startswith(':'):
|
||||||
|
cont = True
|
||||||
|
break
|
||||||
|
n = 1+max(1, len(parents))
|
||||||
|
assert line.startswith(':'*(n-1))
|
||||||
|
relevant = line[n-1:-1]
|
||||||
|
splits = relevant.split(None, n)
|
||||||
|
modes = splits[0:n]
|
||||||
|
splits = splits[n].split(None, n)
|
||||||
|
shas = splits[0:n]
|
||||||
|
splits = splits[n].split('\t')
|
||||||
|
change_types = splits[0]
|
||||||
|
filenames = [PathQuoting.dequote(x) for x in splits[1:]]
|
||||||
|
file_changes.append([modes, shas, change_types, filenames])
|
||||||
|
|
||||||
|
# Analyze this commit and update progress
|
||||||
|
RepoAnalyze.analyze_commit(stats, graph, commit, parents, date,
|
||||||
|
file_changes)
|
||||||
|
num_commits += 1
|
||||||
|
commit_parse_progress.show("Processed {} commits".format(num_commits))
|
||||||
|
|
||||||
|
# Show the final commits processed message and record the number of commits
|
||||||
|
commit_parse_progress.finish()
|
||||||
|
stats['num_commits'] = num_commits
|
||||||
|
|
||||||
|
# Close the output, ensure rev-list|diff-tree pipeline completed successfully
|
||||||
|
dtp.stdout.close()
|
||||||
|
if dtp.wait():
|
||||||
|
raise SystemExit("Error: rev-list|diff-tree pipeline failed; see above.")
|
||||||
|
|
||||||
|
return stats
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def write_report(reportdir, stats):
|
||||||
|
def datestr(datetimestr):
|
||||||
|
return datetimestr if datetimestr else '<present>'
|
||||||
|
|
||||||
|
def dirnames(path):
|
||||||
|
while True:
|
||||||
|
path = os.path.dirname(path)
|
||||||
|
yield path
|
||||||
|
if path == '':
|
||||||
|
break
|
||||||
|
|
||||||
|
# Compute aggregate size information for paths, extensions, and dirs
|
||||||
|
total_size = {'packed': 0, 'unpacked': 0}
|
||||||
|
path_size = {'packed': collections.defaultdict(int),
|
||||||
|
'unpacked': collections.defaultdict(int)}
|
||||||
|
ext_size = {'packed': collections.defaultdict(int),
|
||||||
|
'unpacked': collections.defaultdict(int)}
|
||||||
|
dir_size = {'packed': collections.defaultdict(int),
|
||||||
|
'unpacked': collections.defaultdict(int)}
|
||||||
|
for sha in stats['names']:
|
||||||
|
size = {'packed': stats['packed_size'][sha],
|
||||||
|
'unpacked': stats['unpacked_size'][sha]}
|
||||||
|
for which in ('packed', 'unpacked'):
|
||||||
|
for name in stats['names'][sha]:
|
||||||
|
total_size[which] += size[which]
|
||||||
|
path_size[which][name] += size[which]
|
||||||
|
basename, ext = os.path.splitext(name)
|
||||||
|
ext_size[which][ext] += size[which]
|
||||||
|
for dirname in dirnames(name):
|
||||||
|
dir_size[which][dirname] += size[which]
|
||||||
|
|
||||||
|
# Determine if and when extensions and directories were deleted
|
||||||
|
ext_deleted_data = {}
|
||||||
|
for name in stats['allnames']:
|
||||||
|
when = stats['file_deletions'].get(name, None)
|
||||||
|
|
||||||
|
# Update the extension
|
||||||
|
basename, ext = os.path.splitext(name)
|
||||||
|
if when is None:
|
||||||
|
ext_deleted_data[ext] = None
|
||||||
|
elif ext in ext_deleted_data:
|
||||||
|
if ext_deleted_data[ext] is not None:
|
||||||
|
ext_deleted_data[ext] = max(ext_deleted_data[ext], when)
|
||||||
|
else:
|
||||||
|
ext_deleted_data[ext] = when
|
||||||
|
|
||||||
|
dir_deleted_data = {}
|
||||||
|
for name in dir_size['packed']:
|
||||||
|
dir_deleted_data[name] = stats['tree_deletions'].get(name, None)
|
||||||
|
|
||||||
|
with open(os.path.join(reportdir, "README"), 'w') as f:
|
||||||
|
# Give a basic overview of this file
|
||||||
|
f.write("== Overal Statistics ==\n")
|
||||||
|
f.write(" Number of commits: {}\n".format(stats['num_commits']))
|
||||||
|
f.write(" Number of filenames: {}\n".format(len(path_size['packed'])))
|
||||||
|
f.write(" Number of directories: {}\n".format(len(dir_size['packed'])))
|
||||||
|
f.write(" Number of file extensions: {}\n".format(len(ext_size['packed'])))
|
||||||
|
f.write("\n")
|
||||||
|
f.write(" Total unpacked size (bytes): {:10d}\n"
|
||||||
|
.format(total_size['unpacked']))
|
||||||
|
f.write(" Total packed size (bytes): {:10d}\n"
|
||||||
|
.format(total_size['packed']))
|
||||||
|
f.write("\n")
|
||||||
|
|
||||||
|
# Mention issues with the report
|
||||||
|
f.write("== Caveats ==\n")
|
||||||
|
f.write("=== Sizes ===\n")
|
||||||
|
f.write(textwrap.dedent("""
|
||||||
|
Packed size represents what size your repository would be if no
|
||||||
|
trees, commits, tags, or other metadata were included (though it may
|
||||||
|
fail to represent de-duplication; see below). It also represents the
|
||||||
|
current packing, which may be suboptimal if you haven't gc'ed for a
|
||||||
|
while.
|
||||||
|
|
||||||
|
Unpacked size represents what size your repository would be if no if
|
||||||
|
no trees, commits, tags, or other metadata were included AND if no
|
||||||
|
files were packed; i.e., without delta-ing or compression.
|
||||||
|
|
||||||
|
Both unpacked and packed sizes can be slightly misleading. Deleting
|
||||||
|
a blob from history not save as much space as the unpacked size,
|
||||||
|
because it is obviously normally stored in packed form. Also,
|
||||||
|
deleting a blob from history may not save as much space as its packed
|
||||||
|
size either, because another blob could be stored as a delta against
|
||||||
|
that blob, so when you remove one blob another blob's packed size may
|
||||||
|
grow.
|
||||||
|
|
||||||
|
Also, the sum of the packed sizes can add up to more than the
|
||||||
|
repository size; if the same contents appeared in the repository in
|
||||||
|
multiple places, git will automatically de-dupe and store only one
|
||||||
|
copy, while the way sizes are added in this analysis adds the size
|
||||||
|
for each file path that has those contents. Further, if a file is
|
||||||
|
ever reverted to a previous version's contents, the previous
|
||||||
|
version's size will be counted multiple times in this analysis, even
|
||||||
|
though git will only store it once.
|
||||||
|
"""[1:]))
|
||||||
|
f.write("\n")
|
||||||
|
f.write("=== Deletions ===\n")
|
||||||
|
f.write(textwrap.dedent("""
|
||||||
|
Whether a file is deleted is not a binary quality, since it can be
|
||||||
|
deleted on some branches but still exist in others. Also, it might
|
||||||
|
exist in an old tag, but have been deleted in versions newer than
|
||||||
|
that. More thorough tracking could be done, including looking at
|
||||||
|
merge commits where one side of history deleted and the other modified,
|
||||||
|
in order to give a more holistic picture of deletions. However, that
|
||||||
|
algorithm would not only be more complex to implement, it'd also be
|
||||||
|
quite difficult to present and interpret by users. Since --analyze
|
||||||
|
is just about getting a high-level rough picture of history, it instead
|
||||||
|
implements the simplistic rule that is good enough for 98% of cases:
|
||||||
|
A file is marked as deleted if the last commit in the fast-export
|
||||||
|
stream that mentions the file lists it as deleted.
|
||||||
|
This makes it dependent on topological ordering, but generally gives
|
||||||
|
the "right" answer.
|
||||||
|
"""[1:]))
|
||||||
|
f.write("\n")
|
||||||
|
f.write("=== Renames ===\n")
|
||||||
|
f.write(textwrap.dedent("""
|
||||||
|
Renames share the same non-binary nature that deletions do, plus
|
||||||
|
additional challenges:
|
||||||
|
* If the renamed file is renamed again, instead of just two names for
|
||||||
|
a path you can have three or more.
|
||||||
|
* Rename pairs of the form (oldname, newname) that we consider to be
|
||||||
|
different names of the "same file" might only be valid over certain
|
||||||
|
commit ranges. For example, if a new commit reintroduces a file
|
||||||
|
named oldname, then new versions of oldname aren't the "same file"
|
||||||
|
anymore. We could try to portray this to the user, but it's easier
|
||||||
|
for the user to just break the pairing and only report unbroken
|
||||||
|
rename pairings to the user.
|
||||||
|
* The ability for users to rename files differently in different
|
||||||
|
branches means that our chains of renames will not necessarily be
|
||||||
|
linear but may branch out.
|
||||||
|
"""[1:]))
|
||||||
|
f.write("\n")
|
||||||
|
|
||||||
|
# Equivalence classes for names, so if folks only want to keep a
|
||||||
|
# certain set of paths, they know the old names they want to include
|
||||||
|
# too.
|
||||||
|
with open(os.path.join(reportdir, "renames.txt"), 'w') as f:
|
||||||
|
seen = set()
|
||||||
|
for pathname,equiv_group in sorted(stats['equivalence'].iteritems(),
|
||||||
|
key=lambda x:x[1]):
|
||||||
|
if equiv_group in seen:
|
||||||
|
continue
|
||||||
|
seen.add(equiv_group)
|
||||||
|
f.write("{} ->\n ".format(equiv_group[0]) +
|
||||||
|
"\n ".join(equiv_group[1:]) +
|
||||||
|
"\n")
|
||||||
|
|
||||||
|
# List directories in reverse sorted order of unpacked size
|
||||||
|
with open(os.path.join(reportdir, "directories-deleted-sizes.txt"), 'w') as f:
|
||||||
|
f.write("=== Deleted directories by reverse size ===\n")
|
||||||
|
f.write("Format: unpacked size, packed size, date deleted, directory name\n")
|
||||||
|
for dirname, size in sorted(dir_size['packed'].iteritems(),
|
||||||
|
key=lambda x:x[1], reverse=True):
|
||||||
|
if (dir_deleted_data[dirname]):
|
||||||
|
f.write(" {:10d} {:10d} {:10s} {}\n"
|
||||||
|
.format(dir_size['unpacked'][dirname],
|
||||||
|
size,
|
||||||
|
datestr(dir_deleted_data[dirname]),
|
||||||
|
dirname or '<toplevel>'))
|
||||||
|
|
||||||
|
with open(os.path.join(reportdir, "directories-all-sizes.txt"), 'w') as f:
|
||||||
|
f.write("=== All directories by reverse size ===\n")
|
||||||
|
f.write("Format: unpacked size, packed size, date deleted, directory name\n")
|
||||||
|
for dirname, size in sorted(dir_size['packed'].iteritems(),
|
||||||
|
key=lambda x:x[1], reverse=True):
|
||||||
f.write(" {:10d} {:10d} {:10s} {}\n"
|
f.write(" {:10d} {:10d} {:10s} {}\n"
|
||||||
.format(dir_size['unpacked'][dirname],
|
.format(dir_size['unpacked'][dirname],
|
||||||
size,
|
size,
|
||||||
datestr(dir_deleted_data[dirname]),
|
datestr(dir_deleted_data[dirname]),
|
||||||
dirname or '<toplevel>'))
|
dirname or '<toplevel>'))
|
||||||
|
|
||||||
with open(os.path.join(reportdir, "directories-all-sizes.txt"), 'w') as f:
|
# List extensions in reverse sorted order of unpacked size
|
||||||
f.write("=== All directories by reverse size ===\n")
|
with open(os.path.join(reportdir, "extensions-deleted-sizes.txt"), 'w') as f:
|
||||||
f.write("Format: unpacked size, packed size, date deleted, directory name\n")
|
f.write("=== Deleted extensions by reverse size ===\n")
|
||||||
for dirname, size in sorted(dir_size['packed'].iteritems(),
|
f.write("Format: unpacked size, packed size, date deleted, extension name\n")
|
||||||
key=lambda x:x[1], reverse=True):
|
for extname, size in sorted(ext_size['packed'].iteritems(),
|
||||||
f.write(" {:10d} {:10d} {:10s} {}\n"
|
key=lambda x:x[1], reverse=True):
|
||||||
.format(dir_size['unpacked'][dirname],
|
if (ext_deleted_data[extname]):
|
||||||
size,
|
f.write(" {:10d} {:10d} {:10s} {}\n"
|
||||||
datestr(dir_deleted_data[dirname]),
|
.format(ext_size['unpacked'][extname],
|
||||||
dirname or '<toplevel>'))
|
size,
|
||||||
|
datestr(ext_deleted_data[extname]),
|
||||||
|
extname or '<no extension>'))
|
||||||
|
|
||||||
# List extensions in reverse sorted order of unpacked size
|
with open(os.path.join(reportdir, "extensions-all-sizes.txt"), 'w') as f:
|
||||||
with open(os.path.join(reportdir, "extensions-deleted-sizes.txt"), 'w') as f:
|
f.write("=== All extensions by reverse size ===\n")
|
||||||
f.write("=== Deleted extensions by reverse size ===\n")
|
f.write("Format: unpacked size, packed size, date deleted, extension name\n")
|
||||||
f.write("Format: unpacked size, packed size, date deleted, extension name\n")
|
for extname, size in sorted(ext_size['packed'].iteritems(),
|
||||||
for extname, size in sorted(ext_size['packed'].iteritems(),
|
key=lambda x:x[1], reverse=True):
|
||||||
key=lambda x:x[1], reverse=True):
|
|
||||||
if (ext_deleted_data[extname]):
|
|
||||||
f.write(" {:10d} {:10d} {:10s} {}\n"
|
f.write(" {:10d} {:10d} {:10s} {}\n"
|
||||||
.format(ext_size['unpacked'][extname],
|
.format(ext_size['unpacked'][extname],
|
||||||
size,
|
size,
|
||||||
datestr(ext_deleted_data[extname]),
|
datestr(ext_deleted_data[extname]),
|
||||||
extname or '<no extension>'))
|
extname or '<no extension>'))
|
||||||
|
|
||||||
with open(os.path.join(reportdir, "extensions-all-sizes.txt"), 'w') as f:
|
# List files in reverse sorted order of unpacked size
|
||||||
f.write("=== All extensions by reverse size ===\n")
|
with open(os.path.join(reportdir, "path-deleted-sizes.txt"), 'w') as f:
|
||||||
f.write("Format: unpacked size, packed size, date deleted, extension name\n")
|
f.write("=== Deleted paths by reverse accumulated size ===\n")
|
||||||
for extname, size in sorted(ext_size['packed'].iteritems(),
|
f.write("Format: unpacked size, packed size, date deleted, path name(s)\n")
|
||||||
key=lambda x:x[1], reverse=True):
|
for pathname, size in sorted(path_size['packed'].iteritems(),
|
||||||
f.write(" {:10d} {:10d} {:10s} {}\n"
|
key=lambda x:x[1], reverse=True):
|
||||||
.format(ext_size['unpacked'][extname],
|
when = stats['file_deletions'].get(pathname, None)
|
||||||
size,
|
if when:
|
||||||
datestr(ext_deleted_data[extname]),
|
f.write(" {:10d} {:10d} {:10s} {}\n"
|
||||||
extname or '<no extension>'))
|
.format(path_size['unpacked'][pathname],
|
||||||
|
size,
|
||||||
|
datestr(when),
|
||||||
|
pathname))
|
||||||
|
|
||||||
# List files in reverse sorted order of unpacked size
|
with open(os.path.join(reportdir, "path-all-sizes.txt"), 'w') as f:
|
||||||
with open(os.path.join(reportdir, "path-deleted-sizes.txt"), 'w') as f:
|
f.write("=== All paths by reverse accumulated size ===\n")
|
||||||
f.write("=== Deleted paths by reverse accumulated size ===\n")
|
f.write("Format: unpacked size, packed size, date deleted, pathectory name\n")
|
||||||
f.write("Format: unpacked size, packed size, date deleted, path name(s)\n")
|
for pathname, size in sorted(path_size['packed'].iteritems(),
|
||||||
for pathname, size in sorted(path_size['packed'].iteritems(),
|
key=lambda x:x[1], reverse=True):
|
||||||
key=lambda x:x[1], reverse=True):
|
when = stats['file_deletions'].get(pathname, None)
|
||||||
when = stats['file_deletions'].get(pathname, None)
|
|
||||||
if when:
|
|
||||||
f.write(" {:10d} {:10d} {:10s} {}\n"
|
f.write(" {:10d} {:10d} {:10s} {}\n"
|
||||||
.format(path_size['unpacked'][pathname],
|
.format(path_size['unpacked'][pathname],
|
||||||
size,
|
size,
|
||||||
datestr(when),
|
datestr(when),
|
||||||
pathname))
|
pathname))
|
||||||
|
|
||||||
with open(os.path.join(reportdir, "path-all-sizes.txt"), 'w') as f:
|
# List of filenames and sizes in descending order
|
||||||
f.write("=== All paths by reverse accumulated size ===\n")
|
with open(os.path.join(reportdir, "blob-shas-and-paths.txt"), 'w') as f:
|
||||||
f.write("Format: unpacked size, packed size, date deleted, pathectory name\n")
|
f.write("== Files by sha and associated pathnames in reverse size ==\n")
|
||||||
for pathname, size in sorted(path_size['packed'].iteritems(),
|
f.write("Format: sha, unpacked size, packed size, filename(s) object stored as\n")
|
||||||
key=lambda x:x[1], reverse=True):
|
for sha, size in sorted(stats['packed_size'].iteritems(),
|
||||||
when = stats['file_deletions'].get(pathname, None)
|
key=lambda x:x[1], reverse=True):
|
||||||
f.write(" {:10d} {:10d} {:10s} {}\n"
|
if sha not in stats['names']:
|
||||||
.format(path_size['unpacked'][pathname],
|
# Some objects in the repository might not be referenced, or not
|
||||||
size,
|
# referenced by the branches/tags the user cares about; skip them.
|
||||||
datestr(when),
|
continue
|
||||||
pathname))
|
names_with_sha = stats['names'][sha]
|
||||||
|
if len(names_with_sha) == 1:
|
||||||
|
names_with_sha = names_with_sha.pop()
|
||||||
|
else:
|
||||||
|
names_with_sha = sorted(list(names_with_sha))
|
||||||
|
f.write(" {} {:10d} {:10d} {}\n".format(sha,
|
||||||
|
stats['unpacked_size'][sha],
|
||||||
|
size,
|
||||||
|
names_with_sha))
|
||||||
|
|
||||||
# List of filenames and sizes in descending order
|
@staticmethod
|
||||||
with open(os.path.join(reportdir, "blob-shas-and-paths.txt"), 'w') as f:
|
def run(args, git_dir):
|
||||||
f.write("== Files by sha and associated pathnames in reverse size ==\n")
|
# Create the report directory as necessary
|
||||||
f.write("Format: sha, unpacked size, packed size, filename(s) object stored as\n")
|
results_tmp_dir = os.path.join(git_dir, 'filter-repo')
|
||||||
for sha, size in sorted(stats['packed_size'].iteritems(),
|
if not os.path.isdir(results_tmp_dir):
|
||||||
key=lambda x:x[1], reverse=True):
|
os.mkdir(results_tmp_dir)
|
||||||
if sha not in stats['names']:
|
reportdir = os.path.join(results_tmp_dir, "analysis")
|
||||||
# Some objects in the repository might not be referenced, or not
|
if not args.force and os.path.isdir(reportdir):
|
||||||
# referenced by the branches/tags the user cares about; skip them.
|
raise SystemExit("Error: {} already exists; refusing to overwrite!".
|
||||||
continue
|
format(reportdir))
|
||||||
names_with_sha = stats['names'][sha]
|
os.mkdir(reportdir)
|
||||||
if len(names_with_sha) == 1:
|
|
||||||
names_with_sha = names_with_sha.pop()
|
|
||||||
else:
|
|
||||||
names_with_sha = sorted(list(names_with_sha))
|
|
||||||
f.write(" {} {:10d} {:10d} {}\n".format(sha,
|
|
||||||
stats['unpacked_size'][sha],
|
|
||||||
size,
|
|
||||||
names_with_sha))
|
|
||||||
|
|
||||||
def do_analysis(args, git_dir):
|
# Gather the data we need
|
||||||
# Create the report directory as necessary
|
stats = RepoAnalyze.gather_data(args)
|
||||||
results_tmp_dir = os.path.join(git_dir, 'filter-repo')
|
|
||||||
if not os.path.isdir(results_tmp_dir):
|
|
||||||
os.mkdir(results_tmp_dir)
|
|
||||||
reportdir = os.path.join(results_tmp_dir, "analysis")
|
|
||||||
if not args.force and os.path.isdir(reportdir):
|
|
||||||
raise SystemExit("Error: {} already exists; refusing to overwrite!".
|
|
||||||
format(reportdir))
|
|
||||||
os.mkdir(reportdir)
|
|
||||||
|
|
||||||
# Gather the data we need
|
# Write the reports
|
||||||
stats = gather_data(args)
|
sys.stdout.write("Writing reports to {}...".format(reportdir))
|
||||||
|
sys.stdout.flush()
|
||||||
# Write the reports
|
RepoAnalyze.write_report(reportdir, stats)
|
||||||
sys.stdout.write("Writing reports to {}...".format(reportdir))
|
sys.stdout.write("done.\n")
|
||||||
sys.stdout.flush()
|
|
||||||
write_report(reportdir, stats)
|
|
||||||
sys.stdout.write("done.\n")
|
|
||||||
|
|
||||||
def sanity_check(refs, is_bare):
|
def sanity_check(refs, is_bare):
|
||||||
def abort(reason):
|
def abort(reason):
|
||||||
@ -2506,7 +2524,7 @@ def run_fast_filter():
|
|||||||
|
|
||||||
# Do analysis, if requested
|
# Do analysis, if requested
|
||||||
if args.analyze:
|
if args.analyze:
|
||||||
do_analysis(args, git_dir)
|
RepoAnalyze.run(args, git_dir)
|
||||||
return
|
return
|
||||||
|
|
||||||
# Do sanity checks
|
# Do sanity checks
|
||||||
|
Loading…
Reference in New Issue
Block a user