filter-repo: group repo analysis functions into a class

Signed-off-by: Elijah Newren <>
This commit is contained in:
Elijah Newren 2018-12-25 21:54:16 -08:00
parent 9887dd5cbe
commit 4e2110136e

View File

@ -1891,10 +1891,15 @@ class FilteringOptions(object):
return args
def analyze_commit(stats, graph, commit, parents, date, file_changes):
def equiv_class(filename):
class RepoAnalyze(object):
# First, several helper functions for analyze_commit()
def equiv_class(stats, filename):
return stats['equivalence'].get(filename, (filename,))
def setup_equivalence_for_rename(stats, oldname, newname):
# if A is renamed to B and B is renamed to C, then the user thinks of
# A, B, and C as all being different names for the same 'file'. We record
@ -1911,18 +1916,22 @@ def analyze_commit(stats, graph, commit, parents, date, file_changes):
for f in new_tuple:
stats['equivalence'][f] = new_tuple
def setup_or_update_rename_history(stats, commit, oldname, newname):
rename_commits = stats['rename_history'].get(oldname, set())
stats['rename_history'][oldname] = rename_commits
def handle_renames(stats, commit, change_types, filenames):
for index, change_type in enumerate(change_types):
if change_type == 'R':
oldname, newname = filenames[index], filenames[-1]
setup_equivalence_for_rename(stats, oldname, newname)
setup_or_update_rename_history(stats, commit, oldname, newname)
RepoAnalyze.setup_equivalence_for_rename(stats, oldname, newname)
RepoAnalyze.setup_or_update_rename_history(stats, commit,
oldname, newname)
def handle_file(stats, graph, commit, modes, shas, filenames):
mode, sha, filename = modes[-1], shas[-1], filenames[-1]
@ -1936,7 +1945,7 @@ def analyze_commit(stats, graph, commit, parents, date, file_changes):
# If the file (or equivalence class of files) was recorded as deleted,
# clearly it isn't anymore
equiv = equiv_class(filename)
equiv = RepoAnalyze.equiv_class(stats, filename)
for f in equiv:
stats[delmode].pop(f, None)
@ -1954,383 +1963,392 @@ def analyze_commit(stats, graph, commit, parents, date, file_changes):
if f in stats['equivalence']:
del stats['equivalence'][f]
graph.add_commit_and_parents(commit, parents)
for change in file_changes:
modes, shas, change_types, filenames = change
if len(parents) == 1 and change_types.startswith('R'):
change_types = 'R' # remove the rename score; we don't care
if modes[-1] == '160000':
elif modes[-1] == '000000':
# Track when files/directories are deleted; see 'R' below about equiv_class
for f in equiv_class(filenames[-1]):
if any(x == '040000' for x in modes[0:-1]):
stats['tree_deletions'][f] = date
stats['file_deletions'][f] = date
elif change_types.strip('AMT') == '':
handle_file(stats, graph, commit, modes, shas, filenames)
elif modes[-1] == '040000' and change_types.strip('RAM') == '':
handle_file(stats, graph, commit, modes, shas, filenames)
elif change_types.strip('RAM') == '':
handle_file(stats, graph, commit, modes, shas, filenames)
handle_renames(stats, commit, change_types, filenames)
raise SystemExit("Unhandled change type(s): {} (in commit {})"
.format(change_types, commit))
def gather_data(args):
blob_size_progress = ProgressWriter()
num_blobs = 0
# Get sizes of blobs by sha1
a='--batch-check=%(objectname) %(objecttype) %(objectsize) %(objectsize:disk)'
cf = subprocess.Popen(['git', 'cat-file', '--batch-all-objects', a],
bufsize = -1,
stdout = subprocess.PIPE)
unpacked_size = {}
packed_size = {}
for line in cf.stdout:
sha, objtype, objsize, objdisksize = line.split()
objsize, objdisksize = int(objsize), int(objdisksize)
if objtype == 'blob':
unpacked_size[sha] = objsize
packed_size[sha] = objdisksize
num_blobs += 1"Processed {} blob sizes".format(num_blobs))
stats = {'names': collections.defaultdict(set),
'allnames' : set(),
'file_deletions': {},
'tree_deletions': {},
'equivalence': {},
'rename_history': collections.defaultdict(set),
'unpacked_size': unpacked_size,
'packed_size': packed_size,
'num_commits': 0}
# Setup the rev-list/diff-tree process
commit_parse_progress = ProgressWriter()
num_commits = 0
cmd = 'git rev-list --topo-order --reverse {} | git diff-tree --stdin --always --root --format="%H%n%P%n%cd" --date=short -M -t -c --raw --combined-all-paths'.format(' '.join(args.refs))
dtp = subprocess.Popen(cmd, shell=True, bufsize=-1, stdout=subprocess.PIPE)
f = dtp.stdout
line =
cont = bool(line)
graph = AncestryGraph()
while cont:
commit = line.rstrip()
parents =
date =
# We expect a blank line next; if we get a non-blank line then
# this commit modified no files and we need to move on to the next.
# If there is no line, we've reached end-of-input.
line =
cont = True
except StopIteration:
cont = False
# If we haven't reached end of input, and we got a blank line meaning
# a commit that has modified files, then get the file changes associated
# with this commit.
file_changes = []
if cont and not line:
cont = False
for line in f:
if not line.startswith(':'):
cont = True
n = 1+max(1, len(parents))
assert line.startswith(':'*(n-1))
relevant = line[n-1:-1]
splits = relevant.split(None, n)
modes = splits[0:n]
splits = splits[n].split(None, n)
shas = splits[0:n]
splits = splits[n].split('\t')
change_types = splits[0]
filenames = [PathQuoting.dequote(x) for x in splits[1:]]
file_changes.append([modes, shas, change_types, filenames])
# Analyze this commit and update progress
analyze_commit(stats, graph, commit, parents, date, file_changes)
num_commits += 1"Processed {} commits".format(num_commits))
# Show the final commits processed message and record the number of commits
stats['num_commits'] = num_commits
# Close the output, ensure rev-list|diff-tree pipeline completed successfully
if dtp.wait():
raise SystemExit("Error: rev-list|diff-tree pipeline failed; see above.")
return stats
def write_report(reportdir, stats):
def datestr(datetimestr):
return datetimestr if datetimestr else '<present>'
def dirnames(path):
while True:
path = os.path.dirname(path)
yield path
if path == '':
# Compute aggregate size information for paths, extensions, and dirs
total_size = {'packed': 0, 'unpacked': 0}
path_size = {'packed': collections.defaultdict(int),
'unpacked': collections.defaultdict(int)}
ext_size = {'packed': collections.defaultdict(int),
'unpacked': collections.defaultdict(int)}
dir_size = {'packed': collections.defaultdict(int),
'unpacked': collections.defaultdict(int)}
for sha in stats['names']:
size = {'packed': stats['packed_size'][sha],
'unpacked': stats['unpacked_size'][sha]}
for which in ('packed', 'unpacked'):
for name in stats['names'][sha]:
total_size[which] += size[which]
path_size[which][name] += size[which]
basename, ext = os.path.splitext(name)
ext_size[which][ext] += size[which]
for dirname in dirnames(name):
dir_size[which][dirname] += size[which]
# Determine if and when extensions and directories were deleted
ext_deleted_data = {}
for name in stats['allnames']:
when = stats['file_deletions'].get(name, None)
# Update the extension
basename, ext = os.path.splitext(name)
if when is None:
ext_deleted_data[ext] = None
elif ext in ext_deleted_data:
if ext_deleted_data[ext] is not None:
ext_deleted_data[ext] = max(ext_deleted_data[ext], when)
ext_deleted_data[ext] = when
dir_deleted_data = {}
for name in dir_size['packed']:
dir_deleted_data[name] = stats['tree_deletions'].get(name, None)
with open(os.path.join(reportdir, "README"), 'w') as f:
# Give a basic overview of this file
f.write("== Overal Statistics ==\n")
f.write(" Number of commits: {}\n".format(stats['num_commits']))
f.write(" Number of filenames: {}\n".format(len(path_size['packed'])))
f.write(" Number of directories: {}\n".format(len(dir_size['packed'])))
f.write(" Number of file extensions: {}\n".format(len(ext_size['packed'])))
f.write(" Total unpacked size (bytes): {:10d}\n"
f.write(" Total packed size (bytes): {:10d}\n"
# Mention issues with the report
f.write("== Caveats ==\n")
f.write("=== Sizes ===\n")
Packed size represents what size your repository would be if no
trees, commits, tags, or other metadata were included (though it may
fail to represent de-duplication; see below). It also represents the
current packing, which may be suboptimal if you haven't gc'ed for a while.
Unpacked size represents what size your repository would be if
no trees, commits, tags, or other metadata were included AND if no
files were packed; i.e., without delta-ing or compression.
Both unpacked and packed sizes can be slightly misleading. Deleting
a blob from history will not save as much space as the unpacked size,
because it is obviously normally stored in packed form. Also,
deleting a blob from history may not save as much space as its packed
size either, because another blob could be stored as a delta against
that blob, so when you remove one blob another blob's packed size may grow.
Also, the sum of the packed sizes can add up to more than the
repository size; if the same contents appeared in the repository in
multiple places, git will automatically de-dupe and store only one
copy, while the way sizes are added in this analysis adds the size
for each file path that has those contents. Further, if a file is
ever reverted to a previous version's contents, the previous
version's size will be counted multiple times in this analysis, even
though git will only store it once.
f.write("=== Deletions ===\n")
Whether a file is deleted is not a binary quality, since it can be
deleted on some branches but still exist in others. Also, it might
exist in an old tag, but have been deleted in versions newer than
that. More thorough tracking could be done, including looking at
merge commits where one side of history deleted and the other modified,
in order to give a more holistic picture of deletions. However, that
algorithm would not only be more complex to implement, it'd also be
quite difficult to present and interpret by users. Since --analyze
is just about getting a high-level rough picture of history, it instead
implements the simplistic rule that is good enough for 98% of cases:
A file is marked as deleted if the last commit in the fast-export
stream that mentions the file lists it as deleted.
This makes it dependent on topological ordering, but generally gives
the "right" answer.
f.write("=== Renames ===\n")
Renames share the same non-binary nature that deletions do, plus
additional challenges:
* If the renamed file is renamed again, instead of just two names for
a path you can have three or more.
* Rename pairs of the form (oldname, newname) that we consider to be
different names of the "same file" might only be valid over certain
commit ranges. For example, if a new commit reintroduces a file
named oldname, then new versions of oldname aren't the "same file"
anymore. We could try to portray this to the user, but it's easier
for the user to just break the pairing and only report unbroken
rename pairings to the user.
* The ability for users to rename files differently in different
branches means that our chains of renames will not necessarily be
linear but may branch out.
# Equivalence classes for names, so if folks only want to keep a
# certain set of paths, they know the old names they want to include
# too.
with open(os.path.join(reportdir, "renames.txt"), 'w') as f:
seen = set()
for pathname,equiv_group in sorted(stats['equivalence'].iteritems(),
key=lambda x:x[1]):
if equiv_group in seen:
def analyze_commit(stats, graph, commit, parents, date, file_changes):
graph.add_commit_and_parents(commit, parents)
for change in file_changes:
modes, shas, change_types, filenames = change
if len(parents) == 1 and change_types.startswith('R'):
change_types = 'R' # remove the rename score; we don't care
if modes[-1] == '160000':
f.write("{} ->\n ".format(equiv_group[0]) +
"\n ".join(equiv_group[1:]) +
elif modes[-1] == '000000':
# Track when files/directories are deleted
for f in RepoAnalyze.equiv_class(stats, filenames[-1]):
if any(x == '040000' for x in modes[0:-1]):
stats['tree_deletions'][f] = date
stats['file_deletions'][f] = date
elif change_types.strip('AMT') == '':
RepoAnalyze.handle_file(stats, graph, commit, modes, shas, filenames)
elif modes[-1] == '040000' and change_types.strip('RAM') == '':
RepoAnalyze.handle_file(stats, graph, commit, modes, shas, filenames)
elif change_types.strip('RAM') == '':
RepoAnalyze.handle_file(stats, graph, commit, modes, shas, filenames)
RepoAnalyze.handle_renames(stats, commit, change_types, filenames)
raise SystemExit("Unhandled change type(s): {} (in commit {})"
.format(change_types, commit))
# List directories in reverse sorted order of packed size
with open(os.path.join(reportdir, "directories-deleted-sizes.txt"), 'w') as f:
f.write("=== Deleted directories by reverse size ===\n")
f.write("Format: unpacked size, packed size, date deleted, directory name\n")
for dirname, size in sorted(dir_size['packed'].iteritems(),
key=lambda x:x[1], reverse=True):
if (dir_deleted_data[dirname]):
def gather_data(args):
blob_size_progress = ProgressWriter()
num_blobs = 0
# Get sizes of blobs by sha1
cmd = '--batch-check=%(objectname) %(objecttype) ' + \
'%(objectsize) %(objectsize:disk)'
cf = subprocess.Popen(['git', 'cat-file', '--batch-all-objects', cmd],
bufsize = -1,
stdout = subprocess.PIPE)
unpacked_size = {}
packed_size = {}
for line in cf.stdout:
sha, objtype, objsize, objdisksize = line.split()
objsize, objdisksize = int(objsize), int(objdisksize)
if objtype == 'blob':
unpacked_size[sha] = objsize
packed_size[sha] = objdisksize
num_blobs += 1"Processed {} blob sizes".format(num_blobs))
stats = {'names': collections.defaultdict(set),
'allnames' : set(),
'file_deletions': {},
'tree_deletions': {},
'equivalence': {},
'rename_history': collections.defaultdict(set),
'unpacked_size': unpacked_size,
'packed_size': packed_size,
'num_commits': 0}
# Setup the rev-list/diff-tree process
commit_parse_progress = ProgressWriter()
num_commits = 0
cmd = ('git rev-list --topo-order --reverse {}'.format(' '.join(args.refs)) +
' | git diff-tree --stdin --always --root --format=%H%n%P%n%cd' +
' --date=short -M -t -c --raw --combined-all-paths')
dtp = subprocess.Popen(cmd, shell=True, bufsize=-1, stdout=subprocess.PIPE)
f = dtp.stdout
line =
cont = bool(line)
graph = AncestryGraph()
while cont:
commit = line.rstrip()
parents =
date =
# We expect a blank line next; if we get a non-blank line then
# this commit modified no files and we need to move on to the next.
# If there is no line, we've reached end-of-input.
line =
cont = True
except StopIteration:
cont = False
# If we haven't reached end of input, and we got a blank line meaning
# a commit that has modified files, then get the file changes associated
# with this commit.
file_changes = []
if cont and not line:
cont = False
for line in f:
if not line.startswith(':'):
cont = True
n = 1+max(1, len(parents))
assert line.startswith(':'*(n-1))
relevant = line[n-1:-1]
splits = relevant.split(None, n)
modes = splits[0:n]
splits = splits[n].split(None, n)
shas = splits[0:n]
splits = splits[n].split('\t')
change_types = splits[0]
filenames = [PathQuoting.dequote(x) for x in splits[1:]]
file_changes.append([modes, shas, change_types, filenames])
# Analyze this commit and update progress
RepoAnalyze.analyze_commit(stats, graph, commit, parents, date,
num_commits += 1"Processed {} commits".format(num_commits))
# Show the final commits processed message and record the number of commits
stats['num_commits'] = num_commits
# Close the output, ensure rev-list|diff-tree pipeline completed successfully
if dtp.wait():
raise SystemExit("Error: rev-list|diff-tree pipeline failed; see above.")
return stats
def write_report(reportdir, stats):
def datestr(datetimestr):
return datetimestr if datetimestr else '<present>'
def dirnames(path):
while True:
path = os.path.dirname(path)
yield path
if path == '':
# Compute aggregate size information for paths, extensions, and dirs
total_size = {'packed': 0, 'unpacked': 0}
path_size = {'packed': collections.defaultdict(int),
'unpacked': collections.defaultdict(int)}
ext_size = {'packed': collections.defaultdict(int),
'unpacked': collections.defaultdict(int)}
dir_size = {'packed': collections.defaultdict(int),
'unpacked': collections.defaultdict(int)}
for sha in stats['names']:
size = {'packed': stats['packed_size'][sha],
'unpacked': stats['unpacked_size'][sha]}
for which in ('packed', 'unpacked'):
for name in stats['names'][sha]:
total_size[which] += size[which]
path_size[which][name] += size[which]
basename, ext = os.path.splitext(name)
ext_size[which][ext] += size[which]
for dirname in dirnames(name):
dir_size[which][dirname] += size[which]
# Determine if and when extensions and directories were deleted
ext_deleted_data = {}
for name in stats['allnames']:
when = stats['file_deletions'].get(name, None)
# Update the extension
basename, ext = os.path.splitext(name)
if when is None:
ext_deleted_data[ext] = None
elif ext in ext_deleted_data:
if ext_deleted_data[ext] is not None:
ext_deleted_data[ext] = max(ext_deleted_data[ext], when)
ext_deleted_data[ext] = when
dir_deleted_data = {}
for name in dir_size['packed']:
dir_deleted_data[name] = stats['tree_deletions'].get(name, None)
with open(os.path.join(reportdir, "README"), 'w') as f:
# Give a basic overview of this file
f.write("== Overal Statistics ==\n")
f.write(" Number of commits: {}\n".format(stats['num_commits']))
f.write(" Number of filenames: {}\n".format(len(path_size['packed'])))
f.write(" Number of directories: {}\n".format(len(dir_size['packed'])))
f.write(" Number of file extensions: {}\n".format(len(ext_size['packed'])))
f.write(" Total unpacked size (bytes): {:10d}\n"
f.write(" Total packed size (bytes): {:10d}\n"
# Mention issues with the report
f.write("== Caveats ==\n")
f.write("=== Sizes ===\n")
Packed size represents what size your repository would be if no
trees, commits, tags, or other metadata were included (though it may
fail to represent de-duplication; see below). It also represents the
current packing, which may be suboptimal if you haven't gc'ed for a while.
Unpacked size represents what size your repository would be if
no trees, commits, tags, or other metadata were included AND if no
files were packed; i.e., without delta-ing or compression.
Both unpacked and packed sizes can be slightly misleading. Deleting
a blob from history will not save as much space as the unpacked size,
because it is obviously normally stored in packed form. Also,
deleting a blob from history may not save as much space as its packed
size either, because another blob could be stored as a delta against
that blob, so when you remove one blob another blob's packed size may grow.
Also, the sum of the packed sizes can add up to more than the
repository size; if the same contents appeared in the repository in
multiple places, git will automatically de-dupe and store only one
copy, while the way sizes are added in this analysis adds the size
for each file path that has those contents. Further, if a file is
ever reverted to a previous version's contents, the previous
version's size will be counted multiple times in this analysis, even
though git will only store it once.
f.write("=== Deletions ===\n")
Whether a file is deleted is not a binary quality, since it can be
deleted on some branches but still exist in others. Also, it might
exist in an old tag, but have been deleted in versions newer than
that. More thorough tracking could be done, including looking at
merge commits where one side of history deleted and the other modified,
in order to give a more holistic picture of deletions. However, that
algorithm would not only be more complex to implement, it'd also be
quite difficult to present and interpret by users. Since --analyze
is just about getting a high-level rough picture of history, it instead
implements the simplistic rule that is good enough for 98% of cases:
A file is marked as deleted if the last commit in the fast-export
stream that mentions the file lists it as deleted.
This makes it dependent on topological ordering, but generally gives
the "right" answer.
f.write("=== Renames ===\n")
Renames share the same non-binary nature that deletions do, plus
additional challenges:
* If the renamed file is renamed again, instead of just two names for
a path you can have three or more.
* Rename pairs of the form (oldname, newname) that we consider to be
different names of the "same file" might only be valid over certain
commit ranges. For example, if a new commit reintroduces a file
named oldname, then new versions of oldname aren't the "same file"
anymore. We could try to portray this to the user, but it's easier
for the user to just break the pairing and only report unbroken
rename pairings to the user.
* The ability for users to rename files differently in different
branches means that our chains of renames will not necessarily be
linear but may branch out.
# Equivalence classes for names, so if folks only want to keep a
# certain set of paths, they know the old names they want to include
# too.
with open(os.path.join(reportdir, "renames.txt"), 'w') as f:
seen = set()
for pathname,equiv_group in sorted(stats['equivalence'].iteritems(),
key=lambda x:x[1]):
if equiv_group in seen:
f.write("{} ->\n ".format(equiv_group[0]) +
"\n ".join(equiv_group[1:]) +
# List directories in reverse sorted order of packed size
with open(os.path.join(reportdir, "directories-deleted-sizes.txt"), 'w') as f:
f.write("=== Deleted directories by reverse size ===\n")
f.write("Format: unpacked size, packed size, date deleted, directory name\n")
for dirname, size in sorted(dir_size['packed'].iteritems(),
key=lambda x:x[1], reverse=True):
if (dir_deleted_data[dirname]):
f.write(" {:10d} {:10d} {:10s} {}\n"
dirname or '<toplevel>'))
with open(os.path.join(reportdir, "directories-all-sizes.txt"), 'w') as f:
f.write("=== All directories by reverse size ===\n")
f.write("Format: unpacked size, packed size, date deleted, directory name\n")
for dirname, size in sorted(dir_size['packed'].iteritems(),
key=lambda x:x[1], reverse=True):
f.write(" {:10d} {:10d} {:10s} {}\n"
dirname or '<toplevel>'))
with open(os.path.join(reportdir, "directories-all-sizes.txt"), 'w') as f:
f.write("=== All directories by reverse size ===\n")
f.write("Format: unpacked size, packed size, date deleted, directory name\n")
for dirname, size in sorted(dir_size['packed'].iteritems(),
key=lambda x:x[1], reverse=True):
f.write(" {:10d} {:10d} {:10s} {}\n"
dirname or '<toplevel>'))
# List extensions in reverse sorted order of packed size
with open(os.path.join(reportdir, "extensions-deleted-sizes.txt"), 'w') as f:
f.write("=== Deleted extensions by reverse size ===\n")
f.write("Format: unpacked size, packed size, date deleted, extension name\n")
for extname, size in sorted(ext_size['packed'].iteritems(),
key=lambda x:x[1], reverse=True):
if (ext_deleted_data[extname]):
f.write(" {:10d} {:10d} {:10s} {}\n"
extname or '<no extension>'))
# List extensions in reverse sorted order of packed size
with open(os.path.join(reportdir, "extensions-deleted-sizes.txt"), 'w') as f:
f.write("=== Deleted extensions by reverse size ===\n")
f.write("Format: unpacked size, packed size, date deleted, extension name\n")
for extname, size in sorted(ext_size['packed'].iteritems(),
key=lambda x:x[1], reverse=True):
if (ext_deleted_data[extname]):
with open(os.path.join(reportdir, "extensions-all-sizes.txt"), 'w') as f:
f.write("=== All extensions by reverse size ===\n")
f.write("Format: unpacked size, packed size, date deleted, extension name\n")
for extname, size in sorted(ext_size['packed'].iteritems(),
key=lambda x:x[1], reverse=True):
f.write(" {:10d} {:10d} {:10s} {}\n"
extname or '<no extension>'))
with open(os.path.join(reportdir, "extensions-all-sizes.txt"), 'w') as f:
f.write("=== All extensions by reverse size ===\n")
f.write("Format: unpacked size, packed size, date deleted, extension name\n")
for extname, size in sorted(ext_size['packed'].iteritems(),
key=lambda x:x[1], reverse=True):
f.write(" {:10d} {:10d} {:10s} {}\n"
extname or '<no extension>'))
# List files in reverse sorted order of packed size
with open(os.path.join(reportdir, "path-deleted-sizes.txt"), 'w') as f:
f.write("=== Deleted paths by reverse accumulated size ===\n")
f.write("Format: unpacked size, packed size, date deleted, path name(s)\n")
for pathname, size in sorted(path_size['packed'].iteritems(),
key=lambda x:x[1], reverse=True):
when = stats['file_deletions'].get(pathname, None)
if when:
f.write(" {:10d} {:10d} {:10s} {}\n"
# List files in reverse sorted order of packed size
with open(os.path.join(reportdir, "path-deleted-sizes.txt"), 'w') as f:
f.write("=== Deleted paths by reverse accumulated size ===\n")
f.write("Format: unpacked size, packed size, date deleted, path name(s)\n")
for pathname, size in sorted(path_size['packed'].iteritems(),
key=lambda x:x[1], reverse=True):
when = stats['file_deletions'].get(pathname, None)
if when:
with open(os.path.join(reportdir, "path-all-sizes.txt"), 'w') as f:
f.write("=== All paths by reverse accumulated size ===\n")
f.write("Format: unpacked size, packed size, date deleted, pathectory name\n")
for pathname, size in sorted(path_size['packed'].iteritems(),
key=lambda x:x[1], reverse=True):
when = stats['file_deletions'].get(pathname, None)
f.write(" {:10d} {:10d} {:10s} {}\n"
with open(os.path.join(reportdir, "path-all-sizes.txt"), 'w') as f:
f.write("=== All paths by reverse accumulated size ===\n")
f.write("Format: unpacked size, packed size, date deleted, pathectory name\n")
for pathname, size in sorted(path_size['packed'].iteritems(),
key=lambda x:x[1], reverse=True):
when = stats['file_deletions'].get(pathname, None)
f.write(" {:10d} {:10d} {:10s} {}\n"
# List of filenames and sizes in descending order
with open(os.path.join(reportdir, "blob-shas-and-paths.txt"), 'w') as f:
f.write("== Files by sha and associated pathnames in reverse size ==\n")
f.write("Format: sha, unpacked size, packed size, filename(s) object stored as\n")
for sha, size in sorted(stats['packed_size'].iteritems(),
key=lambda x:x[1], reverse=True):
if sha not in stats['names']:
# Some objects in the repository might not be referenced, or not
# referenced by the branches/tags the user cares about; skip them.
names_with_sha = stats['names'][sha]
if len(names_with_sha) == 1:
names_with_sha = names_with_sha.pop()
names_with_sha = sorted(list(names_with_sha))
f.write(" {} {:10d} {:10d} {}\n".format(sha,
# List of filenames and sizes in descending order
with open(os.path.join(reportdir, "blob-shas-and-paths.txt"), 'w') as f:
f.write("== Files by sha and associated pathnames in reverse size ==\n")
f.write("Format: sha, unpacked size, packed size, filename(s) object stored as\n")
for sha, size in sorted(stats['packed_size'].iteritems(),
key=lambda x:x[1], reverse=True):
if sha not in stats['names']:
# Some objects in the repository might not be referenced, or not
# referenced by the branches/tags the user cares about; skip them.
names_with_sha = stats['names'][sha]
if len(names_with_sha) == 1:
names_with_sha = names_with_sha.pop()
names_with_sha = sorted(list(names_with_sha))
f.write(" {} {:10d} {:10d} {}\n".format(sha,
def run(args, git_dir):
# Create the report directory as necessary
results_tmp_dir = os.path.join(git_dir, 'filter-repo')
if not os.path.isdir(results_tmp_dir):
reportdir = os.path.join(results_tmp_dir, "analysis")
if not args.force and os.path.isdir(reportdir):
raise SystemExit("Error: {} already exists; refusing to overwrite!".
def do_analysis(args, git_dir):
# Create the report directory as necessary
results_tmp_dir = os.path.join(git_dir, 'filter-repo')
if not os.path.isdir(results_tmp_dir):
reportdir = os.path.join(results_tmp_dir, "analysis")
if not args.force and os.path.isdir(reportdir):
raise SystemExit("Error: {} already exists; refusing to overwrite!".
# Gather the data we need
stats = RepoAnalyze.gather_data(args)
# Gather the data we need
stats = gather_data(args)
# Write the reports
sys.stdout.write("Writing reports to {}...".format(reportdir))
write_report(reportdir, stats)
# Write the reports
sys.stdout.write("Writing reports to {}...".format(reportdir))
RepoAnalyze.write_report(reportdir, stats)
def sanity_check(refs, is_bare):
def abort(reason):
@ -2506,7 +2524,7 @@ def run_fast_filter():
# Do analysis, if requested
if args.analyze:
do_analysis(args, git_dir), git_dir)
# Do sanity checks