filter-repo: add packed sizes to --analyze reports

Signed-off-by: Elijah Newren <newren@gmail.com>
Elijah Newren 2018-11-15 12:04:44 -08:00
parent 7048be2849
commit a2540f4087


@@ -1824,19 +1824,23 @@ def analyze_commit(args, commit):
def gather_data(args):
# Get sizes of blobs by sha1
cf = subprocess.Popen('git cat-file --batch-check --batch-all-objects'.split(),
a='--batch-check=%(objectname) %(objecttype) %(objectsize) %(objectsize:disk)'
cf = subprocess.Popen(['git', 'cat-file', '--batch-all-objects', a],
stdout = subprocess.PIPE)
size = {}
unpacked_size = {}
packed_size = {}
for line in cf.stdout:
sha, objtype, shasize = line.split()
shasize = int(shasize)
sha, objtype, objsize, objdisksize = line.split()
objsize, objdisksize = int(objsize), int(objdisksize)
if objtype == 'blob':
size[sha] = shasize
unpacked_size[sha] = objsize
packed_size[sha] = objdisksize
stats = {'names': collections.defaultdict(set),
'allnames' : set(),
'deletions': {},
'equivalence': {},
'size': size}
'unpacked_size': unpacked_size,
'packed_size': packed_size}
# Setup the fast-export process
fep_cmd = ['git', 'fast-export',
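
(Illustration, not part of this commit.) A minimal standalone sketch, in Python 3, of what the updated gather_data() hunk asks git for: the logical size and the on-disk (packed) size of every object, keeping only the blobs. It can be run from inside any git repository.

    import subprocess

    fmt = ('--batch-check=%(objectname) %(objecttype) '
           '%(objectsize) %(objectsize:disk)')
    out = subprocess.check_output(
              ['git', 'cat-file', '--batch-all-objects', fmt]).decode()

    unpacked_size = {}   # sha -> size with no delta-ing and no compression
    packed_size = {}     # sha -> size as currently stored on disk
    for line in out.splitlines():
        sha, objtype, objsize, objdisksize = line.split()
        if objtype == 'blob':
            unpacked_size[sha] = int(objsize)
            packed_size[sha] = int(objdisksize)

    print('{} blobs, {} bytes unpacked, {} bytes packed'.format(
          len(unpacked_size), sum(unpacked_size.values()),
          sum(packed_size.values())))
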
@@ -1852,7 +1856,6 @@ def gather_data(args):
output = open(os.devnull, 'w')
# Create and run the filter
setattr(args, 'size', size)
setattr(args, 'stats', stats)
analyze_filter = FastExportFilter(
commit_callback = lambda c : analyze_commit(args, c),
@@ -1890,20 +1893,25 @@ def do_analysis(args, git_dir):
if path == '':
break
# Compute aggregate unpacked size information for paths, extensions, and dirs
total_size = 0
path_size = collections.defaultdict(int)
ext_size = collections.defaultdict(int)
dir_size = collections.defaultdict(int)
# Compute aggregate size information for paths, extensions, and dirs
total_size = {'packed': 0, 'unpacked': 0}
path_size = {'packed': collections.defaultdict(int),
'unpacked': collections.defaultdict(int)}
ext_size = {'packed': collections.defaultdict(int),
'unpacked': collections.defaultdict(int)}
dir_size = {'packed': collections.defaultdict(int),
'unpacked': collections.defaultdict(int)}
for sha in args.stats['names']:
size = args.size[sha]
for name in args.stats['names'][sha]:
total_size += size
path_size[name] += size
basename, ext = os.path.splitext(name)
ext_size[ext] += size
for dirname in dirnames(name):
dir_size[dirname] += size
size = {'packed': args.stats['packed_size'][sha],
'unpacked': args.stats['unpacked_size'][sha]}
for which in ('packed', 'unpacked'):
for name in args.stats['names'][sha]:
total_size[which] += size[which]
path_size[which][name] += size[which]
basename, ext = os.path.splitext(name)
ext_size[which][ext] += size[which]
for dirname in dirnames(name):
dir_size[which][dirname] += size[which]
# Determine if and when extensions and directories were deleted
ext_deleted_data = {}
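
(Illustration, not part of this commit.) The aggregation hunk above adds each blob's size once per path name it ever appeared under. A self-contained example with made-up data shows the accumulation pattern and why the totals can exceed the true repository size:

    import collections

    # Hypothetical inputs: one blob reachable under two paths, one under a single path.
    names = {'sha1': {'docs/a.txt', 'src/a.txt'}, 'sha2': {'big.bin'}}
    unpacked_size = {'sha1': 1000, 'sha2': 50000}
    packed_size = {'sha1': 300, 'sha2': 49000}

    total_size = {'packed': 0, 'unpacked': 0}
    path_size = {'packed': collections.defaultdict(int),
                 'unpacked': collections.defaultdict(int)}

    for sha, paths in names.items():
        size = {'packed': packed_size[sha], 'unpacked': unpacked_size[sha]}
        for which in ('packed', 'unpacked'):
            for name in paths:
                total_size[which] += size[which]       # added once per path name
                path_size[which][name] += size[which]

    # sha1 is counted twice (two path names), so the totals exceed the real repo size:
    print(total_size)   # {'packed': 49600, 'unpacked': 52000}
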
@@ -1935,20 +1943,48 @@ def do_analysis(args, git_dir):
# Give a basic overview of this file
f.write("== Overal Statistics ==\n")
f.write(" Number of commits: {}\n".format(args.num_commits))
f.write(" Number of filenames: {}\n".format(len(path_size)))
f.write(" Number of directories: {}\n".format(len(dir_size)))
f.write(" Number of file extensions: {}\n".format(len(ext_size)))
f.write(" Number of filenames: {}\n".format(len(path_size['packed'])))
f.write(" Number of directories: {}\n".format(len(dir_size['packed'])))
f.write(" Number of file extensions: {}\n".format(len(ext_size['packed'])))
f.write("\n")
f.write(" Total unpacked size: {}\n".format(total_size))
f.write("\n")
f.write(" (Unpacked size represents what size your repository would be\n")
f.write(" if no trees, commits, tags, or other metadata were included\n")
f.write(" AND if no files were packed; i.e., without delta-ing and\n")
f.write(" without compression.)\n")
f.write(" Total unpacked size (bytes): {:10d}\n"
.format(total_size['unpacked']))
f.write(" Total packed size (bytes): {:10d}\n"
.format(total_size['packed']))
f.write("\n")
# Mention issues with the report
f.write("== Caveats ==\n")
f.write("=== Sizes ===\n")
f.write(textwrap.dedent("""
Packed size represents what size your repository would be if no
trees, commits, tags, or other metadata were included (though it may
fail to represent de-duplication; see below). It also represents the
current packing, which may be suboptimal if you haven't gc'ed for a
while.

Unpacked size represents what size your repository would be if no
trees, commits, tags, or other metadata were included AND if no
files were packed; i.e., without delta-ing or compression.

Both unpacked and packed sizes can be slightly misleading. Deleting
a blob from history will not save as much space as the unpacked size,
because it is normally stored in packed form. Also, deleting a blob
from history may not save as much space as its packed size either,
because another blob could be stored as a delta against that blob, so
when you remove one blob another blob's packed size may grow.

Also, the sum of the packed sizes can add up to more than the
repository size; if the same contents appeared in the repository in
multiple places, git will automatically de-dupe and store only one
copy, while the way sizes are added in this analysis adds the size
for each file path that has those contents. Further, if a file is
ever reverted to a previous version's contents, the previous
version's size will be counted multiple times in this analysis, even
though git will only store it once.
"""[1:]))
f.write("\n")
f.write("=== Deletions ===\n")
f.write(textwrap.dedent("""
Whether a file is deleted is not a binary quality, since it can be
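
(Illustration, not part of this commit.) The de-duplication caveat above can be made concrete with a small sketch, assuming ordinary SHA-1 object names: identical contents under different paths map to a single blob, so git stores the bytes once while the size reports count them once per path.

    import hashlib

    def blob_sha(data):
        # A git blob's object name is SHA-1 over the header "blob <len>\0"
        # followed by the file contents.
        header = b'blob %d\x00' % len(data)
        return hashlib.sha1(header + data).hexdigest()

    contents = b'the same bytes, stored under two different paths\n'
    print(blob_sha(contents))   # one object name ...
    print(blob_sha(contents))   # ... no matter how many paths reuse the contents
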
@@ -2014,66 +2050,83 @@ def do_analysis(args, git_dir):
# List directories in reverse sorted order of unpacked size
with open(os.path.join(reportdir, "directories-deleted-sizes.txt"), 'w') as f:
f.write("=== Deleted directories by reverse size ===\n")
f.write("Format: size (bytes), date deleted, directory name\n")
for dirname, size in sorted(dir_size.iteritems(),
f.write("Format: unpacked size, packed size, date deleted, directory name\n")
for dirname, size in sorted(dir_size['packed'].iteritems(),
key=lambda x:x[1], reverse=True):
if (dir_deleted_data[dirname]):
f.write(" {:10d} {:10s} {}\n".format(size,
datestr(dir_deleted_data[dirname]),
dirname or '<toplevel>'))
f.write(" {:10d} {:10d} {:10s} {}\n"
.format(dir_size['unpacked'][dirname],
size,
datestr(dir_deleted_data[dirname]),
dirname or '<toplevel>'))
with open(os.path.join(reportdir, "directories-all-sizes.txt"), 'w') as f:
f.write("=== All directories by reverse size ===\n")
f.write("Format: size (bytes), date deleted, directory name\n")
for dirname, size in sorted(dir_size.iteritems(),
f.write("Format: unpacked size, packed size, date deleted, directory name\n")
for dirname, size in sorted(dir_size['packed'].iteritems(),
key=lambda x:x[1], reverse=True):
f.write(" {:10d} {:10s} {}\n".format(size, datestr(dir_deleted_data[dirname]),
dirname or '<toplevel>'))
f.write(" {:10d} {:10d} {:10s} {}\n"
.format(dir_size['unpacked'][dirname],
size,
datestr(dir_deleted_data[dirname]),
dirname or '<toplevel>'))
# List extensions in reverse sorted order of unpacked size
with open(os.path.join(reportdir, "extensions-deleted-sizes.txt"), 'w') as f:
f.write("=== Deleted extensions by reverse size ===\n")
f.write("Format: size (bytes), date deleted, extension name\n")
for extname, size in sorted(ext_size.iteritems(),
f.write("Format: unpacked size, packed size, date deleted, extension name\n")
for extname, size in sorted(ext_size['packed'].iteritems(),
key=lambda x:x[1], reverse=True):
if (ext_deleted_data[extname]):
f.write(" {:10d} {:10s} {}\n".format(size,
datestr(ext_deleted_data[extname]),
extname or '<no extension>'))
f.write(" {:10d} {:10d} {:10s} {}\n"
.format(ext_size['unpacked'][extname],
size,
datestr(ext_deleted_data[extname]),
extname or '<no extension>'))
with open(os.path.join(reportdir, "extensions-all-sizes.txt"), 'w') as f:
f.write("=== All extensions by reverse size ===\n")
f.write("Format: size (bytes), date deleted, extension name\n")
for extname, size in sorted(ext_size.iteritems(),
f.write("Format: unpacked size, packed size, date deleted, extension name\n")
for extname, size in sorted(ext_size['packed'].iteritems(),
key=lambda x:x[1], reverse=True):
f.write(" {:10d} {:10s} {}\n".format(size,
datestr(ext_deleted_data[extname]),
extname or '<no extension>'))
f.write(" {:10d} {:10d} {:10s} {}\n"
.format(ext_size['unpacked'][extname],
size,
datestr(ext_deleted_data[extname]),
extname or '<no extension>'))
# List files in reverse sorted order of unpacked size
with open(os.path.join(reportdir, "path-deleted-sizes.txt"), 'w') as f:
f.write("=== Deleted paths by reverse accumulated size ===\n")
f.write("Format: size (bytes), date deleted, path name(s)\n")
for pathname, size in sorted(path_size.iteritems(),
f.write("Format: unpacked size, packed size, date deleted, path name(s)\n")
for pathname, size in sorted(path_size['packed'].iteritems(),
key=lambda x:x[1], reverse=True):
when = args.stats['deletions'].get(pathname, None)
if when:
f.write(" {:10d} {:10s} {}\n".format(size, datestr(when), pathname))
f.write(" {:10d} {:10d} {:10s} {}\n"
.format(path_size['unpacked'][pathname],
size,
datestr(when),
pathname))
with open(os.path.join(reportdir, "path-all-sizes.txt"), 'w') as f:
f.write("=== All paths by reverse accumulated size ===\n")
f.write("Format: size (bytes), date deleted, pathectory name\n")
for pathname, size in sorted(path_size.iteritems(),
key=lambda x:x[1], reverse=True):
f.write("Format: unpacked size, packed size, date deleted, pathectory name\n")
for pathname, size in sorted(path_size['packed'].iteritems(),
key=lambda x:x[1], reverse=True):
when = args.stats['deletions'].get(pathname, None)
f.write(" {:10d} {:10s} {}\n".format(size, datestr(when), pathname))
f.write(" {:10d} {:10d} {:10s} {}\n"
.format(path_size['unpacked'][pathname],
size,
datestr(when),
pathname))
# List of filenames and sizes in descending order
with open(os.path.join(reportdir, "blob-shas-and-paths.txt"), 'w') as f:
f.write("== Files by sha and associated pathnames in reverse size ==\n")
f.write("Format: sha, size (bytes), filename(s) object stored as\n")
for sha, size in sorted(args.size.iteritems(), key=lambda x:x[1],
reverse=True):
f.write("Format: sha, unpacked size, packed size, filename(s) object stored as\n")
for sha, size in sorted(args.stats['packed_size'].iteritems(),
key=lambda x:x[1], reverse=True):
if sha not in args.stats['names']:
# Some objects in the repository might not be referenced, or not
# referenced by the branches/tags the user cares about; skip them.
@@ -2083,7 +2136,10 @@ def do_analysis(args, git_dir):
names_with_sha = names_with_sha.pop()
else:
names_with_sha = sorted(list(names_with_sha))
f.write(" {} {:9d} {}\n".format(sha, size, names_with_sha))
f.write(" {} {:10d} {:10d} {}\n".format(sha,
args.stats['unpacked_size'][sha],
size,
names_with_sha))
# Notify the user where they can find the reports
print("Reports written to {}".format(reportdir))