filter-repo: add packed sizes to --analyze reports

Signed-off-by: Elijah Newren <newren@gmail.com>
Elijah Newren 2018-11-15 12:04:44 -08:00
parent 7048be2849
commit a2540f4087


@@ -1824,19 +1824,23 @@ def analyze_commit(args, commit):
 def gather_data(args):
   # Get sizes of blobs by sha1
-  cf = subprocess.Popen('git cat-file --batch-check --batch-all-objects'.split(),
+  a='--batch-check=%(objectname) %(objecttype) %(objectsize) %(objectsize:disk)'
+  cf = subprocess.Popen(['git', 'cat-file', '--batch-all-objects', a],
                         stdout = subprocess.PIPE)
-  size = {}
+  unpacked_size = {}
+  packed_size = {}
   for line in cf.stdout:
-    sha, objtype, shasize = line.split()
-    shasize = int(shasize)
+    sha, objtype, objsize, objdisksize = line.split()
+    objsize, objdisksize = int(objsize), int(objdisksize)
     if objtype == 'blob':
-      size[sha] = shasize
+      unpacked_size[sha] = objsize
+      packed_size[sha] = objdisksize
 
   stats = {'names': collections.defaultdict(set),
            'allnames' : set(),
            'deletions': {},
            'equivalence': {},
-           'size': size}
+           'unpacked_size': unpacked_size,
+           'packed_size': packed_size}
 
   # Setup the fast-export process
   fep_cmd = ['git', 'fast-export',
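
The new invocation relies on git cat-file's --batch-check format atoms: %(objectsize) is the object's raw (unpacked) size, while %(objectsize:disk) is the number of bytes the object currently takes up on disk, loose or packed. A minimal standalone sketch of the same idea, written in Python 2 like the script itself (variable names here are illustrative, not part of the commit):

    import subprocess

    fmt = ('--batch-check=%(objectname) %(objecttype) '
           '%(objectsize) %(objectsize:disk)')
    p = subprocess.Popen(['git', 'cat-file', '--batch-all-objects', fmt],
                         stdout=subprocess.PIPE)
    unpacked_size, packed_size = {}, {}
    for line in p.stdout:
      # One object per line: "<sha> <type> <size> <size-on-disk>"
      sha, objtype, objsize, objdisksize = line.split()
      if objtype == 'blob':
        unpacked_size[sha] = int(objsize)    # size before compression/delta
        packed_size[sha] = int(objdisksize)  # size as currently stored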
@@ -1852,7 +1856,6 @@ def gather_data(args):
     output = open(os.devnull, 'w')
 
   # Create and run the filter
-  setattr(args, 'size', size)
   setattr(args, 'stats', stats)
   analyze_filter = FastExportFilter(
                        commit_callback = lambda c : analyze_commit(args, c),
@@ -1890,20 +1893,25 @@ def do_analysis(args, git_dir):
       if path == '':
         break
 
-  # Compute aggregate unpacked size information for paths, extensions, and dirs
-  total_size = 0
-  path_size = collections.defaultdict(int)
-  ext_size = collections.defaultdict(int)
-  dir_size = collections.defaultdict(int)
+  # Compute aggregate size information for paths, extensions, and dirs
+  total_size = {'packed': 0, 'unpacked': 0}
+  path_size = {'packed': collections.defaultdict(int),
+               'unpacked': collections.defaultdict(int)}
+  ext_size = {'packed': collections.defaultdict(int),
+              'unpacked': collections.defaultdict(int)}
+  dir_size = {'packed': collections.defaultdict(int),
+              'unpacked': collections.defaultdict(int)}
   for sha in args.stats['names']:
-    size = args.size[sha]
-    for name in args.stats['names'][sha]:
-      total_size += size
-      path_size[name] += size
-      basename, ext = os.path.splitext(name)
-      ext_size[ext] += size
-      for dirname in dirnames(name):
-        dir_size[dirname] += size
+    size = {'packed': args.stats['packed_size'][sha],
+            'unpacked': args.stats['unpacked_size'][sha]}
+    for which in ('packed', 'unpacked'):
+      for name in args.stats['names'][sha]:
+        total_size[which] += size[which]
+        path_size[which][name] += size[which]
+        basename, ext = os.path.splitext(name)
+        ext_size[which][ext] += size[which]
+        for dirname in dirnames(name):
+          dir_size[which][dirname] += size[which]
 
   # Determine if and when extensions and directories were deleted
   ext_deleted_data = {}
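
Because sizes are accumulated per path name, a blob reachable under several paths is counted once for each of them. A toy sketch of that accumulation, using a hypothetical sha and made-up sizes:

    import collections

    # Hypothetical data: one blob reachable under two different paths.
    names = {'0123abcd': set(['src/a.c', 'lib/a.c'])}
    sizes = {'0123abcd': {'packed': 400, 'unpacked': 1000}}

    path_size = {'packed': collections.defaultdict(int),
                 'unpacked': collections.defaultdict(int)}
    for sha, paths in names.items():
      for which in ('packed', 'unpacked'):
        for name in paths:
          path_size[which][name] += sizes[sha][which]

    # path_size['unpacked'] -> {'src/a.c': 1000, 'lib/a.c': 1000}: the single
    # 1000-byte blob contributes to both paths, so per-path sums can exceed
    # the repository's actual size (the caveat spelled out in the next hunk).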
@@ -1935,20 +1943,48 @@ def do_analysis(args, git_dir):
     # Give a basic overview of this file
     f.write("== Overal Statistics ==\n")
     f.write("  Number of commits: {}\n".format(args.num_commits))
-    f.write("  Number of filenames: {}\n".format(len(path_size)))
-    f.write("  Number of directories: {}\n".format(len(dir_size)))
-    f.write("  Number of file extensions: {}\n".format(len(ext_size)))
+    f.write("  Number of filenames: {}\n".format(len(path_size['packed'])))
+    f.write("  Number of directories: {}\n".format(len(dir_size['packed'])))
+    f.write("  Number of file extensions: {}\n".format(len(ext_size['packed'])))
     f.write("\n")
-    f.write("  Total unpacked size: {}\n".format(total_size))
-    f.write("\n")
-    f.write("  (Unpacked size represents what size your repository would be\n")
-    f.write("  if no trees, commits, tags, or other metadata were included\n")
-    f.write("  AND if no files were packed; i.e., without delta-ing and\n")
-    f.write("  without compression.)\n")
+    f.write("  Total unpacked size (bytes): {:10d}\n"
+            .format(total_size['unpacked']))
+    f.write("  Total packed size (bytes): {:10d}\n"
+            .format(total_size['packed']))
     f.write("\n")
 
     # Mention issues with the report
     f.write("== Caveats ==\n")
+    f.write("=== Sizes ===\n")
+    f.write(textwrap.dedent("""
+      Packed size represents what size your repository would be if no
+      trees, commits, tags, or other metadata were included (though it may
+      fail to represent de-duplication; see below).  It also represents the
+      current packing, which may be suboptimal if you haven't gc'ed for a
+      while.
+
+      Unpacked size represents what size your repository would be if no
+      trees, commits, tags, or other metadata were included AND if no
+      files were packed; i.e., without delta-ing or compression.
+
+      Both unpacked and packed sizes can be slightly misleading.  Deleting
+      a blob from history does not save as much space as the unpacked size,
+      because it is obviously normally stored in packed form.  Also,
+      deleting a blob from history may not save as much space as its packed
+      size either, because another blob could be stored as a delta against
+      that blob, so when you remove one blob another blob's packed size may
+      grow.
+
+      Also, the sum of the packed sizes can add up to more than the
+      repository size; if the same contents appeared in the repository in
+      multiple places, git will automatically de-dupe and store only one
+      copy, while the way sizes are added in this analysis adds the size
+      for each file path that has those contents.  Further, if a file is
+      ever reverted to a previous version's contents, the previous
+      version's size will be counted multiple times in this analysis, even
+      though git will only store it once.
+      """[1:]))
+    f.write("\n")
     f.write("=== Deletions ===\n")
     f.write(textwrap.dedent("""
       Whether a file is deleted is not a binary quality, since it can be
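
The de-duplication caveat above follows from how git names blobs: the object id is a hash of the content plus a small header, so identical contents at different paths or in different commits are stored exactly once. A small illustration, again in Python 2 to match the script:

    import hashlib

    def blob_id(data):
      # Git's blob id is the SHA-1 of "blob <length>\0" followed by the content.
      return hashlib.sha1('blob %d\0' % len(data) + data).hexdigest()

    # Identical content under two paths yields the same object id, hence one
    # stored object, even though the analysis counts its size once per path.
    assert blob_id('hello\n') == blob_id('hello\n')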
@@ -2014,66 +2050,83 @@ def do_analysis(args, git_dir):
   # List directories in reverse sorted order of unpacked size
   with open(os.path.join(reportdir, "directories-deleted-sizes.txt"), 'w') as f:
     f.write("=== Deleted directories by reverse size ===\n")
-    f.write("Format: size (bytes), date deleted, directory name\n")
-    for dirname, size in sorted(dir_size.iteritems(),
+    f.write("Format: unpacked size, packed size, date deleted, directory name\n")
+    for dirname, size in sorted(dir_size['packed'].iteritems(),
                                 key=lambda x:x[1], reverse=True):
       if (dir_deleted_data[dirname]):
-        f.write("  {:10d} {:10s} {}\n".format(size,
-                                              datestr(dir_deleted_data[dirname]),
-                                              dirname or '<toplevel>'))
+        f.write("  {:10d} {:10d} {:10s} {}\n"
+                .format(dir_size['unpacked'][dirname],
+                        size,
+                        datestr(dir_deleted_data[dirname]),
+                        dirname or '<toplevel>'))
 
   with open(os.path.join(reportdir, "directories-all-sizes.txt"), 'w') as f:
     f.write("=== All directories by reverse size ===\n")
-    f.write("Format: size (bytes), date deleted, directory name\n")
-    for dirname, size in sorted(dir_size.iteritems(),
+    f.write("Format: unpacked size, packed size, date deleted, directory name\n")
+    for dirname, size in sorted(dir_size['packed'].iteritems(),
                                 key=lambda x:x[1], reverse=True):
-      f.write("  {:10d} {:10s} {}\n".format(size, datestr(dir_deleted_data[dirname]),
-                                            dirname or '<toplevel>'))
+      f.write("  {:10d} {:10d} {:10s} {}\n"
+              .format(dir_size['unpacked'][dirname],
+                      size,
+                      datestr(dir_deleted_data[dirname]),
+                      dirname or '<toplevel>'))
 
   # List extensions in reverse sorted order of unpacked size
   with open(os.path.join(reportdir, "extensions-deleted-sizes.txt"), 'w') as f:
     f.write("=== Deleted extensions by reverse size ===\n")
-    f.write("Format: size (bytes), date deleted, extension name\n")
-    for extname, size in sorted(ext_size.iteritems(),
+    f.write("Format: unpacked size, packed size, date deleted, extension name\n")
+    for extname, size in sorted(ext_size['packed'].iteritems(),
                                 key=lambda x:x[1], reverse=True):
       if (ext_deleted_data[extname]):
-        f.write("  {:10d} {:10s} {}\n".format(size,
-                                              datestr(ext_deleted_data[extname]),
-                                              extname or '<no extension>'))
+        f.write("  {:10d} {:10d} {:10s} {}\n"
+                .format(ext_size['unpacked'][extname],
+                        size,
+                        datestr(ext_deleted_data[extname]),
+                        extname or '<no extension>'))
 
   with open(os.path.join(reportdir, "extensions-all-sizes.txt"), 'w') as f:
     f.write("=== All extensions by reverse size ===\n")
-    f.write("Format: size (bytes), date deleted, extension name\n")
-    for extname, size in sorted(ext_size.iteritems(),
+    f.write("Format: unpacked size, packed size, date deleted, extension name\n")
+    for extname, size in sorted(ext_size['packed'].iteritems(),
                                 key=lambda x:x[1], reverse=True):
-      f.write("  {:10d} {:10s} {}\n".format(size,
-                                            datestr(ext_deleted_data[extname]),
-                                            extname or '<no extension>'))
+      f.write("  {:10d} {:10d} {:10s} {}\n"
+              .format(ext_size['unpacked'][extname],
+                      size,
+                      datestr(ext_deleted_data[extname]),
+                      extname or '<no extension>'))
 
   # List files in reverse sorted order of unpacked size
   with open(os.path.join(reportdir, "path-deleted-sizes.txt"), 'w') as f:
     f.write("=== Deleted paths by reverse accumulated size ===\n")
-    f.write("Format: size (bytes), date deleted, path name(s)\n")
-    for pathname, size in sorted(path_size.iteritems(),
+    f.write("Format: unpacked size, packed size, date deleted, path name(s)\n")
+    for pathname, size in sorted(path_size['packed'].iteritems(),
                                  key=lambda x:x[1], reverse=True):
       when = args.stats['deletions'].get(pathname, None)
       if when:
-        f.write("  {:10d} {:10s} {}\n".format(size, datestr(when), pathname))
+        f.write("  {:10d} {:10d} {:10s} {}\n"
+                .format(path_size['unpacked'][pathname],
+                        size,
+                        datestr(when),
+                        pathname))
 
   with open(os.path.join(reportdir, "path-all-sizes.txt"), 'w') as f:
     f.write("=== All paths by reverse accumulated size ===\n")
-    f.write("Format: size (bytes), date deleted, pathectory name\n")
-    for pathname, size in sorted(path_size.iteritems(),
+    f.write("Format: unpacked size, packed size, date deleted, path name\n")
+    for pathname, size in sorted(path_size['packed'].iteritems(),
                                  key=lambda x:x[1], reverse=True):
       when = args.stats['deletions'].get(pathname, None)
-      f.write("  {:10d} {:10s} {}\n".format(size, datestr(when), pathname))
+      f.write("  {:10d} {:10d} {:10s} {}\n"
+              .format(path_size['unpacked'][pathname],
+                      size,
+                      datestr(when),
+                      pathname))
 
   # List of filenames and sizes in descending order
   with open(os.path.join(reportdir, "blob-shas-and-paths.txt"), 'w') as f:
     f.write("== Files by sha and associated pathnames in reverse size ==\n")
-    f.write("Format: sha, size (bytes), filename(s) object stored as\n")
-    for sha, size in sorted(args.size.iteritems(), key=lambda x:x[1],
-                            reverse=True):
+    f.write("Format: sha, unpacked size, packed size, filename(s) object stored as\n")
+    for sha, size in sorted(args.stats['packed_size'].iteritems(),
+                            key=lambda x:x[1], reverse=True):
       if sha not in args.stats['names']:
         # Some objects in the repository might not be referenced, or not
         # referenced by the branches/tags the user cares about; skip them.
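
Every report now prints two right-aligned size columns, unpacked then packed, ahead of the deletion date and the name. A hypothetical row formatted the same way as above (the numbers are made up for illustration):

    # "  {:10d} {:10d} {:10s} {}\n" with example values:
    row = "  {:10d} {:10d} {:10s} {}\n".format(1048576, 65536,
                                               '2018-11-15', 'src/old-assets')
    # -> "     1048576      65536 2018-11-15 src/old-assets\n"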
@@ -2083,7 +2136,10 @@ def do_analysis(args, git_dir):
         names_with_sha = names_with_sha.pop()
       else:
         names_with_sha = sorted(list(names_with_sha))
-      f.write("  {} {:9d} {}\n".format(sha, size, names_with_sha))
+      f.write("  {} {:10d} {:10d} {}\n".format(sha,
+                                               args.stats['unpacked_size'][sha],
+                                               size,
+                                               names_with_sha))
 
   # Notify the user where they can find the reports
   print("Reports written to {}".format(reportdir))