From a2540f408709798d9eb5929d6508ab7026a91c27 Mon Sep 17 00:00:00 2001
From: Elijah Newren
Date: Thu, 15 Nov 2018 12:04:44 -0800
Subject: [PATCH] filter-repo: add packed sizes to --analyze reports

Signed-off-by: Elijah Newren
---
 git-filter-repo | 174 ++++++++++++++++++++++++++++++++----------------
 1 file changed, 115 insertions(+), 59 deletions(-)

diff --git a/git-filter-repo b/git-filter-repo
index 9cb8450..07bc63b 100755
--- a/git-filter-repo
+++ b/git-filter-repo
@@ -1824,19 +1824,23 @@ def analyze_commit(args, commit):
 
 def gather_data(args):
   # Get sizes of blobs by sha1
-  cf = subprocess.Popen('git cat-file --batch-check --batch-all-objects'.split(),
+  a='--batch-check=%(objectname) %(objecttype) %(objectsize) %(objectsize:disk)'
+  cf = subprocess.Popen(['git', 'cat-file', '--batch-all-objects', a],
                         stdout = subprocess.PIPE)
-  size = {}
+  unpacked_size = {}
+  packed_size = {}
   for line in cf.stdout:
-    sha, objtype, shasize = line.split()
-    shasize = int(shasize)
+    sha, objtype, objsize, objdisksize = line.split()
+    objsize, objdisksize = int(objsize), int(objdisksize)
     if objtype == 'blob':
-      size[sha] = shasize
+      unpacked_size[sha] = objsize
+      packed_size[sha] = objdisksize
   stats = {'names': collections.defaultdict(set),
            'allnames' : set(),
            'deletions': {},
            'equivalence': {},
-           'size': size}
+           'unpacked_size': unpacked_size,
+           'packed_size': packed_size}
 
   # Setup the fast-export process
   fep_cmd = ['git', 'fast-export',
@@ -1852,7 +1856,6 @@ def gather_data(args):
     output = open(os.devnull, 'w')
 
   # Create and run the filter
-  setattr(args, 'size', size)
   setattr(args, 'stats', stats)
   analyze_filter = FastExportFilter(
                        commit_callback = lambda c : analyze_commit(args, c),
@@ -1890,20 +1893,25 @@ def do_analysis(args, git_dir):
       if path == '':
         break
 
-  # Compute aggregate unpacked size information for paths, extensions, and dirs
-  total_size = 0
-  path_size = collections.defaultdict(int)
-  ext_size = collections.defaultdict(int)
-  dir_size = collections.defaultdict(int)
+  # Compute aggregate size information for paths, extensions, and dirs
+  total_size = {'packed': 0, 'unpacked': 0}
+  path_size = {'packed': collections.defaultdict(int),
+               'unpacked': collections.defaultdict(int)}
+  ext_size = {'packed': collections.defaultdict(int),
+              'unpacked': collections.defaultdict(int)}
+  dir_size = {'packed': collections.defaultdict(int),
+              'unpacked': collections.defaultdict(int)}
   for sha in args.stats['names']:
-    size = args.size[sha]
-    for name in args.stats['names'][sha]:
-      total_size += size
-      path_size[name] += size
-      basename, ext = os.path.splitext(name)
-      ext_size[ext] += size
-      for dirname in dirnames(name):
-        dir_size[dirname] += size
+    size = {'packed': args.stats['packed_size'][sha],
+            'unpacked': args.stats['unpacked_size'][sha]}
+    for which in ('packed', 'unpacked'):
+      for name in args.stats['names'][sha]:
+        total_size[which] += size[which]
+        path_size[which][name] += size[which]
+        basename, ext = os.path.splitext(name)
+        ext_size[which][ext] += size[which]
+        for dirname in dirnames(name):
+          dir_size[which][dirname] += size[which]
 
   # Determine if and when extensions and directories were deleted
   ext_deleted_data = {}
@@ -1935,20 +1943,48 @@ def do_analysis(args, git_dir):
     # Give a basic overview of this file
     f.write("== Overal Statistics ==\n")
     f.write("  Number of commits: {}\n".format(args.num_commits))
-    f.write("  Number of filenames: {}\n".format(len(path_size)))
-    f.write("  Number of directories: {}\n".format(len(dir_size)))
-    f.write("  Number of file extensions: {}\n".format(len(ext_size)))
+    f.write("  Number of filenames: {}\n".format(len(path_size['packed'])))
+    f.write("  Number of directories: {}\n".format(len(dir_size['packed'])))
+    f.write("  Number of file extensions: {}\n".format(len(ext_size['packed'])))
     f.write("\n")
-    f.write("  Total unpacked size: {}\n".format(total_size))
-    f.write("\n")
-    f.write("  (Unpacked size represents what size your repository would be\n")
-    f.write("  if no trees, commits, tags, or other metadata were included\n")
-    f.write("  AND if no files were packed; i.e., without delta-ing and\n")
-    f.write("  without compression.)\n")
+    f.write("  Total unpacked size (bytes): {:10d}\n"
+            .format(total_size['unpacked']))
+    f.write("  Total packed size (bytes): {:10d}\n"
+            .format(total_size['packed']))
     f.write("\n")
 
     # Mention issues with the report
     f.write("== Caveats ==\n")
+    f.write("=== Sizes ===\n")
+    f.write(textwrap.dedent("""
+      Packed size represents what size your repository would be if no
+      trees, commits, tags, or other metadata were included (though it may
+      fail to represent de-duplication; see below).  It also represents the
+      current packing, which may be suboptimal if you haven't gc'ed for a
+      while.
+
+      Unpacked size represents what size your repository would be if no
+      trees, commits, tags, or other metadata were included AND if no
+      files were packed; i.e., without delta-ing or compression.
+
+      Both unpacked and packed sizes can be slightly misleading.  Deleting
+      a blob from history may not save as much space as the unpacked size,
+      because it is obviously normally stored in packed form.  Also,
+      deleting a blob from history may not save as much space as its packed
+      size either, because another blob could be stored as a delta against
+      that blob, so when you remove one blob another blob's packed size may
+      grow.
+
+      Also, the sum of the packed sizes can add up to more than the
+      repository size; if the same contents appeared in the repository in
+      multiple places, git will automatically de-dupe and store only one
+      copy, while the way sizes are added in this analysis adds the size
+      for each file path that has those contents.  Further, if a file is
+      ever reverted to a previous version's contents, the previous
+      version's size will be counted multiple times in this analysis, even
+      though git will only store it once.
+      """[1:]))
+    f.write("\n")
     f.write("=== Deletions ===\n")
     f.write(textwrap.dedent("""
       Whether a file is deleted is not a binary quality, since it can be
@@ -2014,66 +2050,83 @@ def do_analysis(args, git_dir):
   # List directories in reverse sorted order of unpacked size
   with open(os.path.join(reportdir, "directories-deleted-sizes.txt"), 'w') as f:
     f.write("=== Deleted directories by reverse size ===\n")
-    f.write("Format: size (bytes), date deleted, directory name\n")
-    for dirname, size in sorted(dir_size.iteritems(),
+    f.write("Format: unpacked size, packed size, date deleted, directory name\n")
+    for dirname, size in sorted(dir_size['packed'].iteritems(),
                                 key=lambda x:x[1], reverse=True):
       if (dir_deleted_data[dirname]):
-        f.write("  {:10d} {:10s} {}\n".format(size,
-                                              datestr(dir_deleted_data[dirname]),
-                                              dirname or ''))
+        f.write("  {:10d} {:10d} {:10s} {}\n"
+                .format(dir_size['unpacked'][dirname],
+                        size,
+                        datestr(dir_deleted_data[dirname]),
+                        dirname or ''))
 
   with open(os.path.join(reportdir, "directories-all-sizes.txt"), 'w') as f:
     f.write("=== All directories by reverse size ===\n")
-    f.write("Format: size (bytes), date deleted, directory name\n")
-    for dirname, size in sorted(dir_size.iteritems(),
+    f.write("Format: unpacked size, packed size, date deleted, directory name\n")
+    for dirname, size in sorted(dir_size['packed'].iteritems(),
                                 key=lambda x:x[1], reverse=True):
-      f.write("  {:10d} {:10s} {}\n".format(size, datestr(dir_deleted_data[dirname]),
-                                            dirname or ''))
+      f.write("  {:10d} {:10d} {:10s} {}\n"
+              .format(dir_size['unpacked'][dirname],
+                      size,
+                      datestr(dir_deleted_data[dirname]),
+                      dirname or ''))
 
   # List extensions in reverse sorted order of unpacked size
   with open(os.path.join(reportdir, "extensions-deleted-sizes.txt"), 'w') as f:
     f.write("=== Deleted extensions by reverse size ===\n")
-    f.write("Format: size (bytes), date deleted, extension name\n")
-    for extname, size in sorted(ext_size.iteritems(),
+    f.write("Format: unpacked size, packed size, date deleted, extension name\n")
+    for extname, size in sorted(ext_size['packed'].iteritems(),
                                 key=lambda x:x[1], reverse=True):
       if (ext_deleted_data[extname]):
-        f.write("  {:10d} {:10s} {}\n".format(size,
-                                              datestr(ext_deleted_data[extname]),
-                                              extname or ''))
+        f.write("  {:10d} {:10d} {:10s} {}\n"
+                .format(ext_size['unpacked'][extname],
+                        size,
+                        datestr(ext_deleted_data[extname]),
+                        extname or ''))
 
   with open(os.path.join(reportdir, "extensions-all-sizes.txt"), 'w') as f:
     f.write("=== All extensions by reverse size ===\n")
-    f.write("Format: size (bytes), date deleted, extension name\n")
-    for extname, size in sorted(ext_size.iteritems(),
+    f.write("Format: unpacked size, packed size, date deleted, extension name\n")
+    for extname, size in sorted(ext_size['packed'].iteritems(),
                                 key=lambda x:x[1], reverse=True):
-      f.write("  {:10d} {:10s} {}\n".format(size,
-                                            datestr(ext_deleted_data[extname]),
-                                            extname or ''))
+      f.write("  {:10d} {:10d} {:10s} {}\n"
+              .format(ext_size['unpacked'][extname],
+                      size,
+                      datestr(ext_deleted_data[extname]),
+                      extname or ''))
 
   # List files in reverse sorted order of unpacked size
   with open(os.path.join(reportdir, "path-deleted-sizes.txt"), 'w') as f:
     f.write("=== Deleted paths by reverse accumulated size ===\n")
-    f.write("Format: size (bytes), date deleted, path name(s)\n")
-    for pathname, size in sorted(path_size.iteritems(),
+    f.write("Format: unpacked size, packed size, date deleted, path name(s)\n")
+    for pathname, size in sorted(path_size['packed'].iteritems(),
                                  key=lambda x:x[1], reverse=True):
       when = args.stats['deletions'].get(pathname, None)
      if when:
-        f.write("  {:10d} {:10s} {}\n".format(size, datestr(when), pathname))
+        f.write("  {:10d} {:10d} {:10s} {}\n"
+                .format(path_size['unpacked'][pathname],
+                        size,
+                        datestr(when),
+                        pathname))
 
   with open(os.path.join(reportdir, "path-all-sizes.txt"), 'w') as f:
     f.write("=== All paths by reverse accumulated size ===\n")
-    f.write("Format: size (bytes), date deleted, pathectory name\n")
-    for pathname, size in sorted(path_size.iteritems(),
-                                 key=lambda x:x[1], reverse=True):
+    f.write("Format: unpacked size, packed size, date deleted, path name(s)\n")
+    for pathname, size in sorted(path_size['packed'].iteritems(),
+                                 key=lambda x:x[1], reverse=True):
       when = args.stats['deletions'].get(pathname, None)
-      f.write("  {:10d} {:10s} {}\n".format(size, datestr(when), pathname))
+      f.write("  {:10d} {:10d} {:10s} {}\n"
+              .format(path_size['unpacked'][pathname],
+                      size,
+                      datestr(when),
+                      pathname))
 
   # List of filenames and sizes in descending order
   with open(os.path.join(reportdir, "blob-shas-and-paths.txt"), 'w') as f:
     f.write("== Files by sha and associated pathnames in reverse size ==\n")
-    f.write("Format: sha, size (bytes), filename(s) object stored as\n")
-    for sha, size in sorted(args.size.iteritems(), key=lambda x:x[1],
-                            reverse=True):
+    f.write("Format: sha, unpacked size, packed size, filename(s) object stored as\n")
+    for sha, size in sorted(args.stats['packed_size'].iteritems(),
+                            key=lambda x:x[1], reverse=True):
       if sha not in args.stats['names']:
         # Some objects in the repository might not be referenced, or not
        # referenced by the branches/tags the user cares about; skip them.
@@ -2083,7 +2136,10 @@ def do_analysis(args, git_dir):
         names_with_sha = names_with_sha.pop()
       else:
         names_with_sha = sorted(list(names_with_sha))
-      f.write("  {} {:9d} {}\n".format(sha, size, names_with_sha))
+      f.write("  {} {:10d} {:10d} {}\n".format(sha,
+                                               args.stats['unpacked_size'][sha],
+                                               size,
+                                               names_with_sha))
 
   # Notify the user where they can find the reports
   print("Reports written to {}".format(reportdir))
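For reference, the `git cat-file` invocation this patch switches to can be exercised on its own.
The following is a minimal standalone sketch (not part of the patch; the function name
blob_sizes and the summary printout are illustrative, and it is written for Python 3 rather
than the script's Python 2) showing how the
'%(objectname) %(objecttype) %(objectsize) %(objectsize:disk)' batch-check format yields both
the unpacked and the on-disk (packed) size for every blob, which is what gather_data() now
records:

  #!/usr/bin/env python3
  # Standalone sketch: collect unpacked and packed sizes for every blob,
  # using the same `git cat-file` invocation the patch adopts.
  # Run from inside a git repository.
  import subprocess

  # %(objectsize) is the raw object size; %(objectsize:disk) is the space the
  # object currently occupies on disk, i.e. after delta-ing and compression.
  BATCH_FMT = ('--batch-check=%(objectname) %(objecttype) '
               '%(objectsize) %(objectsize:disk)')

  def blob_sizes():
    unpacked_size, packed_size = {}, {}
    cmd = ['git', 'cat-file', '--batch-all-objects', BATCH_FMT]
    cf = subprocess.Popen(cmd, stdout=subprocess.PIPE)
    for line in cf.stdout:
      sha, objtype, objsize, objdisksize = line.split()
      if objtype == b'blob':
        unpacked_size[sha] = int(objsize)
        packed_size[sha] = int(objdisksize)
    cf.wait()
    return unpacked_size, packed_size

  if __name__ == '__main__':
    unpacked, packed = blob_sizes()
    print('blobs: {}'.format(len(unpacked)))
    print('total unpacked size (bytes): {}'.format(sum(unpacked.values())))
    print('total packed size (bytes):   {}'.format(sum(packed.values())))

As the new "=== Sizes ===" caveat explains, summing %(objectsize:disk) over blobs can misstate
the space actually reclaimable once deltas and de-duplication are involved; the numbers are
guides, not guarantees.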