mirror of
https://github.com/newren/git-filter-repo.git
synced 2024-07-06 18:32:14 +02:00
filter-repo: add packed sizes to --analyze reports
Signed-off-by: Elijah Newren <newren@gmail.com>
This commit is contained in:
parent
7048be2849
commit
a2540f4087
174
git-filter-repo
174
git-filter-repo
@ -1824,19 +1824,23 @@ def analyze_commit(args, commit):
|
||||
|
||||
def gather_data(args):
|
||||
# Get sizes of blobs by sha1
|
||||
cf = subprocess.Popen('git cat-file --batch-check --batch-all-objects'.split(),
|
||||
a='--batch-check=%(objectname) %(objecttype) %(objectsize) %(objectsize:disk)'
|
||||
cf = subprocess.Popen(['git', 'cat-file', '--batch-all-objects', a],
|
||||
stdout = subprocess.PIPE)
|
||||
size = {}
|
||||
unpacked_size = {}
|
||||
packed_size = {}
|
||||
for line in cf.stdout:
|
||||
sha, objtype, shasize = line.split()
|
||||
shasize = int(shasize)
|
||||
sha, objtype, objsize, objdisksize = line.split()
|
||||
objsize, objdisksize = int(objsize), int(objdisksize)
|
||||
if objtype == 'blob':
|
||||
size[sha] = shasize
|
||||
unpacked_size[sha] = objsize
|
||||
packed_size[sha] = objdisksize
|
||||
stats = {'names': collections.defaultdict(set),
|
||||
'allnames' : set(),
|
||||
'deletions': {},
|
||||
'equivalence': {},
|
||||
'size': size}
|
||||
'unpacked_size': unpacked_size,
|
||||
'packed_size': packed_size}
|
||||
|
||||
# Setup the fast-export process
|
||||
fep_cmd = ['git', 'fast-export',
|
||||
@ -1852,7 +1856,6 @@ def gather_data(args):
|
||||
output = open(os.devnull, 'w')
|
||||
|
||||
# Create and run the filter
|
||||
setattr(args, 'size', size)
|
||||
setattr(args, 'stats', stats)
|
||||
analyze_filter = FastExportFilter(
|
||||
commit_callback = lambda c : analyze_commit(args, c),
|
||||
@ -1890,20 +1893,25 @@ def do_analysis(args, git_dir):
|
||||
if path == '':
|
||||
break
|
||||
|
||||
# Compute aggregate unpacked size information for paths, extensions, and dirs
|
||||
total_size = 0
|
||||
path_size = collections.defaultdict(int)
|
||||
ext_size = collections.defaultdict(int)
|
||||
dir_size = collections.defaultdict(int)
|
||||
# Compute aggregate size information for paths, extensions, and dirs
|
||||
total_size = {'packed': 0, 'unpacked': 0}
|
||||
path_size = {'packed': collections.defaultdict(int),
|
||||
'unpacked': collections.defaultdict(int)}
|
||||
ext_size = {'packed': collections.defaultdict(int),
|
||||
'unpacked': collections.defaultdict(int)}
|
||||
dir_size = {'packed': collections.defaultdict(int),
|
||||
'unpacked': collections.defaultdict(int)}
|
||||
for sha in args.stats['names']:
|
||||
size = args.size[sha]
|
||||
for name in args.stats['names'][sha]:
|
||||
total_size += size
|
||||
path_size[name] += size
|
||||
basename, ext = os.path.splitext(name)
|
||||
ext_size[ext] += size
|
||||
for dirname in dirnames(name):
|
||||
dir_size[dirname] += size
|
||||
size = {'packed': args.stats['packed_size'][sha],
|
||||
'unpacked': args.stats['unpacked_size'][sha]}
|
||||
for which in ('packed', 'unpacked'):
|
||||
for name in args.stats['names'][sha]:
|
||||
total_size[which] += size[which]
|
||||
path_size[which][name] += size[which]
|
||||
basename, ext = os.path.splitext(name)
|
||||
ext_size[which][ext] += size[which]
|
||||
for dirname in dirnames(name):
|
||||
dir_size[which][dirname] += size[which]
|
||||
|
||||
# Determine if and when extensions and directories were deleted
|
||||
ext_deleted_data = {}
|
||||
@ -1935,20 +1943,48 @@ def do_analysis(args, git_dir):
|
||||
# Give a basic overview of this file
|
||||
f.write("== Overal Statistics ==\n")
|
||||
f.write(" Number of commits: {}\n".format(args.num_commits))
|
||||
f.write(" Number of filenames: {}\n".format(len(path_size)))
|
||||
f.write(" Number of directories: {}\n".format(len(dir_size)))
|
||||
f.write(" Number of file extensions: {}\n".format(len(ext_size)))
|
||||
f.write(" Number of filenames: {}\n".format(len(path_size['packed'])))
|
||||
f.write(" Number of directories: {}\n".format(len(dir_size['packed'])))
|
||||
f.write(" Number of file extensions: {}\n".format(len(ext_size['packed'])))
|
||||
f.write("\n")
|
||||
f.write(" Total unpacked size: {}\n".format(total_size))
|
||||
f.write("\n")
|
||||
f.write(" (Unpacked size represents what size your repository would be\n")
|
||||
f.write(" if no trees, commits, tags, or other metadata were included\n")
|
||||
f.write(" AND if no files were packed; i.e., without delta-ing and\n")
|
||||
f.write(" without compression.)\n")
|
||||
f.write(" Total unpacked size (bytes): {:10d}\n"
|
||||
.format(total_size['unpacked']))
|
||||
f.write(" Total packed size (bytes): {:10d}\n"
|
||||
.format(total_size['packed']))
|
||||
f.write("\n")
|
||||
|
||||
# Mention issues with the report
|
||||
f.write("== Caveats ==\n")
|
||||
f.write("=== Sizes ===\n")
|
||||
f.write(textwrap.dedent("""
|
||||
Packed size represents what size your repository would be if no
|
||||
trees, commits, tags, or other metadata were included (though it may
|
||||
fail to represent de-duplication; see below). It also represents the
|
||||
current packing, which may be suboptimal if you haven't gc'ed for a
|
||||
while.
|
||||
|
||||
Unpacked size represents what size your repository would be if
|
||||
no trees, commits, tags, or other metadata were included AND if no
|
||||
files were packed; i.e., without delta-ing or compression.
|
||||
|
||||
Both unpacked and packed sizes can be slightly misleading. Deleting
|
||||
a blob from history will not save as much space as the unpacked size,
|
||||
because it is obviously normally stored in packed form. Also,
|
||||
deleting a blob from history may not save as much space as its packed
|
||||
size either, because another blob could be stored as a delta against
|
||||
that blob, so when you remove one blob another blob's packed size may
|
||||
grow.
|
||||
|
||||
Also, the sum of the packed sizes can add up to more than the
|
||||
repository size; if the same contents appeared in the repository in
|
||||
multiple places, git will automatically de-dupe and store only one
|
||||
copy, while the way sizes are added in this analysis adds the size
|
||||
for each file path that has those contents. Further, if a file is
|
||||
ever reverted to a previous version's contents, the previous
|
||||
version's size will be counted multiple times in this analysis, even
|
||||
though git will only store it once.
|
||||
"""[1:]))
|
||||
f.write("\n")
|
||||
f.write("=== Deletions ===\n")
|
||||
f.write(textwrap.dedent("""
|
||||
Whether a file is deleted is not a binary quality, since it can be
|
||||
@ -2014,66 +2050,83 @@ def do_analysis(args, git_dir):
|
||||
# List directories in reverse sorted order of unpacked size
|
||||
with open(os.path.join(reportdir, "directories-deleted-sizes.txt"), 'w') as f:
|
||||
f.write("=== Deleted directories by reverse size ===\n")
|
||||
f.write("Format: size (bytes), date deleted, directory name\n")
|
||||
for dirname, size in sorted(dir_size.iteritems(),
|
||||
f.write("Format: unpacked size, packed size, date deleted, directory name\n")
|
||||
for dirname, size in sorted(dir_size['packed'].iteritems(),
|
||||
key=lambda x:x[1], reverse=True):
|
||||
if (dir_deleted_data[dirname]):
|
||||
f.write(" {:10d} {:10s} {}\n".format(size,
|
||||
datestr(dir_deleted_data[dirname]),
|
||||
dirname or '<toplevel>'))
|
||||
f.write(" {:10d} {:10d} {:10s} {}\n"
|
||||
.format(dir_size['unpacked'][dirname],
|
||||
size,
|
||||
datestr(dir_deleted_data[dirname]),
|
||||
dirname or '<toplevel>'))
|
||||
|
||||
with open(os.path.join(reportdir, "directories-all-sizes.txt"), 'w') as f:
|
||||
f.write("=== All directories by reverse size ===\n")
|
||||
f.write("Format: size (bytes), date deleted, directory name\n")
|
||||
for dirname, size in sorted(dir_size.iteritems(),
|
||||
f.write("Format: unpacked size, packed size, date deleted, directory name\n")
|
||||
for dirname, size in sorted(dir_size['packed'].iteritems(),
|
||||
key=lambda x:x[1], reverse=True):
|
||||
f.write(" {:10d} {:10s} {}\n".format(size, datestr(dir_deleted_data[dirname]),
|
||||
dirname or '<toplevel>'))
|
||||
f.write(" {:10d} {:10d} {:10s} {}\n"
|
||||
.format(dir_size['unpacked'][dirname],
|
||||
size,
|
||||
datestr(dir_deleted_data[dirname]),
|
||||
dirname or '<toplevel>'))
|
||||
|
||||
# List extensions in reverse sorted order of unpacked size
|
||||
with open(os.path.join(reportdir, "extensions-deleted-sizes.txt"), 'w') as f:
|
||||
f.write("=== Deleted extensions by reverse size ===\n")
|
||||
f.write("Format: size (bytes), date deleted, extension name\n")
|
||||
for extname, size in sorted(ext_size.iteritems(),
|
||||
f.write("Format: unpacked size, packed size, date deleted, extension name\n")
|
||||
for extname, size in sorted(ext_size['packed'].iteritems(),
|
||||
key=lambda x:x[1], reverse=True):
|
||||
if (ext_deleted_data[extname]):
|
||||
f.write(" {:10d} {:10s} {}\n".format(size,
|
||||
datestr(ext_deleted_data[extname]),
|
||||
extname or '<no extension>'))
|
||||
f.write(" {:10d} {:10d} {:10s} {}\n"
|
||||
.format(ext_size['unpacked'][extname],
|
||||
size,
|
||||
datestr(ext_deleted_data[extname]),
|
||||
extname or '<no extension>'))
|
||||
|
||||
with open(os.path.join(reportdir, "extensions-all-sizes.txt"), 'w') as f:
|
||||
f.write("=== All extensions by reverse size ===\n")
|
||||
f.write("Format: size (bytes), date deleted, extension name\n")
|
||||
for extname, size in sorted(ext_size.iteritems(),
|
||||
f.write("Format: unpacked size, packed size, date deleted, extension name\n")
|
||||
for extname, size in sorted(ext_size['packed'].iteritems(),
|
||||
key=lambda x:x[1], reverse=True):
|
||||
f.write(" {:10d} {:10s} {}\n".format(size,
|
||||
datestr(ext_deleted_data[extname]),
|
||||
extname or '<no extension>'))
|
||||
f.write(" {:10d} {:10d} {:10s} {}\n"
|
||||
.format(ext_size['unpacked'][extname],
|
||||
size,
|
||||
datestr(ext_deleted_data[extname]),
|
||||
extname or '<no extension>'))
|
||||
|
||||
# List files in reverse sorted order of unpacked size
|
||||
with open(os.path.join(reportdir, "path-deleted-sizes.txt"), 'w') as f:
|
||||
f.write("=== Deleted paths by reverse accumulated size ===\n")
|
||||
f.write("Format: size (bytes), date deleted, path name(s)\n")
|
||||
for pathname, size in sorted(path_size.iteritems(),
|
||||
f.write("Format: unpacked size, packed size, date deleted, path name(s)\n")
|
||||
for pathname, size in sorted(path_size['packed'].iteritems(),
|
||||
key=lambda x:x[1], reverse=True):
|
||||
when = args.stats['deletions'].get(pathname, None)
|
||||
if when:
|
||||
f.write(" {:10d} {:10s} {}\n".format(size, datestr(when), pathname))
|
||||
f.write(" {:10d} {:10d} {:10s} {}\n"
|
||||
.format(path_size['unpacked'][pathname],
|
||||
size,
|
||||
datestr(when),
|
||||
pathname))
|
||||
|
||||
with open(os.path.join(reportdir, "path-all-sizes.txt"), 'w') as f:
|
||||
f.write("=== All paths by reverse accumulated size ===\n")
|
||||
f.write("Format: size (bytes), date deleted, pathectory name\n")
|
||||
for pathname, size in sorted(path_size.iteritems(),
|
||||
key=lambda x:x[1], reverse=True):
|
||||
f.write("Format: unpacked size, packed size, date deleted, pathectory name\n")
|
||||
for pathname, size in sorted(path_size['packed'].iteritems(),
|
||||
key=lambda x:x[1], reverse=True):
|
||||
when = args.stats['deletions'].get(pathname, None)
|
||||
f.write(" {:10d} {:10s} {}\n".format(size, datestr(when), pathname))
|
||||
f.write(" {:10d} {:10d} {:10s} {}\n"
|
||||
.format(path_size['unpacked'][pathname],
|
||||
size,
|
||||
datestr(when),
|
||||
pathname))
|
||||
|
||||
# List of filenames and sizes in descending order
|
||||
with open(os.path.join(reportdir, "blob-shas-and-paths.txt"), 'w') as f:
|
||||
f.write("== Files by sha and associated pathnames in reverse size ==\n")
|
||||
f.write("Format: sha, size (bytes), filename(s) object stored as\n")
|
||||
for sha, size in sorted(args.size.iteritems(), key=lambda x:x[1],
|
||||
reverse=True):
|
||||
f.write("Format: sha, unpacked size, packed size, filename(s) object stored as\n")
|
||||
for sha, size in sorted(args.stats['packed_size'].iteritems(),
|
||||
key=lambda x:x[1], reverse=True):
|
||||
if sha not in args.stats['names']:
|
||||
# Some objects in the repository might not be referenced, or not
|
||||
# referenced by the branches/tags the user cares about; skip them.
|
||||
@ -2083,7 +2136,10 @@ def do_analysis(args, git_dir):
|
||||
names_with_sha = names_with_sha.pop()
|
||||
else:
|
||||
names_with_sha = sorted(list(names_with_sha))
|
||||
f.write(" {} {:9d} {}\n".format(sha, size, names_with_sha))
|
||||
f.write(" {} {:10d} {:10d} {}\n".format(sha,
|
||||
args.stats['unpacked_size'][sha],
|
||||
size,
|
||||
names_with_sha))
|
||||
|
||||
# Notify the user where they can find the reports
|
||||
print("Reports written to {}".format(reportdir))
|
||||
|
Loading…
Reference in New Issue
Block a user