From 4e2110136ed4a7b145bb6d8d6e1f16f1df07f05a Mon Sep 17 00:00:00 2001
From: Elijah Newren <newren@gmail.com>
Date: Tue, 25 Dec 2018 21:54:16 -0800
Subject: [PATCH] filter-repo: group repo analysis functions into a class

Signed-off-by: Elijah Newren <newren@gmail.com>
---
 git-filter-repo | 734 +++++++++++++++++++++++++-----------------------
 1 file changed, 376 insertions(+), 358 deletions(-)

diff --git a/git-filter-repo b/git-filter-repo
index 0d47588..e0f88b6 100755
--- a/git-filter-repo
+++ b/git-filter-repo
@@ -1891,10 +1891,15 @@ class FilteringOptions(object):
     FilteringOptions.sanity_check_args(args)
     return args
 
-def analyze_commit(stats, graph, commit, parents, date, file_changes):
-  def equiv_class(filename):
+class RepoAnalyze(object):
+
+  # First, several helper functions for analyze_commit()
+
+  @staticmethod
+  def equiv_class(stats, filename):
     return stats['equivalence'].get(filename, (filename,))
 
+  @staticmethod
   def setup_equivalence_for_rename(stats, oldname, newname):
     # if A is renamed to B and B is renamed to C, then the user thinks of
     # A, B, and C as all being different names for the same 'file'. We record
@@ -1911,18 +1916,22 @@ def analyze_commit(stats, graph, commit, parents, date, file_changes):
     for f in new_tuple:
       stats['equivalence'][f] = new_tuple
 
+  @staticmethod
   def setup_or_update_rename_history(stats, commit, oldname, newname):
     rename_commits = stats['rename_history'].get(oldname, set())
     rename_commits.add(commit)
     stats['rename_history'][oldname] = rename_commits
 
+  @staticmethod
   def handle_renames(stats, commit, change_types, filenames):
     for index, change_type in enumerate(change_types):
       if change_type == 'R':
         oldname, newname = filenames[index], filenames[-1]
-        setup_equivalence_for_rename(stats, oldname, newname)
-        setup_or_update_rename_history(stats, commit, oldname, newname)
+        RepoAnalyze.setup_equivalence_for_rename(stats, oldname, newname)
+        RepoAnalyze.setup_or_update_rename_history(stats, commit,
+                                                   oldname, newname)
 
+  @staticmethod
   def handle_file(stats, graph, commit, modes, shas, filenames):
     mode, sha, filename = modes[-1], shas[-1], filenames[-1]
 
@@ -1936,7 +1945,7 @@ def analyze_commit(stats, graph, commit, parents, date, file_changes):
 
     # If the file (or equivalence class of files) was recorded as deleted,
     # clearly it isn't anymore
-    equiv = equiv_class(filename)
+    equiv = RepoAnalyze.equiv_class(stats, filename)
     for f in equiv:
       stats[delmode].pop(f, None)
 
@@ -1954,383 +1963,392 @@ def analyze_commit(stats, graph, commit, parents, date, file_changes):
       if f in stats['equivalence']:
         del stats['equivalence'][f]
 
-  graph.add_commit_and_parents(commit, parents)
-  for change in file_changes:
-    modes, shas, change_types, filenames = change
-    if len(parents) == 1 and change_types.startswith('R'):
-      change_types = 'R' # remove the rename score; we don't care
-    if modes[-1] == '160000':
-      continue
-    elif modes[-1] == '000000':
-      # Track when files/directories are deleted; see 'R' below about equiv_class
-      for f in equiv_class(filenames[-1]):
-        if any(x == '040000' for x in modes[0:-1]):
-          stats['tree_deletions'][f] = date
-        else:
-          stats['file_deletions'][f] = date
-    elif change_types.strip('AMT') == '':
-      handle_file(stats, graph, commit, modes, shas, filenames)
-    elif modes[-1] == '040000' and change_types.strip('RAM') == '':
-      handle_file(stats, graph, commit, modes, shas, filenames)
-    elif change_types.strip('RAM') == '':
-      handle_file(stats, graph, commit, modes, shas, filenames)
-      handle_renames(stats, commit, change_types, filenames)
-    else:
-      raise SystemExit("Unhandled change type(s): {} (in commit {})"
-                       .format(change_types, commit))
-
-def gather_data(args):
-  blob_size_progress = ProgressWriter()
-  num_blobs = 0
-
-  # Get sizes of blobs by sha1
-  a='--batch-check=%(objectname) %(objecttype) %(objectsize) %(objectsize:disk)'
-  cf = subprocess.Popen(['git', 'cat-file', '--batch-all-objects', a],
-                        bufsize = -1,
-                        stdout = subprocess.PIPE)
-  unpacked_size = {}
-  packed_size = {}
-  for line in cf.stdout:
-    sha, objtype, objsize, objdisksize = line.split()
-    objsize, objdisksize = int(objsize), int(objdisksize)
-    if objtype == 'blob':
-      unpacked_size[sha] = objsize
-      packed_size[sha] = objdisksize
-    num_blobs += 1
-    blob_size_progress.show("Processed {} blob sizes".format(num_blobs))
-  cf.wait()
-  blob_size_progress.finish()
-  stats = {'names': collections.defaultdict(set),
-           'allnames' : set(),
-           'file_deletions': {},
-           'tree_deletions': {},
-           'equivalence': {},
-           'rename_history': collections.defaultdict(set),
-           'unpacked_size': unpacked_size,
-           'packed_size': packed_size,
-           'num_commits': 0}
-
-  # Setup the rev-list/diff-tree process
-  commit_parse_progress = ProgressWriter()
-  num_commits = 0
-  cmd = 'git rev-list --topo-order --reverse {} | git diff-tree --stdin --always --root --format="%H%n%P%n%cd" --date=short -M -t -c --raw --combined-all-paths'.format(' '.join(args.refs))
-  dtp = subprocess.Popen(cmd, shell=True, bufsize=-1, stdout=subprocess.PIPE)
-  f = dtp.stdout
-  line = f.next()
-  cont = bool(line)
-  graph = AncestryGraph()
-  while cont:
-    commit = line.rstrip()
-    parents = f.next().split()
-    date = f.next().rstrip()
-
-    # We expect a blank line next; if we get a non-blank line then
-    # this commit modified no files and we need to move on to the next.
-    # If there is no line, we've reached end-of-input.
-    try:
-      line = f.next().rstrip()
-      cont = True
-    except StopIteration:
-      cont = False
-
-    # If we haven't reached end of input, and we got a blank line meaning
-    # a commit that has modified files, then get the file changes associated
-    # with this commit.
-    file_changes = []
-    if cont and not line:
-      cont = False
-      for line in f:
-        if not line.startswith(':'):
-          cont = True
-          break
-        n = 1+max(1, len(parents))
-        assert line.startswith(':'*(n-1))
-        relevant = line[n-1:-1]
-        splits = relevant.split(None, n)
-        modes = splits[0:n]
-        splits = splits[n].split(None, n)
-        shas = splits[0:n]
-        splits = splits[n].split('\t')
-        change_types = splits[0]
-        filenames = [PathQuoting.dequote(x) for x in splits[1:]]
-        file_changes.append([modes, shas, change_types, filenames])
-
-    # Analyze this commit and update progress
-    analyze_commit(stats, graph, commit, parents, date, file_changes)
-    num_commits += 1
-    commit_parse_progress.show("Processed {} commits".format(num_commits))
-
-  # Show the final commits processed message and record the number of commits
-  commit_parse_progress.finish()
-  stats['num_commits'] = num_commits
-
-  # Close the output, ensure rev-list|diff-tree pipeline completed successfully
-  dtp.stdout.close()
-  if dtp.wait():
-    raise SystemExit("Error: rev-list|diff-tree pipeline failed; see above.")
-
-  return stats
-
-def write_report(reportdir, stats):
-  def datestr(datetimestr):
-    return datetimestr if datetimestr else '<present>'
-
-  def dirnames(path):
-    while True:
-      path = os.path.dirname(path)
-      yield path
-      if path == '':
-        break
-
-  # Compute aggregate size information for paths, extensions, and dirs
-  total_size = {'packed': 0, 'unpacked': 0}
-  path_size = {'packed': collections.defaultdict(int),
-               'unpacked': collections.defaultdict(int)}
-  ext_size = {'packed': collections.defaultdict(int),
-              'unpacked': collections.defaultdict(int)}
-  dir_size = {'packed': collections.defaultdict(int),
-              'unpacked': collections.defaultdict(int)}
-  for sha in stats['names']:
-    size = {'packed': stats['packed_size'][sha],
-            'unpacked': stats['unpacked_size'][sha]}
-    for which in ('packed', 'unpacked'):
-      for name in stats['names'][sha]:
-        total_size[which] += size[which]
-        path_size[which][name] += size[which]
-        basename, ext = os.path.splitext(name)
-        ext_size[which][ext] += size[which]
-        for dirname in dirnames(name):
-          dir_size[which][dirname] += size[which]
-
-  # Determine if and when extensions and directories were deleted
-  ext_deleted_data = {}
-  for name in stats['allnames']:
-    when = stats['file_deletions'].get(name, None)
-
-    # Update the extension
-    basename, ext = os.path.splitext(name)
-    if when is None:
-      ext_deleted_data[ext] = None
-    elif ext in ext_deleted_data:
-      if ext_deleted_data[ext] is not None:
-        ext_deleted_data[ext] = max(ext_deleted_data[ext], when)
-    else:
-      ext_deleted_data[ext] = when
-
-  dir_deleted_data = {}
-  for name in dir_size['packed']:
-    dir_deleted_data[name] = stats['tree_deletions'].get(name, None)
-
-  with open(os.path.join(reportdir, "README"), 'w') as f:
-    # Give a basic overview of this file
-    f.write("== Overal Statistics ==\n")
-    f.write("  Number of commits: {}\n".format(stats['num_commits']))
-    f.write("  Number of filenames: {}\n".format(len(path_size['packed'])))
-    f.write("  Number of directories: {}\n".format(len(dir_size['packed'])))
-    f.write("  Number of file extensions: {}\n".format(len(ext_size['packed'])))
-    f.write("\n")
-    f.write("  Total unpacked size (bytes): {:10d}\n"
-            .format(total_size['unpacked']))
-    f.write("  Total packed size (bytes): {:10d}\n"
-            .format(total_size['packed']))
-    f.write("\n")
-
-    # Mention issues with the report
-    f.write("== Caveats ==\n")
-    f.write("=== Sizes ===\n")
-    f.write(textwrap.dedent("""
-      Packed size represents what size your repository would be if no
-      trees, commits, tags, or other metadata were included (though it may
-      fail to represent de-duplication; see below). It also represents the
-      current packing, which may be suboptimal if you haven't gc'ed for a
-      while.
-
-      Unpacked size represents what size your repository would be if no if
-      no trees, commits, tags, or other metadata were included AND if no
-      files were packed; i.e., without delta-ing or compression.
-
-      Both unpacked and packed sizes can be slightly misleading. Deleting
-      a blob from history not save as much space as the unpacked size,
-      because it is obviously normally stored in packed form. Also,
-      deleting a blob from history may not save as much space as its packed
-      size either, because another blob could be stored as a delta against
-      that blob, so when you remove one blob another blob's packed size may
-      grow.
-
-      Also, the sum of the packed sizes can add up to more than the
-      repository size; if the same contents appeared in the repository in
-      multiple places, git will automatically de-dupe and store only one
-      copy, while the way sizes are added in this analysis adds the size
-      for each file path that has those contents. Further, if a file is
-      ever reverted to a previous version's contents, the previous
-      version's size will be counted multiple times in this analysis, even
-      though git will only store it once.
-      """[1:]))
-    f.write("\n")
-    f.write("=== Deletions ===\n")
-    f.write(textwrap.dedent("""
-      Whether a file is deleted is not a binary quality, since it can be
-      deleted on some branches but still exist in others. Also, it might
-      exist in an old tag, but have been deleted in versions newer than
-      that. More thorough tracking could be done, including looking at
-      merge commits where one side of history deleted and the other modified,
-      in order to give a more holistic picture of deletions. However, that
-      algorithm would not only be more complex to implement, it'd also be
-      quite difficult to present and interpret by users. Since --analyze
-      is just about getting a high-level rough picture of history, it instead
-      implements the simplistic rule that is good enough for 98% of cases:
-        A file is marked as deleted if the last commit in the fast-export
-        stream that mentions the file lists it as deleted.
-      This makes it dependent on topological ordering, but generally gives
-      the "right" answer.
-      """[1:]))
-    f.write("\n")
-    f.write("=== Renames ===\n")
-    f.write(textwrap.dedent("""
-      Renames share the same non-binary nature that deletions do, plus
-      additional challenges:
-        * If the renamed file is renamed again, instead of just two names for
-          a path you can have three or more.
-        * Rename pairs of the form (oldname, newname) that we consider to be
-          different names of the "same file" might only be valid over certain
-          commit ranges. For example, if a new commit reintroduces a file
-          named oldname, then new versions of oldname aren't the "same file"
-          anymore. We could try to portray this to the user, but it's easier
-          for the user to just break the pairing and only report unbroken
-          rename pairings to the user.
-        * The ability for users to rename files differently in different
-          branches means that our chains of renames will not necessarily be
-          linear but may branch out.
-      """[1:]))
-    f.write("\n")
-
-  # Equivalence classes for names, so if folks only want to keep a
-  # certain set of paths, they know the old names they want to include
-  # too.
-  with open(os.path.join(reportdir, "renames.txt"), 'w') as f:
-    seen = set()
-    for pathname,equiv_group in sorted(stats['equivalence'].iteritems(),
-                                       key=lambda x:x[1]):
-      if equiv_group in seen:
+  @staticmethod
+  def analyze_commit(stats, graph, commit, parents, date, file_changes):
+    graph.add_commit_and_parents(commit, parents)
+    for change in file_changes:
+      modes, shas, change_types, filenames = change
+      if len(parents) == 1 and change_types.startswith('R'):
+        change_types = 'R' # remove the rename score; we don't care
+      if modes[-1] == '160000':
         continue
-      seen.add(equiv_group)
-      f.write("{} ->\n    ".format(equiv_group[0]) +
-              "\n    ".join(equiv_group[1:]) +
-              "\n")
+      elif modes[-1] == '000000':
+        # Track when files/directories are deleted
+        for f in RepoAnalyze.equiv_class(stats, filenames[-1]):
+          if any(x == '040000' for x in modes[0:-1]):
+            stats['tree_deletions'][f] = date
+          else:
+            stats['file_deletions'][f] = date
+      elif change_types.strip('AMT') == '':
+        RepoAnalyze.handle_file(stats, graph, commit, modes, shas, filenames)
+      elif modes[-1] == '040000' and change_types.strip('RAM') == '':
+        RepoAnalyze.handle_file(stats, graph, commit, modes, shas, filenames)
+      elif change_types.strip('RAM') == '':
+        RepoAnalyze.handle_file(stats, graph, commit, modes, shas, filenames)
+        RepoAnalyze.handle_renames(stats, commit, change_types, filenames)
+      else:
+        raise SystemExit("Unhandled change type(s): {} (in commit {})"
+                         .format(change_types, commit))
 
-  # List directories in reverse sorted order of unpacked size
-  with open(os.path.join(reportdir, "directories-deleted-sizes.txt"), 'w') as f:
-    f.write("=== Deleted directories by reverse size ===\n")
-    f.write("Format: unpacked size, packed size, date deleted, directory name\n")
-    for dirname, size in sorted(dir_size['packed'].iteritems(),
-                                key=lambda x:x[1], reverse=True):
-      if (dir_deleted_data[dirname]):
+  @staticmethod
+  def gather_data(args):
+    blob_size_progress = ProgressWriter()
+    num_blobs = 0
+
+    # Get sizes of blobs by sha1
+    cmd = '--batch-check=%(objectname) %(objecttype) ' + \
+          '%(objectsize) %(objectsize:disk)'
+    cf = subprocess.Popen(['git', 'cat-file', '--batch-all-objects', cmd],
+                          bufsize = -1,
+                          stdout = subprocess.PIPE)
+    unpacked_size = {}
+    packed_size = {}
+    for line in cf.stdout:
+      sha, objtype, objsize, objdisksize = line.split()
+      objsize, objdisksize = int(objsize), int(objdisksize)
+      if objtype == 'blob':
+        unpacked_size[sha] = objsize
+        packed_size[sha] = objdisksize
+      num_blobs += 1
+      blob_size_progress.show("Processed {} blob sizes".format(num_blobs))
+    cf.wait()
+    blob_size_progress.finish()
+    stats = {'names': collections.defaultdict(set),
+             'allnames' : set(),
+             'file_deletions': {},
+             'tree_deletions': {},
+             'equivalence': {},
+             'rename_history': collections.defaultdict(set),
+             'unpacked_size': unpacked_size,
+             'packed_size': packed_size,
+             'num_commits': 0}
+
+    # Setup the rev-list/diff-tree process
+    commit_parse_progress = ProgressWriter()
+    num_commits = 0
+    cmd = ('git rev-list --topo-order --reverse {}'.format(' '.join(args.refs))
+           + ' | git diff-tree --stdin --always --root --format=%H%n%P%n%cd'
+           + ' --date=short -M -t -c --raw --combined-all-paths')
+    dtp = subprocess.Popen(cmd, shell=True, bufsize=-1, stdout=subprocess.PIPE)
+    f = dtp.stdout
+    line = f.next()
+    cont = bool(line)
+    graph = AncestryGraph()
+    while cont:
+      commit = line.rstrip()
+      parents = f.next().split()
+      date = f.next().rstrip()
+
+      # We expect a blank line next; if we get a non-blank line then
+      # this commit modified no files and we need to move on to the next.
+      # If there is no line, we've reached end-of-input.
+      try:
+        line = f.next().rstrip()
+        cont = True
+      except StopIteration:
+        cont = False
+
+      # If we haven't reached end of input, and we got a blank line meaning
+      # a commit that has modified files, then get the file changes associated
+      # with this commit.
+      file_changes = []
+      if cont and not line:
+        cont = False
+        for line in f:
+          if not line.startswith(':'):
+            cont = True
+            break
+          n = 1+max(1, len(parents))
+          assert line.startswith(':'*(n-1))
+          relevant = line[n-1:-1]
+          splits = relevant.split(None, n)
+          modes = splits[0:n]
+          splits = splits[n].split(None, n)
+          shas = splits[0:n]
+          splits = splits[n].split('\t')
+          change_types = splits[0]
+          filenames = [PathQuoting.dequote(x) for x in splits[1:]]
+          file_changes.append([modes, shas, change_types, filenames])
+
+      # Analyze this commit and update progress
+      RepoAnalyze.analyze_commit(stats, graph, commit, parents, date,
+                                 file_changes)
+      num_commits += 1
+      commit_parse_progress.show("Processed {} commits".format(num_commits))
+
+    # Show the final commits processed message and record the number of commits
+    commit_parse_progress.finish()
+    stats['num_commits'] = num_commits
+
+    # Close the output, ensure rev-list|diff-tree pipeline completed successfully
+    dtp.stdout.close()
+    if dtp.wait():
+      raise SystemExit("Error: rev-list|diff-tree pipeline failed; see above.")
+
+    return stats
+
+  @staticmethod
+  def write_report(reportdir, stats):
+    def datestr(datetimestr):
+      return datetimestr if datetimestr else '<present>'
+
+    def dirnames(path):
+      while True:
+        path = os.path.dirname(path)
+        yield path
+        if path == '':
+          break
+
+    # Compute aggregate size information for paths, extensions, and dirs
+    total_size = {'packed': 0, 'unpacked': 0}
+    path_size = {'packed': collections.defaultdict(int),
+                 'unpacked': collections.defaultdict(int)}
+    ext_size = {'packed': collections.defaultdict(int),
+                'unpacked': collections.defaultdict(int)}
+    dir_size = {'packed': collections.defaultdict(int),
+                'unpacked': collections.defaultdict(int)}
+    for sha in stats['names']:
+      size = {'packed': stats['packed_size'][sha],
+              'unpacked': stats['unpacked_size'][sha]}
+      for which in ('packed', 'unpacked'):
+        for name in stats['names'][sha]:
+          total_size[which] += size[which]
+          path_size[which][name] += size[which]
+          basename, ext = os.path.splitext(name)
+          ext_size[which][ext] += size[which]
+          for dirname in dirnames(name):
+            dir_size[which][dirname] += size[which]
+
+    # Determine if and when extensions and directories were deleted
+    ext_deleted_data = {}
+    for name in stats['allnames']:
+      when = stats['file_deletions'].get(name, None)
+
+      # Update the extension
+      basename, ext = os.path.splitext(name)
+      if when is None:
+        ext_deleted_data[ext] = None
+      elif ext in ext_deleted_data:
+        if ext_deleted_data[ext] is not None:
+          ext_deleted_data[ext] = max(ext_deleted_data[ext], when)
+      else:
+        ext_deleted_data[ext] = when
+
+    dir_deleted_data = {}
+    for name in dir_size['packed']:
+      dir_deleted_data[name] = stats['tree_deletions'].get(name, None)
+
+    with open(os.path.join(reportdir, "README"), 'w') as f:
+      # Give a basic overview of this file
+      f.write("== Overall Statistics ==\n")
+      f.write("  Number of commits: {}\n".format(stats['num_commits']))
+      f.write("  Number of filenames: {}\n".format(len(path_size['packed'])))
+      f.write("  Number of directories: {}\n".format(len(dir_size['packed'])))
+      f.write("  Number of file extensions: {}\n".format(len(ext_size['packed'])))
+      f.write("\n")
+      f.write("  Total unpacked size (bytes): {:10d}\n"
+              .format(total_size['unpacked']))
+      f.write("  Total packed size (bytes): {:10d}\n"
+              .format(total_size['packed']))
+      f.write("\n")
+
+      # Mention issues with the report
+      f.write("== Caveats ==\n")
+      f.write("=== Sizes ===\n")
+      f.write(textwrap.dedent("""
+        Packed size represents what size your repository would be if no
+        trees, commits, tags, or other metadata were included (though it may
+        fail to represent de-duplication; see below). It also represents the
+        current packing, which may be suboptimal if you haven't gc'ed for a
+        while.
+
+        Unpacked size represents what size your repository would be if no
+        trees, commits, tags, or other metadata were included AND if no
+        files were packed; i.e., without delta-ing or compression.
+
+        Both unpacked and packed sizes can be slightly misleading. Deleting
+        a blob from history will not save as much space as the unpacked size,
+        because it is obviously normally stored in packed form. Also,
+        deleting a blob from history may not save as much space as its packed
+        size either, because another blob could be stored as a delta against
+        that blob, so when you remove one blob another blob's packed size may
+        grow.
+
+        Also, the sum of the packed sizes can add up to more than the
+        repository size; if the same contents appeared in the repository in
+        multiple places, git will automatically de-dupe and store only one
+        copy, while the way sizes are added in this analysis adds the size
+        for each file path that has those contents. Further, if a file is
+        ever reverted to a previous version's contents, the previous
+        version's size will be counted multiple times in this analysis, even
+        though git will only store it once.
+        """[1:]))
+      f.write("\n")
+      f.write("=== Deletions ===\n")
+      f.write(textwrap.dedent("""
+        Whether a file is deleted is not a binary quality, since it can be
+        deleted on some branches but still exist in others. Also, it might
+        exist in an old tag, but have been deleted in versions newer than
+        that. More thorough tracking could be done, including looking at
+        merge commits where one side of history deleted and the other modified,
+        in order to give a more holistic picture of deletions. However, that
+        algorithm would not only be more complex to implement, it'd also be
+        quite difficult to present and interpret by users. Since --analyze
+        is just about getting a high-level rough picture of history, it instead
+        implements the simplistic rule that is good enough for 98% of cases:
+          A file is marked as deleted if the last commit in the fast-export
+          stream that mentions the file lists it as deleted.
+        This makes it dependent on topological ordering, but generally gives
+        the "right" answer.
+        """[1:]))
+      f.write("\n")
+      f.write("=== Renames ===\n")
+      f.write(textwrap.dedent("""
+        Renames share the same non-binary nature that deletions do, plus
+        additional challenges:
+          * If the renamed file is renamed again, instead of just two names for
+            a path you can have three or more.
+          * Rename pairs of the form (oldname, newname) that we consider to be
+            different names of the "same file" might only be valid over certain
+            commit ranges. For example, if a new commit reintroduces a file
+            named oldname, then new versions of oldname aren't the "same file"
+            anymore. We could try to portray this to the user, but it's easier
+            for the user to just break the pairing and only report unbroken
+            rename pairings to the user.
+          * The ability for users to rename files differently in different
+            branches means that our chains of renames will not necessarily be
+            linear but may branch out.
+        """[1:]))
+      f.write("\n")
+
+    # Equivalence classes for names, so if folks only want to keep a
+    # certain set of paths, they know the old names they want to include
+    # too.
+    with open(os.path.join(reportdir, "renames.txt"), 'w') as f:
+      seen = set()
+      for pathname,equiv_group in sorted(stats['equivalence'].iteritems(),
+                                         key=lambda x:x[1]):
+        if equiv_group in seen:
+          continue
+        seen.add(equiv_group)
+        f.write("{} ->\n    ".format(equiv_group[0]) +
+                "\n    ".join(equiv_group[1:]) +
+                "\n")
+
+    # List directories in reverse sorted order of unpacked size
+    with open(os.path.join(reportdir, "directories-deleted-sizes.txt"), 'w') as f:
+      f.write("=== Deleted directories by reverse size ===\n")
+      f.write("Format: unpacked size, packed size, date deleted, directory name\n")
+      for dirname, size in sorted(dir_size['packed'].iteritems(),
+                                  key=lambda x:x[1], reverse=True):
+        if (dir_deleted_data[dirname]):
+          f.write("  {:10d} {:10d} {:10s} {}\n"
+                  .format(dir_size['unpacked'][dirname],
+                          size,
+                          datestr(dir_deleted_data[dirname]),
+                          dirname or '<toplevel>'))
+
+    with open(os.path.join(reportdir, "directories-all-sizes.txt"), 'w') as f:
+      f.write("=== All directories by reverse size ===\n")
+      f.write("Format: unpacked size, packed size, date deleted, directory name\n")
+      for dirname, size in sorted(dir_size['packed'].iteritems(),
+                                  key=lambda x:x[1], reverse=True):
         f.write("  {:10d} {:10d} {:10s} {}\n"
                 .format(dir_size['unpacked'][dirname],
                         size,
                         datestr(dir_deleted_data[dirname]),
                         dirname or '<toplevel>'))
 
-  with open(os.path.join(reportdir, "directories-all-sizes.txt"), 'w') as f:
-    f.write("=== All directories by reverse size ===\n")
-    f.write("Format: unpacked size, packed size, date deleted, directory name\n")
-    for dirname, size in sorted(dir_size['packed'].iteritems(),
-                                key=lambda x:x[1], reverse=True):
-      f.write("  {:10d} {:10d} {:10s} {}\n"
-              .format(dir_size['unpacked'][dirname],
-                      size,
-                      datestr(dir_deleted_data[dirname]),
-                      dirname or '<toplevel>'))
+    # List extensions in reverse sorted order of unpacked size
+    with open(os.path.join(reportdir, "extensions-deleted-sizes.txt"), 'w') as f:
+      f.write("=== Deleted extensions by reverse size ===\n")
+      f.write("Format: unpacked size, packed size, date deleted, extension name\n")
+      for extname, size in sorted(ext_size['packed'].iteritems(),
+                                  key=lambda x:x[1], reverse=True):
+        if (ext_deleted_data[extname]):
+          f.write("  {:10d} {:10d} {:10s} {}\n"
+                  .format(ext_size['unpacked'][extname],
+                          size,
+                          datestr(ext_deleted_data[extname]),
+                          extname or '<no extension>'))
 
-  # List extensions in reverse sorted order of unpacked size
-  with open(os.path.join(reportdir, "extensions-deleted-sizes.txt"), 'w') as f:
-    f.write("=== Deleted extensions by reverse size ===\n")
-    f.write("Format: unpacked size, packed size, date deleted, extension name\n")
-    for extname, size in sorted(ext_size['packed'].iteritems(),
-                                key=lambda x:x[1], reverse=True):
-      if (ext_deleted_data[extname]):
+    with open(os.path.join(reportdir, "extensions-all-sizes.txt"), 'w') as f:
+      f.write("=== All extensions by reverse size ===\n")
+      f.write("Format: unpacked size, packed size, date deleted, extension name\n")
+      for extname, size in sorted(ext_size['packed'].iteritems(),
+                                  key=lambda x:x[1], reverse=True):
         f.write("  {:10d} {:10d} {:10s} {}\n"
                 .format(ext_size['unpacked'][extname],
                         size,
                         datestr(ext_deleted_data[extname]),
                        extname or '<no extension>'))
 
-  with open(os.path.join(reportdir, "extensions-all-sizes.txt"), 'w') as f:
-    f.write("=== All extensions by reverse size ===\n")
-    f.write("Format: unpacked size, packed size, date deleted, extension name\n")
-    for extname, size in sorted(ext_size['packed'].iteritems(),
-                                key=lambda x:x[1], reverse=True):
-      f.write("  {:10d} {:10d} {:10s} {}\n"
-              .format(ext_size['unpacked'][extname],
-                      size,
-                      datestr(ext_deleted_data[extname]),
-                      extname or '<no extension>'))
+    # List files in reverse sorted order of unpacked size
+    with open(os.path.join(reportdir, "path-deleted-sizes.txt"), 'w') as f:
+      f.write("=== Deleted paths by reverse accumulated size ===\n")
+      f.write("Format: unpacked size, packed size, date deleted, path name(s)\n")
+      for pathname, size in sorted(path_size['packed'].iteritems(),
+                                   key=lambda x:x[1], reverse=True):
+        when = stats['file_deletions'].get(pathname, None)
+        if when:
+          f.write("  {:10d} {:10d} {:10s} {}\n"
+                  .format(path_size['unpacked'][pathname],
+                          size,
+                          datestr(when),
+                          pathname))
 
-  # List files in reverse sorted order of unpacked size
-  with open(os.path.join(reportdir, "path-deleted-sizes.txt"), 'w') as f:
-    f.write("=== Deleted paths by reverse accumulated size ===\n")
-    f.write("Format: unpacked size, packed size, date deleted, path name(s)\n")
-    for pathname, size in sorted(path_size['packed'].iteritems(),
-                                 key=lambda x:x[1], reverse=True):
-      when = stats['file_deletions'].get(pathname, None)
-      if when:
+    with open(os.path.join(reportdir, "path-all-sizes.txt"), 'w') as f:
+      f.write("=== All paths by reverse accumulated size ===\n")
+      f.write("Format: unpacked size, packed size, date deleted, path name(s)\n")
+      for pathname, size in sorted(path_size['packed'].iteritems(),
+                                   key=lambda x:x[1], reverse=True):
+        when = stats['file_deletions'].get(pathname, None)
         f.write("  {:10d} {:10d} {:10s} {}\n"
                 .format(path_size['unpacked'][pathname],
                         size,
                         datestr(when),
                        pathname))
 
-  with open(os.path.join(reportdir, "path-all-sizes.txt"), 'w') as f:
-    f.write("=== All paths by reverse accumulated size ===\n")
-    f.write("Format: unpacked size, packed size, date deleted, pathectory name\n")
-    for pathname, size in sorted(path_size['packed'].iteritems(),
-                                 key=lambda x:x[1], reverse=True):
-      when = stats['file_deletions'].get(pathname, None)
-      f.write("  {:10d} {:10d} {:10s} {}\n"
-              .format(path_size['unpacked'][pathname],
-                      size,
-                      datestr(when),
-                      pathname))
+    # List of filenames and sizes in descending order
+    with open(os.path.join(reportdir, "blob-shas-and-paths.txt"), 'w') as f:
+      f.write("== Files by sha and associated pathnames in reverse size ==\n")
+      f.write("Format: sha, unpacked size, packed size, filename(s) object stored as\n")
+      for sha, size in sorted(stats['packed_size'].iteritems(),
+                              key=lambda x:x[1], reverse=True):
+        if sha not in stats['names']:
+          # Some objects in the repository might not be referenced, or not
+          # referenced by the branches/tags the user cares about; skip them.
+          continue
+        names_with_sha = stats['names'][sha]
+        if len(names_with_sha) == 1:
+          names_with_sha = names_with_sha.pop()
+        else:
+          names_with_sha = sorted(list(names_with_sha))
+        f.write("  {} {:10d} {:10d} {}\n".format(sha,
+                                                 stats['unpacked_size'][sha],
+                                                 size,
+                                                 names_with_sha))
 
-  # List of filenames and sizes in descending order
-  with open(os.path.join(reportdir, "blob-shas-and-paths.txt"), 'w') as f:
-    f.write("== Files by sha and associated pathnames in reverse size ==\n")
-    f.write("Format: sha, unpacked size, packed size, filename(s) object stored as\n")
-    for sha, size in sorted(stats['packed_size'].iteritems(),
-                            key=lambda x:x[1], reverse=True):
-      if sha not in stats['names']:
-        # Some objects in the repository might not be referenced, or not
-        # referenced by the branches/tags the user cares about; skip them.
-        continue
-      names_with_sha = stats['names'][sha]
-      if len(names_with_sha) == 1:
-        names_with_sha = names_with_sha.pop()
-      else:
-        names_with_sha = sorted(list(names_with_sha))
-      f.write("  {} {:10d} {:10d} {}\n".format(sha,
-                                               stats['unpacked_size'][sha],
-                                               size,
-                                               names_with_sha))
+  @staticmethod
+  def run(args, git_dir):
+    # Create the report directory as necessary
+    results_tmp_dir = os.path.join(git_dir, 'filter-repo')
+    if not os.path.isdir(results_tmp_dir):
+      os.mkdir(results_tmp_dir)
+    reportdir = os.path.join(results_tmp_dir, "analysis")
+    if not args.force and os.path.isdir(reportdir):
+      raise SystemExit("Error: {} already exists; refusing to overwrite!".
+                       format(reportdir))
+    os.mkdir(reportdir)
 
-def do_analysis(args, git_dir):
-  # Create the report directory as necessary
-  results_tmp_dir = os.path.join(git_dir, 'filter-repo')
-  if not os.path.isdir(results_tmp_dir):
-    os.mkdir(results_tmp_dir)
-  reportdir = os.path.join(results_tmp_dir, "analysis")
-  if not args.force and os.path.isdir(reportdir):
-    raise SystemExit("Error: {} already exists; refusing to overwrite!".
-                     format(reportdir))
-  os.mkdir(reportdir)
+    # Gather the data we need
+    stats = RepoAnalyze.gather_data(args)
 
-  # Gather the data we need
-  stats = gather_data(args)
-
-  # Write the reports
-  sys.stdout.write("Writing reports to {}...".format(reportdir))
-  sys.stdout.flush()
-  write_report(reportdir, stats)
-  sys.stdout.write("done.\n")
+    # Write the reports
+    sys.stdout.write("Writing reports to {}...".format(reportdir))
+    sys.stdout.flush()
+    RepoAnalyze.write_report(reportdir, stats)
+    sys.stdout.write("done.\n")
 
 def sanity_check(refs, is_bare):
   def abort(reason):
@@ -2506,7 +2524,7 @@ def run_fast_filter():
 
   # Do analysis, if requested
   if args.analyze:
-    do_analysis(args, git_dir)
+    RepoAnalyze.run(args, git_dir)
     return
 
   # Do sanity checks
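Note on the raw diff-tree format parsed in gather_data() above: each file
change arrives as one line of `git diff-tree ... -t -c --raw
--combined-all-paths` output, where a commit with P parents yields P leading
colons, P source modes and shas plus one destination mode and sha, the
combined change-type letters, and one filename per parent plus the final
name. The following self-contained sketch (written for Python 3, although
the script itself targets Python 2, hence f.next() and iteritems() in the
patch) mirrors that parsing; the function name, sample line, and abbreviated
shas are invented for illustration:

    def parse_raw_line(line, num_parents):
        # A root commit (0 parents) is printed like a single-parent diff
        # under --root, hence max(1, ...).
        n = 1 + max(1, num_parents)
        assert line.startswith(':' * (n - 1))
        relevant = line[n - 1:].rstrip('\n')
        splits = relevant.split(None, n)
        modes = splits[0:n]           # n-1 parent modes + resulting mode
        splits = splits[n].split(None, n)
        shas = splits[0:n]            # n-1 parent shas + resulting sha
        splits = splits[n].split('\t')
        change_types = splits[0]      # e.g. 'MM' for a two-parent merge
        filenames = splits[1:]        # the real code also dequotes these
        return modes, shas, change_types, filenames

    # Merge commit (two parents) that modified a.txt relative to both sides:
    sample = ('::100644 100644 100644 1111111 2222222 3333333 MM\t'
              'a.txt\ta.txt\ta.txt\n')
    print(parse_raw_line(sample, num_parents=2))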
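Note on the rename equivalence classes: setup_equivalence_for_rename()
(whose middle lines fall outside the hunk context above) maintains, for
every name it has seen, an ordered tuple of all the names the user would
consider the "same file", as described in the === Renames === caveat. A
rough, hypothetical re-implementation of that documented A->B->C behavior,
not the patch's exact code:

    def record_rename(equivalence, oldname, newname):
        # Merge the classes of oldname and newname into one ordered tuple
        # and repoint every member at the merged tuple.
        merged = list(equivalence.get(oldname, (oldname,)))
        for name in equivalence.get(newname, (newname,)):
            if name not in merged:
                merged.append(name)
        merged = tuple(merged)
        for name in merged:
            equivalence[name] = merged

    equivalence = {}
    record_rename(equivalence, 'A', 'B')   # A was renamed to B
    record_rename(equivalence, 'B', 'C')   # then B was renamed to C
    assert equivalence['A'] == ('A', 'B', 'C')

This is also why renames.txt can print each class exactly once: every member
maps to the identical tuple, so write_report() simply skips tuples it has
already seen.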