filter-repo: group repo analysis functions into a class
Signed-off-by: Elijah Newren <newren@gmail.com>

parent 9887dd5cbe
commit 4e2110136e

git-filter-repo: 734 lines changed

@@ -1891,10 +1891,15 @@ class FilteringOptions(object):
     FilteringOptions.sanity_check_args(args)
     return args
 
-def analyze_commit(stats, graph, commit, parents, date, file_changes):
-  def equiv_class(filename):
+class RepoAnalyze(object):
+
+  # First, several helper functions for analyze_commit()
+
+  @staticmethod
+  def equiv_class(stats, filename):
     return stats['equivalence'].get(filename, (filename,))
 
+  @staticmethod
   def setup_equivalence_for_rename(stats, oldname, newname):
     # if A is renamed to B and B is renamed to C, then the user thinks of
     # A, B, and C as all being different names for the same 'file'. We record

@@ -1911,18 +1916,22 @@ def analyze_commit(stats, graph, commit, parents, date, file_changes):
     for f in new_tuple:
       stats['equivalence'][f] = new_tuple
 
+  @staticmethod
   def setup_or_update_rename_history(stats, commit, oldname, newname):
     rename_commits = stats['rename_history'].get(oldname, set())
     rename_commits.add(commit)
     stats['rename_history'][oldname] = rename_commits
 
+  @staticmethod
   def handle_renames(stats, commit, change_types, filenames):
     for index, change_type in enumerate(change_types):
       if change_type == 'R':
         oldname, newname = filenames[index], filenames[-1]
-        setup_equivalence_for_rename(stats, oldname, newname)
-        setup_or_update_rename_history(stats, commit, oldname, newname)
+        RepoAnalyze.setup_equivalence_for_rename(stats, oldname, newname)
+        RepoAnalyze.setup_or_update_rename_history(stats, commit,
+                                                   oldname, newname)
 
+  @staticmethod
   def handle_file(stats, graph, commit, modes, shas, filenames):
     mode, sha, filename = modes[-1], shas[-1], filenames[-1]
 
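The equivalence bookkeeping above is easiest to see with a toy example. The sketch below is illustrative and not part of the commit: equiv_class is copied from the diff, while the pre-built tuple stands in for what successive setup_equivalence_for_rename calls would record after renaming A -> B and then B -> C.

  # Illustrative sketch, not from the commit: how the 'equivalence'
  # mapping behaves once renames A -> B -> C have been recorded.
  stats = {'equivalence': {}}

  new_tuple = ('A', 'B', 'C')   # what successive renames would build up
  for f in new_tuple:
    stats['equivalence'][f] = new_tuple

  def equiv_class(stats, filename):
    return stats['equivalence'].get(filename, (filename,))

  print(equiv_class(stats, 'B'))        # ('A', 'B', 'C')
  print(equiv_class(stats, 'other.c'))  # ('other.c',)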

@@ -1936,7 +1945,7 @@ def analyze_commit(stats, graph, commit, parents, date, file_changes):
 
     # If the file (or equivalence class of files) was recorded as deleted,
     # clearly it isn't anymore
-    equiv = equiv_class(filename)
+    equiv = RepoAnalyze.equiv_class(stats, filename)
     for f in equiv:
       stats[delmode].pop(f, None)
 
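To make the deletion-undoing above concrete, here is a small illustrative sketch (not from the commit; delmode is simplified to 'file_deletions'): re-adding any name in an equivalence class clears the recorded deletion for every alias.

  # Illustrative sketch, not from the commit: re-adding any name in an
  # equivalence class clears the recorded deletion for every alias.
  stats = {'equivalence': {'A': ('A', 'B'), 'B': ('A', 'B')},
           'file_deletions': {'A': '2019-01-01', 'B': '2019-01-01'}}

  equiv = stats['equivalence'].get('B', ('B',))
  for f in equiv:
    stats['file_deletions'].pop(f, None)

  print(stats['file_deletions'])  # {}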

@@ -1954,383 +1963,392 @@ def analyze_commit(stats, graph, commit, parents, date, file_changes):
         if f in stats['equivalence']:
           del stats['equivalence'][f]
 
-  graph.add_commit_and_parents(commit, parents)
-  for change in file_changes:
-    modes, shas, change_types, filenames = change
-    if len(parents) == 1 and change_types.startswith('R'):
-      change_types = 'R' # remove the rename score; we don't care
-    if modes[-1] == '160000':
-      continue
-    elif modes[-1] == '000000':
-      # Track when files/directories are deleted; see 'R' below about equiv_class
-      for f in equiv_class(filenames[-1]):
-        if any(x == '040000' for x in modes[0:-1]):
-          stats['tree_deletions'][f] = date
-        else:
-          stats['file_deletions'][f] = date
-    elif change_types.strip('AMT') == '':
-      handle_file(stats, graph, commit, modes, shas, filenames)
-    elif modes[-1] == '040000' and change_types.strip('RAM') == '':
-      handle_file(stats, graph, commit, modes, shas, filenames)
-    elif change_types.strip('RAM') == '':
-      handle_file(stats, graph, commit, modes, shas, filenames)
-      handle_renames(stats, commit, change_types, filenames)
-    else:
-      raise SystemExit("Unhandled change type(s): {} (in commit {})"
-                       .format(change_types, commit))
-
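For readers unfamiliar with the raw diff format, a hypothetical illustration of the change_types strings the dispatch above distinguishes (the submodule, deletion, and tree cases are omitted): a plain edit yields 'M', a merge can combine letters such as 'MM', and a rename carries a similarity score such as 'R100', which the single-parent case above normalizes to 'R'.

  # Hypothetical change_types values, for illustration only.
  for change_types in ('M', 'A', 'T', 'MM', 'R', 'RM'):
    if change_types.strip('AMT') == '':
      print(change_types + ' -> handle_file only')
    elif change_types.strip('RAM') == '':
      print(change_types + ' -> handle_file plus rename tracking')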
-def gather_data(args):
-  blob_size_progress = ProgressWriter()
-  num_blobs = 0
-
-  # Get sizes of blobs by sha1
-  a='--batch-check=%(objectname) %(objecttype) %(objectsize) %(objectsize:disk)'
-  cf = subprocess.Popen(['git', 'cat-file', '--batch-all-objects', a],
-                        bufsize = -1,
-                        stdout = subprocess.PIPE)
-  unpacked_size = {}
-  packed_size = {}
-  for line in cf.stdout:
-    sha, objtype, objsize, objdisksize = line.split()
-    objsize, objdisksize = int(objsize), int(objdisksize)
-    if objtype == 'blob':
-      unpacked_size[sha] = objsize
-      packed_size[sha] = objdisksize
-    num_blobs += 1
-    blob_size_progress.show("Processed {} blob sizes".format(num_blobs))
-  cf.wait()
-  blob_size_progress.finish()
-  stats = {'names': collections.defaultdict(set),
-           'allnames' : set(),
-           'file_deletions': {},
-           'tree_deletions': {},
-           'equivalence': {},
-           'rename_history': collections.defaultdict(set),
-           'unpacked_size': unpacked_size,
-           'packed_size': packed_size,
-           'num_commits': 0}
-
-  # Setup the rev-list/diff-tree process
-  commit_parse_progress = ProgressWriter()
-  num_commits = 0
-  cmd = 'git rev-list --topo-order --reverse {} | git diff-tree --stdin --always --root --format="%H%n%P%n%cd" --date=short -M -t -c --raw --combined-all-paths'.format(' '.join(args.refs))
-  dtp = subprocess.Popen(cmd, shell=True, bufsize=-1, stdout=subprocess.PIPE)
-  f = dtp.stdout
-  line = f.next()
-  cont = bool(line)
-  graph = AncestryGraph()
-  while cont:
-    commit = line.rstrip()
-    parents = f.next().split()
-    date = f.next().rstrip()
-
-    # We expect a blank line next; if we get a non-blank line then
-    # this commit modified no files and we need to move on to the next.
-    # If there is no line, we've reached end-of-input.
-    try:
-      line = f.next().rstrip()
-      cont = True
-    except StopIteration:
-      cont = False
-
-    # If we haven't reached end of input, and we got a blank line meaning
-    # a commit that has modified files, then get the file changes associated
-    # with this commit.
-    file_changes = []
-    if cont and not line:
-      cont = False
-      for line in f:
-        if not line.startswith(':'):
-          cont = True
-          break
-        n = 1+max(1, len(parents))
-        assert line.startswith(':'*(n-1))
-        relevant = line[n-1:-1]
-        splits = relevant.split(None, n)
-        modes = splits[0:n]
-        splits = splits[n].split(None, n)
-        shas = splits[0:n]
-        splits = splits[n].split('\t')
-        change_types = splits[0]
-        filenames = [PathQuoting.dequote(x) for x in splits[1:]]
-        file_changes.append([modes, shas, change_types, filenames])
-
-    # Analyze this commit and update progress
-    analyze_commit(stats, graph, commit, parents, date, file_changes)
-    num_commits += 1
-    commit_parse_progress.show("Processed {} commits".format(num_commits))
-
-  # Show the final commits processed message and record the number of commits
-  commit_parse_progress.finish()
-  stats['num_commits'] = num_commits
-
-  # Close the output, ensure rev-list|diff-tree pipeline completed successfully
-  dtp.stdout.close()
-  if dtp.wait():
-    raise SystemExit("Error: rev-list|diff-tree pipeline failed; see above.")
-
-  return stats
-
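For reference, each line that git cat-file emits under this --batch-check format looks like the samples below; the hashes and sizes are made-up values, and only the parsing done in gather_data above is exercised.

  # Hypothetical output of:
  #   git cat-file --batch-all-objects \
  #     '--batch-check=%(objectname) %(objecttype) %(objectsize) %(objectsize:disk)'
  sample = ['f1d2d2f924e986ac86fdf7b36c94bcdf32beec15 blob 10500 2048\n',
            '4b825dc642cb6eb9a060e54bf8d69288fbee4904 tree 0 9\n']
  for line in sample:
    sha, objtype, objsize, objdisksize = line.split()
    if objtype == 'blob':
      print('{}: {} bytes unpacked, {} on disk'.format(sha, objsize, objdisksize))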
-def write_report(reportdir, stats):
-  def datestr(datetimestr):
-    return datetimestr if datetimestr else '<present>'
-
-  def dirnames(path):
-    while True:
-      path = os.path.dirname(path)
-      yield path
-      if path == '':
-        break
-
-  # Compute aggregate size information for paths, extensions, and dirs
-  total_size = {'packed': 0, 'unpacked': 0}
-  path_size = {'packed': collections.defaultdict(int),
-               'unpacked': collections.defaultdict(int)}
-  ext_size = {'packed': collections.defaultdict(int),
-              'unpacked': collections.defaultdict(int)}
-  dir_size = {'packed': collections.defaultdict(int),
-              'unpacked': collections.defaultdict(int)}
-  for sha in stats['names']:
-    size = {'packed': stats['packed_size'][sha],
-            'unpacked': stats['unpacked_size'][sha]}
-    for which in ('packed', 'unpacked'):
-      for name in stats['names'][sha]:
-        total_size[which] += size[which]
-        path_size[which][name] += size[which]
-        basename, ext = os.path.splitext(name)
-        ext_size[which][ext] += size[which]
-        for dirname in dirnames(name):
-          dir_size[which][dirname] += size[which]
-
-  # Determine if and when extensions and directories were deleted
-  ext_deleted_data = {}
-  for name in stats['allnames']:
-    when = stats['file_deletions'].get(name, None)
-
-    # Update the extension
-    basename, ext = os.path.splitext(name)
-    if when is None:
-      ext_deleted_data[ext] = None
-    elif ext in ext_deleted_data:
-      if ext_deleted_data[ext] is not None:
-        ext_deleted_data[ext] = max(ext_deleted_data[ext], when)
-    else:
-      ext_deleted_data[ext] = when
-
-  dir_deleted_data = {}
-  for name in dir_size['packed']:
-    dir_deleted_data[name] = stats['tree_deletions'].get(name, None)
-
-  with open(os.path.join(reportdir, "README"), 'w') as f:
-    # Give a basic overview of this file
-    f.write("== Overall Statistics ==\n")
-    f.write("  Number of commits: {}\n".format(stats['num_commits']))
-    f.write("  Number of filenames: {}\n".format(len(path_size['packed'])))
-    f.write("  Number of directories: {}\n".format(len(dir_size['packed'])))
-    f.write("  Number of file extensions: {}\n".format(len(ext_size['packed'])))
-    f.write("\n")
-    f.write("  Total unpacked size (bytes): {:10d}\n"
-            .format(total_size['unpacked']))
-    f.write("  Total packed size (bytes): {:10d}\n"
-            .format(total_size['packed']))
-    f.write("\n")
-
-    # Mention issues with the report
-    f.write("== Caveats ==\n")
-    f.write("=== Sizes ===\n")
-    f.write(textwrap.dedent("""
-      Packed size represents what size your repository would be if no
-      trees, commits, tags, or other metadata were included (though it may
-      fail to represent de-duplication; see below). It also represents the
-      current packing, which may be suboptimal if you haven't gc'ed for a
-      while.
-
-      Unpacked size represents what size your repository would be if no
-      trees, commits, tags, or other metadata were included AND if no
-      files were packed; i.e., without delta-ing or compression.
-
-      Both unpacked and packed sizes can be slightly misleading. Deleting
-      a blob from history does not save as much space as the unpacked size,
-      because it is obviously normally stored in packed form. Also,
-      deleting a blob from history may not save as much space as its packed
-      size either, because another blob could be stored as a delta against
-      that blob, so when you remove one blob another blob's packed size may
-      grow.
-
-      Also, the sum of the packed sizes can add up to more than the
-      repository size; if the same contents appeared in the repository in
-      multiple places, git will automatically de-dupe and store only one
-      copy, while the way sizes are added in this analysis adds the size
-      for each file path that has those contents. Further, if a file is
-      ever reverted to a previous version's contents, the previous
-      version's size will be counted multiple times in this analysis, even
-      though git will only store it once.
-      """[1:]))
-    f.write("\n")
-    f.write("=== Deletions ===\n")
-    f.write(textwrap.dedent("""
-      Whether a file is deleted is not a binary quality, since it can be
-      deleted on some branches but still exist in others. Also, it might
-      exist in an old tag, but have been deleted in versions newer than
-      that. More thorough tracking could be done, including looking at
-      merge commits where one side of history deleted and the other modified,
-      in order to give a more holistic picture of deletions. However, that
-      algorithm would not only be more complex to implement, it'd also be
-      quite difficult to present and interpret by users. Since --analyze
-      is just about getting a high-level rough picture of history, it instead
-      implements the simplistic rule that is good enough for 98% of cases:
-        A file is marked as deleted if the last commit in the fast-export
-        stream that mentions the file lists it as deleted.
-      This makes it dependent on topological ordering, but generally gives
-      the "right" answer.
-      """[1:]))
-    f.write("\n")
-    f.write("=== Renames ===\n")
-    f.write(textwrap.dedent("""
-      Renames share the same non-binary nature that deletions do, plus
-      additional challenges:
-        * If the renamed file is renamed again, instead of just two names for
-          a path you can have three or more.
-        * Rename pairs of the form (oldname, newname) that we consider to be
-          different names of the "same file" might only be valid over certain
-          commit ranges. For example, if a new commit reintroduces a file
-          named oldname, then new versions of oldname aren't the "same file"
-          anymore. We could try to portray this to the user, but it's easier
-          for the user to just break the pairing and only report unbroken
-          rename pairings to the user.
-        * The ability for users to rename files differently in different
-          branches means that our chains of renames will not necessarily be
-          linear but may branch out.
-      """[1:]))
-    f.write("\n")
-
-  # Equivalence classes for names, so if folks only want to keep a
-  # certain set of paths, they know the old names they want to include
-  # too.
-  with open(os.path.join(reportdir, "renames.txt"), 'w') as f:
-    seen = set()
-    for pathname,equiv_group in sorted(stats['equivalence'].iteritems(),
-                                       key=lambda x:x[1]):
-      if equiv_group in seen:
-        continue
-      seen.add(equiv_group)
-      f.write("{} ->\n    ".format(equiv_group[0]) +
-              "\n    ".join(equiv_group[1:]) +
-              "\n")
-
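As a concrete illustration of what renames.txt ends up containing (the paths and group are hypothetical), the write loop above renders one equivalence group like so:

  # Illustrative only: output of the renames.txt loop for one group.
  equiv_group = ('src/foo.c', 'src/bar.c', 'lib/bar.c')
  print("{} ->\n    ".format(equiv_group[0]) +
        "\n    ".join(equiv_group[1:]))
  # src/foo.c ->
  #     src/bar.c
  #     lib/bar.c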
-  # List directories in reverse sorted order of unpacked size
-  with open(os.path.join(reportdir, "directories-deleted-sizes.txt"), 'w') as f:
-    f.write("=== Deleted directories by reverse size ===\n")
-    f.write("Format: unpacked size, packed size, date deleted, directory name\n")
-    for dirname, size in sorted(dir_size['packed'].iteritems(),
-                                key=lambda x:x[1], reverse=True):
-      if (dir_deleted_data[dirname]):
-        f.write("  {:10d} {:10d} {:10s} {}\n"
-                .format(dir_size['unpacked'][dirname],
-                        size,
-                        datestr(dir_deleted_data[dirname]),
-                        dirname or '<toplevel>'))
-
-  with open(os.path.join(reportdir, "directories-all-sizes.txt"), 'w') as f:
-    f.write("=== All directories by reverse size ===\n")
-    f.write("Format: unpacked size, packed size, date deleted, directory name\n")
-    for dirname, size in sorted(dir_size['packed'].iteritems(),
-                                key=lambda x:x[1], reverse=True):
-      f.write("  {:10d} {:10d} {:10s} {}\n"
-              .format(dir_size['unpacked'][dirname],
-                      size,
-                      datestr(dir_deleted_data[dirname]),
-                      dirname or '<toplevel>'))
-
-  # List extensions in reverse sorted order of unpacked size
-  with open(os.path.join(reportdir, "extensions-deleted-sizes.txt"), 'w') as f:
-    f.write("=== Deleted extensions by reverse size ===\n")
-    f.write("Format: unpacked size, packed size, date deleted, extension name\n")
-    for extname, size in sorted(ext_size['packed'].iteritems(),
-                                key=lambda x:x[1], reverse=True):
-      if (ext_deleted_data[extname]):
-        f.write("  {:10d} {:10d} {:10s} {}\n"
-                .format(ext_size['unpacked'][extname],
-                        size,
-                        datestr(ext_deleted_data[extname]),
-                        extname or '<no extension>'))
-
-  with open(os.path.join(reportdir, "extensions-all-sizes.txt"), 'w') as f:
-    f.write("=== All extensions by reverse size ===\n")
-    f.write("Format: unpacked size, packed size, date deleted, extension name\n")
-    for extname, size in sorted(ext_size['packed'].iteritems(),
-                                key=lambda x:x[1], reverse=True):
-      f.write("  {:10d} {:10d} {:10s} {}\n"
-              .format(ext_size['unpacked'][extname],
-                      size,
-                      datestr(ext_deleted_data[extname]),
-                      extname or '<no extension>'))
-
-  # List files in reverse sorted order of unpacked size
-  with open(os.path.join(reportdir, "path-deleted-sizes.txt"), 'w') as f:
-    f.write("=== Deleted paths by reverse accumulated size ===\n")
-    f.write("Format: unpacked size, packed size, date deleted, path name(s)\n")
-    for pathname, size in sorted(path_size['packed'].iteritems(),
-                                 key=lambda x:x[1], reverse=True):
-      when = stats['file_deletions'].get(pathname, None)
-      if when:
-        f.write("  {:10d} {:10d} {:10s} {}\n"
-                .format(path_size['unpacked'][pathname],
-                        size,
-                        datestr(when),
-                        pathname))
-
-  with open(os.path.join(reportdir, "path-all-sizes.txt"), 'w') as f:
-    f.write("=== All paths by reverse accumulated size ===\n")
-    f.write("Format: unpacked size, packed size, date deleted, path name(s)\n")
-    for pathname, size in sorted(path_size['packed'].iteritems(),
-                                 key=lambda x:x[1], reverse=True):
-      when = stats['file_deletions'].get(pathname, None)
-      f.write("  {:10d} {:10d} {:10s} {}\n"
-              .format(path_size['unpacked'][pathname],
-                      size,
-                      datestr(when),
-                      pathname))
-
-  # List of filenames and sizes in descending order
-  with open(os.path.join(reportdir, "blob-shas-and-paths.txt"), 'w') as f:
-    f.write("== Files by sha and associated pathnames in reverse size ==\n")
-    f.write("Format: sha, unpacked size, packed size, filename(s) object stored as\n")
-    for sha, size in sorted(stats['packed_size'].iteritems(),
-                            key=lambda x:x[1], reverse=True):
-      if sha not in stats['names']:
-        # Some objects in the repository might not be referenced, or not
-        # referenced by the branches/tags the user cares about; skip them.
-        continue
-      names_with_sha = stats['names'][sha]
-      if len(names_with_sha) == 1:
-        names_with_sha = names_with_sha.pop()
-      else:
-        names_with_sha = sorted(list(names_with_sha))
-      f.write("  {} {:10d} {:10d} {}\n".format(sha,
-                                               stats['unpacked_size'][sha],
-                                               size,
-                                               names_with_sha))
-
-def do_analysis(args, git_dir):
-  # Create the report directory as necessary
-  results_tmp_dir = os.path.join(git_dir, 'filter-repo')
-  if not os.path.isdir(results_tmp_dir):
-    os.mkdir(results_tmp_dir)
-  reportdir = os.path.join(results_tmp_dir, "analysis")
-  if not args.force and os.path.isdir(reportdir):
-    raise SystemExit("Error: {} already exists; refusing to overwrite!".
-                     format(reportdir))
-  os.mkdir(reportdir)
-
-  # Gather the data we need
-  stats = gather_data(args)
-
-  # Write the reports
-  sys.stdout.write("Writing reports to {}...".format(reportdir))
-  sys.stdout.flush()
-  write_report(reportdir, stats)
-  sys.stdout.write("done.\n")
-
+  @staticmethod
+  def analyze_commit(stats, graph, commit, parents, date, file_changes):
+    graph.add_commit_and_parents(commit, parents)
+    for change in file_changes:
+      modes, shas, change_types, filenames = change
+      if len(parents) == 1 and change_types.startswith('R'):
+        change_types = 'R' # remove the rename score; we don't care
+      if modes[-1] == '160000':
+        continue
+      elif modes[-1] == '000000':
+        # Track when files/directories are deleted
+        for f in RepoAnalyze.equiv_class(stats, filenames[-1]):
+          if any(x == '040000' for x in modes[0:-1]):
+            stats['tree_deletions'][f] = date
+          else:
+            stats['file_deletions'][f] = date
+      elif change_types.strip('AMT') == '':
+        RepoAnalyze.handle_file(stats, graph, commit, modes, shas, filenames)
+      elif modes[-1] == '040000' and change_types.strip('RAM') == '':
+        RepoAnalyze.handle_file(stats, graph, commit, modes, shas, filenames)
+      elif change_types.strip('RAM') == '':
+        RepoAnalyze.handle_file(stats, graph, commit, modes, shas, filenames)
+        RepoAnalyze.handle_renames(stats, commit, change_types, filenames)
+      else:
+        raise SystemExit("Unhandled change type(s): {} (in commit {})"
+                         .format(change_types, commit))
+
+  @staticmethod
+  def gather_data(args):
+    blob_size_progress = ProgressWriter()
+    num_blobs = 0
+
+    # Get sizes of blobs by sha1
+    cmd = '--batch-check=%(objectname) %(objecttype) ' + \
+          '%(objectsize) %(objectsize:disk)'
+    cf = subprocess.Popen(['git', 'cat-file', '--batch-all-objects', cmd],
+                          bufsize = -1,
+                          stdout = subprocess.PIPE)
+    unpacked_size = {}
+    packed_size = {}
+    for line in cf.stdout:
+      sha, objtype, objsize, objdisksize = line.split()
+      objsize, objdisksize = int(objsize), int(objdisksize)
+      if objtype == 'blob':
+        unpacked_size[sha] = objsize
+        packed_size[sha] = objdisksize
+      num_blobs += 1
+      blob_size_progress.show("Processed {} blob sizes".format(num_blobs))
+    cf.wait()
+    blob_size_progress.finish()
+    stats = {'names': collections.defaultdict(set),
+             'allnames' : set(),
+             'file_deletions': {},
+             'tree_deletions': {},
+             'equivalence': {},
+             'rename_history': collections.defaultdict(set),
+             'unpacked_size': unpacked_size,
+             'packed_size': packed_size,
+             'num_commits': 0}
+
+    # Setup the rev-list/diff-tree process
+    commit_parse_progress = ProgressWriter()
+    num_commits = 0
+    cmd = ('git rev-list --topo-order --reverse {}'.format(' '.join(args.refs)) +
+           ' | git diff-tree --stdin --always --root --format=%H%n%P%n%cd' +
+           ' --date=short -M -t -c --raw --combined-all-paths')
+    dtp = subprocess.Popen(cmd, shell=True, bufsize=-1, stdout=subprocess.PIPE)
+    f = dtp.stdout
+    line = f.next()
+    cont = bool(line)
+    graph = AncestryGraph()
+    while cont:
+      commit = line.rstrip()
+      parents = f.next().split()
+      date = f.next().rstrip()
+
+      # We expect a blank line next; if we get a non-blank line then
+      # this commit modified no files and we need to move on to the next.
+      # If there is no line, we've reached end-of-input.
+      try:
+        line = f.next().rstrip()
+        cont = True
+      except StopIteration:
+        cont = False
+
+      # If we haven't reached end of input, and we got a blank line meaning
+      # a commit that has modified files, then get the file changes associated
+      # with this commit.
+      file_changes = []
+      if cont and not line:
+        cont = False
+        for line in f:
+          if not line.startswith(':'):
+            cont = True
+            break
+          n = 1+max(1, len(parents))
+          assert line.startswith(':'*(n-1))
+          relevant = line[n-1:-1]
+          splits = relevant.split(None, n)
+          modes = splits[0:n]
+          splits = splits[n].split(None, n)
+          shas = splits[0:n]
+          splits = splits[n].split('\t')
+          change_types = splits[0]
+          filenames = [PathQuoting.dequote(x) for x in splits[1:]]
+          file_changes.append([modes, shas, change_types, filenames])
+
+      # Analyze this commit and update progress
+      RepoAnalyze.analyze_commit(stats, graph, commit, parents, date,
+                                 file_changes)
+      num_commits += 1
+      commit_parse_progress.show("Processed {} commits".format(num_commits))
+
+    # Show the final commits processed message and record the number of commits
+    commit_parse_progress.finish()
+    stats['num_commits'] = num_commits
+
+    # Close the output, ensure rev-list|diff-tree pipeline completed successfully
+    dtp.stdout.close()
+    if dtp.wait():
+      raise SystemExit("Error: rev-list|diff-tree pipeline failed; see above.")
+
+    return stats
+
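The raw lines this inner loop parses look roughly like the hypothetical sample below (shortened shas, made-up similarity score); a single-parent commit uses one leading colon, so n == 2, and path dequoting is skipped here for brevity.

  # Hypothetical raw diff-tree line, for illustration only.
  parents = ['deadbeef']                      # single-parent commit
  line = ':100644 100644 1111111 2222222 R087\told.c\tnew.c\n'
  n = 1 + max(1, len(parents))                # n == 2
  relevant = line[n-1:-1]
  splits = relevant.split(None, n)
  modes = splits[0:n]                         # ['100644', '100644']
  splits = splits[n].split(None, n)
  shas = splits[0:n]                          # ['1111111', '2222222']
  splits = splits[n].split('\t')
  change_types = splits[0]                    # 'R087'
  filenames = splits[1:]                      # ['old.c', 'new.c']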
+  @staticmethod
+  def write_report(reportdir, stats):
+    def datestr(datetimestr):
+      return datetimestr if datetimestr else '<present>'
+
+    def dirnames(path):
+      while True:
+        path = os.path.dirname(path)
+        yield path
+        if path == '':
+          break
+
+    # Compute aggregate size information for paths, extensions, and dirs
+    total_size = {'packed': 0, 'unpacked': 0}
+    path_size = {'packed': collections.defaultdict(int),
+                 'unpacked': collections.defaultdict(int)}
+    ext_size = {'packed': collections.defaultdict(int),
+                'unpacked': collections.defaultdict(int)}
+    dir_size = {'packed': collections.defaultdict(int),
+                'unpacked': collections.defaultdict(int)}
+    for sha in stats['names']:
+      size = {'packed': stats['packed_size'][sha],
+              'unpacked': stats['unpacked_size'][sha]}
+      for which in ('packed', 'unpacked'):
+        for name in stats['names'][sha]:
+          total_size[which] += size[which]
+          path_size[which][name] += size[which]
+          basename, ext = os.path.splitext(name)
+          ext_size[which][ext] += size[which]
+          for dirname in dirnames(name):
+            dir_size[which][dirname] += size[which]
+
+    # Determine if and when extensions and directories were deleted
+    ext_deleted_data = {}
+    for name in stats['allnames']:
+      when = stats['file_deletions'].get(name, None)
+
+      # Update the extension
+      basename, ext = os.path.splitext(name)
+      if when is None:
+        ext_deleted_data[ext] = None
+      elif ext in ext_deleted_data:
+        if ext_deleted_data[ext] is not None:
+          ext_deleted_data[ext] = max(ext_deleted_data[ext], when)
+      else:
+        ext_deleted_data[ext] = when
+
+    dir_deleted_data = {}
+    for name in dir_size['packed']:
+      dir_deleted_data[name] = stats['tree_deletions'].get(name, None)
+
+    with open(os.path.join(reportdir, "README"), 'w') as f:
+      # Give a basic overview of this file
+      f.write("== Overall Statistics ==\n")
+      f.write("  Number of commits: {}\n".format(stats['num_commits']))
+      f.write("  Number of filenames: {}\n".format(len(path_size['packed'])))
+      f.write("  Number of directories: {}\n".format(len(dir_size['packed'])))
+      f.write("  Number of file extensions: {}\n".format(len(ext_size['packed'])))
+      f.write("\n")
+      f.write("  Total unpacked size (bytes): {:10d}\n"
+              .format(total_size['unpacked']))
+      f.write("  Total packed size (bytes): {:10d}\n"
+              .format(total_size['packed']))
+      f.write("\n")
+
+      # Mention issues with the report
+      f.write("== Caveats ==\n")
+      f.write("=== Sizes ===\n")
+      f.write(textwrap.dedent("""
+        Packed size represents what size your repository would be if no
+        trees, commits, tags, or other metadata were included (though it may
+        fail to represent de-duplication; see below). It also represents the
+        current packing, which may be suboptimal if you haven't gc'ed for a
+        while.
+
+        Unpacked size represents what size your repository would be if no
+        trees, commits, tags, or other metadata were included AND if no
+        files were packed; i.e., without delta-ing or compression.
+
+        Both unpacked and packed sizes can be slightly misleading. Deleting
+        a blob from history does not save as much space as the unpacked size,
+        because it is obviously normally stored in packed form. Also,
+        deleting a blob from history may not save as much space as its packed
+        size either, because another blob could be stored as a delta against
+        that blob, so when you remove one blob another blob's packed size may
+        grow.
+
+        Also, the sum of the packed sizes can add up to more than the
+        repository size; if the same contents appeared in the repository in
+        multiple places, git will automatically de-dupe and store only one
+        copy, while the way sizes are added in this analysis adds the size
+        for each file path that has those contents. Further, if a file is
+        ever reverted to a previous version's contents, the previous
+        version's size will be counted multiple times in this analysis, even
+        though git will only store it once.
+        """[1:]))
+      f.write("\n")
+      f.write("=== Deletions ===\n")
+      f.write(textwrap.dedent("""
+        Whether a file is deleted is not a binary quality, since it can be
+        deleted on some branches but still exist in others. Also, it might
+        exist in an old tag, but have been deleted in versions newer than
+        that. More thorough tracking could be done, including looking at
+        merge commits where one side of history deleted and the other modified,
+        in order to give a more holistic picture of deletions. However, that
+        algorithm would not only be more complex to implement, it'd also be
+        quite difficult to present and interpret by users. Since --analyze
+        is just about getting a high-level rough picture of history, it instead
+        implements the simplistic rule that is good enough for 98% of cases:
+          A file is marked as deleted if the last commit in the fast-export
+          stream that mentions the file lists it as deleted.
+        This makes it dependent on topological ordering, but generally gives
+        the "right" answer.
+        """[1:]))
+      f.write("\n")
+      f.write("=== Renames ===\n")
+      f.write(textwrap.dedent("""
+        Renames share the same non-binary nature that deletions do, plus
+        additional challenges:
+          * If the renamed file is renamed again, instead of just two names for
+            a path you can have three or more.
+          * Rename pairs of the form (oldname, newname) that we consider to be
+            different names of the "same file" might only be valid over certain
+            commit ranges. For example, if a new commit reintroduces a file
+            named oldname, then new versions of oldname aren't the "same file"
+            anymore. We could try to portray this to the user, but it's easier
+            for the user to just break the pairing and only report unbroken
+            rename pairings to the user.
+          * The ability for users to rename files differently in different
+            branches means that our chains of renames will not necessarily be
+            linear but may branch out.
+        """[1:]))
+      f.write("\n")
+
+    # Equivalence classes for names, so if folks only want to keep a
+    # certain set of paths, they know the old names they want to include
+    # too.
+    with open(os.path.join(reportdir, "renames.txt"), 'w') as f:
+      seen = set()
+      for pathname,equiv_group in sorted(stats['equivalence'].iteritems(),
+                                         key=lambda x:x[1]):
+        if equiv_group in seen:
+          continue
+        seen.add(equiv_group)
+        f.write("{} ->\n    ".format(equiv_group[0]) +
+                "\n    ".join(equiv_group[1:]) +
+                "\n")
+
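A quick worked example of the double-counting caveat above, with made-up numbers: a single 1 MB blob reachable under two paths contributes 2 MB to the summed path sizes even though git stores it only once.

  # Illustrative only: one blob, two paths -> size counted twice.
  names = {'abc123': {'assets/logo.png', 'docs/logo.png'}}
  unpacked_size = {'abc123': 1048576}
  total = 0
  for sha, paths in names.items():
    for name in paths:
      total += unpacked_size[sha]
  print(total)  # 2097152, double the actual storage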
+    # List directories in reverse sorted order of unpacked size
+    with open(os.path.join(reportdir, "directories-deleted-sizes.txt"), 'w') as f:
+      f.write("=== Deleted directories by reverse size ===\n")
+      f.write("Format: unpacked size, packed size, date deleted, directory name\n")
+      for dirname, size in sorted(dir_size['packed'].iteritems(),
+                                  key=lambda x:x[1], reverse=True):
+        if (dir_deleted_data[dirname]):
+          f.write("  {:10d} {:10d} {:10s} {}\n"
+                  .format(dir_size['unpacked'][dirname],
+                          size,
+                          datestr(dir_deleted_data[dirname]),
+                          dirname or '<toplevel>'))
+
+    with open(os.path.join(reportdir, "directories-all-sizes.txt"), 'w') as f:
+      f.write("=== All directories by reverse size ===\n")
+      f.write("Format: unpacked size, packed size, date deleted, directory name\n")
+      for dirname, size in sorted(dir_size['packed'].iteritems(),
+                                  key=lambda x:x[1], reverse=True):
+        f.write("  {:10d} {:10d} {:10s} {}\n"
+                .format(dir_size['unpacked'][dirname],
+                        size,
+                        datestr(dir_deleted_data[dirname]),
+                        dirname or '<toplevel>'))
+
+    # List extensions in reverse sorted order of unpacked size
+    with open(os.path.join(reportdir, "extensions-deleted-sizes.txt"), 'w') as f:
+      f.write("=== Deleted extensions by reverse size ===\n")
+      f.write("Format: unpacked size, packed size, date deleted, extension name\n")
+      for extname, size in sorted(ext_size['packed'].iteritems(),
+                                  key=lambda x:x[1], reverse=True):
+        if (ext_deleted_data[extname]):
+          f.write("  {:10d} {:10d} {:10s} {}\n"
+                  .format(ext_size['unpacked'][extname],
+                          size,
+                          datestr(ext_deleted_data[extname]),
+                          extname or '<no extension>'))
+
+    with open(os.path.join(reportdir, "extensions-all-sizes.txt"), 'w') as f:
+      f.write("=== All extensions by reverse size ===\n")
+      f.write("Format: unpacked size, packed size, date deleted, extension name\n")
+      for extname, size in sorted(ext_size['packed'].iteritems(),
+                                  key=lambda x:x[1], reverse=True):
+        f.write("  {:10d} {:10d} {:10s} {}\n"
+                .format(ext_size['unpacked'][extname],
+                        size,
+                        datestr(ext_deleted_data[extname]),
+                        extname or '<no extension>'))
+
+    # List files in reverse sorted order of unpacked size
+    with open(os.path.join(reportdir, "path-deleted-sizes.txt"), 'w') as f:
+      f.write("=== Deleted paths by reverse accumulated size ===\n")
+      f.write("Format: unpacked size, packed size, date deleted, path name(s)\n")
+      for pathname, size in sorted(path_size['packed'].iteritems(),
+                                   key=lambda x:x[1], reverse=True):
+        when = stats['file_deletions'].get(pathname, None)
+        if when:
+          f.write("  {:10d} {:10d} {:10s} {}\n"
+                  .format(path_size['unpacked'][pathname],
+                          size,
+                          datestr(when),
+                          pathname))
+
+    with open(os.path.join(reportdir, "path-all-sizes.txt"), 'w') as f:
+      f.write("=== All paths by reverse accumulated size ===\n")
+      f.write("Format: unpacked size, packed size, date deleted, path name(s)\n")
+      for pathname, size in sorted(path_size['packed'].iteritems(),
+                                   key=lambda x:x[1], reverse=True):
+        when = stats['file_deletions'].get(pathname, None)
+        f.write("  {:10d} {:10d} {:10s} {}\n"
+                .format(path_size['unpacked'][pathname],
+                        size,
+                        datestr(when),
+                        pathname))
+
+    # List of filenames and sizes in descending order
+    with open(os.path.join(reportdir, "blob-shas-and-paths.txt"), 'w') as f:
+      f.write("== Files by sha and associated pathnames in reverse size ==\n")
+      f.write("Format: sha, unpacked size, packed size, filename(s) object stored as\n")
+      for sha, size in sorted(stats['packed_size'].iteritems(),
+                              key=lambda x:x[1], reverse=True):
+        if sha not in stats['names']:
+          # Some objects in the repository might not be referenced, or not
+          # referenced by the branches/tags the user cares about; skip them.
+          continue
+        names_with_sha = stats['names'][sha]
+        if len(names_with_sha) == 1:
+          names_with_sha = names_with_sha.pop()
+        else:
+          names_with_sha = sorted(list(names_with_sha))
+        f.write("  {} {:10d} {:10d} {}\n".format(sha,
+                                                 stats['unpacked_size'][sha],
+                                                 size,
+                                                 names_with_sha))
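For illustration, one hypothetical line of blob-shas-and-paths.txt as produced by the f.write above (hash and sizes made up):

  # Illustrative only; hash and sizes are made up.
  sha = 'f1d2d2f924e986ac86fdf7b36c94bcdf32beec15'
  names_with_sha = ['assets/logo.png', 'docs/logo.png']
  print("  {} {:10d} {:10d} {}".format(sha, 1048576, 65536, names_with_sha))
  #   f1d2d2f924e986ac86fdf7b36c94bcdf32beec15    1048576      65536 ['assets/logo.png', 'docs/logo.png']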
+
+  @staticmethod
+  def run(args, git_dir):
+    # Create the report directory as necessary
+    results_tmp_dir = os.path.join(git_dir, 'filter-repo')
+    if not os.path.isdir(results_tmp_dir):
+      os.mkdir(results_tmp_dir)
+    reportdir = os.path.join(results_tmp_dir, "analysis")
+    if not args.force and os.path.isdir(reportdir):
+      raise SystemExit("Error: {} already exists; refusing to overwrite!".
+                       format(reportdir))
+    os.mkdir(reportdir)
+
+    # Gather the data we need
+    stats = RepoAnalyze.gather_data(args)
+
+    # Write the reports
+    sys.stdout.write("Writing reports to {}...".format(reportdir))
+    sys.stdout.flush()
+    RepoAnalyze.write_report(reportdir, stats)
+    sys.stdout.write("done.\n")
 
 def sanity_check(refs, is_bare):
   def abort(reason):

@@ -2506,7 +2524,7 @@ def run_fast_filter():
 
   # Do analysis, if requested
   if args.analyze:
-    do_analysis(args, git_dir)
+    RepoAnalyze.run(args, git_dir)
     return
 
   # Do sanity checks